1.Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

2.Loading datasets

In [None]:
# Read the training and test CSV files (resolved relative to the working directory)
# into two separate DataFrames.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('hacktrain.csv', 'hacktest.csv')
)

3.Missing values handling

a) Removing columns that are not needed for model training (e.g. ID columns)

In [None]:
# Keep everything from the third column onwards in both frames (the first two
# columns are dropped). `.copy()` makes the results independent DataFrames:
# later cells assign into them with `.iloc[...] = ...`, and writing into a bare
# slice of the raw frame raises SettingWithCopyWarning on pandas without
# copy-on-write and leaves it ambiguous whether the raw frame is modified.
df_train_n=df_train.iloc[:,2:].copy()
df_test_n=df_test.iloc[:,2:].copy()



b)Checking for missing values

In [None]:
# Print the number of missing values in every column: training frame first,
# then the test frame.
for frame in (df_train_n, df_test_n):
    missing_per_column = frame.isna().sum()
    print(missing_per_column)



c)Checking for outliers

In [None]:
def _show_boxplot(frame, title):
    """Draw one box per column of `frame` on a new 14x8 figure and display it.

    Column labels on the x-axis are rotated by 90 degrees.
    """
    plt.figure(figsize=(14,8))
    frame.boxplot(rot=90)
    plt.title(title)
    plt.xlabel("Columns")
    plt.ylabel("Values")
    plt.show()


# Training data: column 0 is skipped, only the remaining columns are plotted
_show_boxplot(df_train_n.iloc[:,1:], "Box plot for training data")

# Testing data: every column is plotted
_show_boxplot(df_test_n.iloc[:,:], "Box plot for testing data")

d) Imputation (filling missing values with an appropriate summary statistic)

In [None]:
# Median imputation: some columns have outliers, so the median is a more robust
# fill value than the mean for these continuous variables.
#
# The imputer is fitted on the training features only and then reused to
# transform the test features, so both sets are filled with the same (training)
# medians and no test-set statistics enter preprocessing. This is the same
# fit-on-train / transform-on-test pattern used for the StandardScaler.
df_train_n=df_train_n.copy()  # detach from the raw frames before assigning in place
df_test_n=df_test_n.copy()

s_imp_train=SimpleImputer(missing_values=np.nan,strategy='median')
s_imp_test=s_imp_train  # alias kept so existing references to s_imp_test still resolve

# Training frame: column 0 is left untouched, only columns 1.. are imputed.
df_train_n.iloc[:,1:]=s_imp_train.fit_transform(df_train_n.iloc[:,1:])
# Test frame: every column is imputed with the medians learned from training data.
df_test_n.iloc[:,:]=s_imp_test.transform(df_test_n.iloc[:,:])

# Re-print the per-column missing counts to confirm nothing is left missing.
print(df_train_n.isna().sum())
print(df_test_n.isna().sum())

4. Feature scaling (standardization)

In [None]:
from sklearn.preprocessing import StandardScaler

# Training data: learn the scaling parameters from columns 1.. of the training
# frame and write the standardized values back in place (column 0 is untouched).
sc = StandardScaler()
train_features_scaled = sc.fit_transform(df_train_n.iloc[:, 1:])
df_train_n.iloc[:, 1:] = train_features_scaled

# Testing data: reuse the scaler fitted on the training data (transform only, no refit).
test_features_scaled = sc.transform(df_test_n.iloc[:, :])
df_test_n.iloc[:, :] = test_features_scaled
print(df_test_n.head())

5. Splitting the dataset into independent (predictor) and dependent (label) variables

In [15]:
# Column 0 of the prepared training frame is the label; all remaining columns
# are the predictors.
y = df_train_n.iloc[:, 0]
X = df_train_n.iloc[:, 1:]

6. Model training and prediction on the test set

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression fitted with the SAGA solver.
# SAGA shuffles the data while fitting, so a fixed random_state is set to make
# the fitted coefficients and the predictions reproducible across
# Restart & Run All.
lr_model=LogisticRegression(
    penalty="l1",
    C=0.5,
    solver="saga",
    max_iter=300,
    random_state=42,
)
lr_model.fit(X,y)

# Predict a class for every row of the preprocessed test frame.
test_labels=lr_model.predict(df_test_n)

# Build the output table with columns 'ID' then 'Class'.
# NOTE(review): IDs are generated as 1..n in row order rather than read from the
# test file -- confirm this matches the IDs expected downstream.
y_pred=pd.DataFrame(test_labels,columns=['Class'])
y_pred.insert(0, 'ID', range(1, len(y_pred) + 1))

# Temporarily lift pandas' display limits so every prediction row is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(y_pred)