In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load Training Data

In [2]:
train_df = pd.read_csv('Datasets/training.csv', delimiter=';')
train_df.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,a,1792,0.00054,u,g,c,v,175,f,t,1,t,g,80.0,5,800000.0,t,0,no.
1,b,1692,0.00335,y,p,k,v,29,f,f,0,f,s,200.0,0,2000000.0,,0,no.
2,b,3125,1125.0,u,g,ff,ff,0,f,t,1,f,g,96.0,19,960000.0,t,0,no.
3,a,4817,1335.0,u,g,i,o,335,f,f,0,f,g,0.0,120,0.0,,0,no.
4,b,3233,35.0,u,g,k,v,5,f,f,0,t,g,232.0,0,2320000.0,f,0,no.


# Clean & Prepare Data

In [3]:
# Correct format of numbers
train_df['classLabel'] = train_df['classLabel'].map({'no.': 'no', 'yes.': 'yes'})
train_df['variable2'] = train_df['variable2'].replace({',': '.'},regex=True)
train_df['variable3'] = train_df['variable3'].replace({',': '.'},regex=True)
train_df['variable8'] = train_df['variable8'].replace({',': '.'},regex=True)
# Correct data types of columns
train_df['variable2'] = train_df['variable2'].astype('float64')
train_df['variable3'] = train_df['variable3'].astype('float64')
train_df['variable8'] = train_df['variable8'].astype('float64')

In [4]:
# Convert Categorical columns to its numerical codes
train_df['variable1'] = train_df['variable1'].astype("category").cat.codes
train_df['variable4'] = train_df['variable4'].astype("category").cat.codes
train_df['variable5'] = train_df['variable5'].astype("category").cat.codes
train_df['variable6'] = train_df['variable6'].astype("category").cat.codes
train_df['variable7'] = train_df['variable7'].astype("category").cat.codes
train_df['variable9'] = train_df['variable4'].astype("category").cat.codes
train_df['variable10'] = train_df['variable10'].astype("category").cat.codes
train_df['variable12'] = train_df['variable12'].astype("category").cat.codes
train_df['variable13'] = train_df['variable13'].astype("category").cat.codes

train_df['classLabel'] = train_df['classLabel'].astype("category").cat.codes

In [5]:
# Since varible18 has 2145 missing value and that approx. is 0.6 of the data set. Therefore we can neglect it
train_df.drop('variable18', axis=1, inplace=True)
# Drop NaN values from columns of type string
train_df = train_df.dropna(subset=['variable1', 'variable4', 'variable5', 'variable6', 'variable7'])
train_df.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable19,classLabel
0,0,17.92,5.4e-05,1,0,2,7,1.75,2,1,1,1,0,80.0,5,800000.0,0,0
1,1,16.92,3.4e-05,2,2,9,7,0.29,3,0,0,0,2,200.0,0,2000000.0,0,0
2,1,31.25,0.000112,1,0,6,2,0.0,2,1,1,0,0,96.0,19,960000.0,0,0
3,0,48.17,0.000133,1,0,7,6,0.335,2,0,0,0,0,0.0,120,0.0,0,0
4,1,32.33,0.00035,1,0,9,7,0.5,2,0,0,1,0,232.0,0,2320000.0,0,0


In [6]:
train_df['variable2'] = train_df['variable2'].fillna(round(train_df['variable2'].mean(), 2))
train_df['variable14'] = train_df['variable14'].fillna(round(train_df['variable14'].mean(), 2))
train_df['variable17'] = train_df['variable17'].fillna(round(train_df['variable17'].mean(), 2))

# Train Our Model

In [7]:
X_train = train_df[["variable1", "variable2", "variable3", "variable4", "variable5", "variable6", "variable7", "variable8", "variable9", "variable10", "variable11", "variable12", "variable13", "variable14", "variable15", "variable17", "variable19"]].values
y_train = train_df['classLabel'].values

In [8]:
LR = LogisticRegression()
LR.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
LR.coef_

array([[ 1.96324928e-05,  1.11022256e-03,  2.53474838e-08,
         2.82299459e-05, -6.27487098e-07,  1.68991619e-04,
         1.47888169e-04,  1.82217931e-04,  5.87743139e-05,
         2.46187207e-05,  1.92283270e-04,  1.50874434e-05,
         4.55505835e-06,  5.30337312e-09,  2.65905298e-03,
         6.92926371e-07,  4.83909682e-05]])

In [10]:
# We can neglect features with low coef.
X_train = train_df[[ "variable4", "variable6", "variable8", "variable10", "variable11", "variable12", "variable13", "variable15", "variable19"]].values
LR.fit(X_train, y_train)
LR.coef_



array([[-1.16227976e+00, -4.73034006e-02,  6.23497098e-02,
        -2.78453435e-02,  1.38868509e-01, -2.96772893e-02,
        -1.32804947e-01,  1.99532385e-04,  7.54137228e+00]])

### After trying many feature selection algorithms like SelectFromModel and RFE, & also tried different classification algorithms. NO selection algorithm make good improvement in the process of feature selection and the maximum accuracy score was 0.515.

### So I start see the coefficient of each feature and remove the features with the lowest coefficient, I did this several times until I reached the accuracy that I think it's the best on the Validation set    

# Start Working on Validation Data

In [11]:
val_df = pd.read_csv('Datasets/validation.csv', delimiter=';')
val_df.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,b,3233,75.0,u,g,e,bb,1585,t,f,0,t,s,420.0,0,4200000.0,,1,no.
1,b,2358,179.0,u,g,c,v,54,f,f,0,t,g,136.0,1,1360000.0,,0,no.
2,b,3642,0.00075,y,p,d,v,585,f,f,0,f,g,240.0,3,2400000.0,,1,no.
3,b,1842,10415.0,y,p,aa,v,125,t,f,0,f,g,120.0,375,1200000.0,,0,no.
4,b,245,13335.0,y,p,aa,v,4,f,f,0,t,g,120.0,475,1200000.0,f,1,no.


In [12]:
# Correct format of numbers
val_df['classLabel'] = val_df['classLabel'].map({'no.': 'no', 'yes.': 'yes'})
val_df['variable2'] = val_df['variable2'].replace({',': '.'},regex=True)
val_df['variable3'] = val_df['variable3'].replace({',': '.'},regex=True)
val_df['variable8'] = val_df['variable8'].replace({',': '.'},regex=True)
# Correct data types of columns
val_df['variable2'] = val_df['variable2'].astype('float64')
val_df['variable3'] = val_df['variable3'].astype('float64')
val_df['variable8'] = val_df['variable8'].astype('float64')

# Convert Categorical columns to its numerical codes
val_df['variable1'] = val_df['variable1'].astype("category").cat.codes
val_df['variable4'] = val_df['variable4'].astype("category").cat.codes
val_df['variable5'] = val_df['variable5'].astype("category").cat.codes
val_df['variable6'] = val_df['variable6'].astype("category").cat.codes
val_df['variable7'] = val_df['variable7'].astype("category").cat.codes
val_df['variable9'] = val_df['variable4'].astype("category").cat.codes
val_df['variable10'] = val_df['variable10'].astype("category").cat.codes
val_df['variable12'] = val_df['variable12'].astype("category").cat.codes
val_df['variable13'] = val_df['variable13'].astype("category").cat.codes

val_df['classLabel'] = val_df['classLabel'].astype("category").cat.codes

In [13]:
# Since varible18 has 2145 missing value and that approx. is 0.6 of the data set. Therefore we can neglect it
val_df.drop('variable18', axis=1, inplace=True)
# Drop NaN values from columns of type string
val_df = val_df.dropna(subset=['variable1', 'variable4', 'variable5', 'variable6', 'variable7'])
val_df.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable19,classLabel
0,1,32.33,0.00075,0,0,5,0,1.585,1,0,0,1,2,420.0,0,4200000.0,1,0
1,1,23.58,0.000179,0,0,2,6,0.54,1,0,0,1,0,136.0,1,1360000.0,0,0
2,1,36.42,7.5e-05,1,1,4,6,0.585,2,0,0,0,0,240.0,3,2400000.0,1,0
3,1,18.42,0.001042,1,1,1,6,0.125,2,0,0,0,0,120.0,375,1200000.0,0,0
4,1,24.5,0.001334,1,1,1,6,0.04,2,0,0,1,0,120.0,475,1200000.0,1,0


In [14]:
val_df['variable2'] = val_df['variable2'].fillna(round(val_df['variable2'].mean(), 2))
val_df['variable14'] = val_df['variable14'].fillna(round(val_df['variable14'].mean(), 2))
val_df['variable17'] = val_df['variable17'].fillna(round(val_df['variable17'].mean(), 2))

In [15]:
X_val = val_df[["variable4", "variable6", "variable8", "variable10", "variable11", "variable12", "variable13", "variable15", "variable19"]].values

In [16]:
predict = LR.predict(X_val)
predict

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1], dtype=int8)

In [17]:
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss

In [18]:
y_val = val_df['classLabel'].values
accuracy_score(y_val, predict)

0.565

In [19]:
mean_squared_error(y_val, predict)

0.435

In [20]:
log_loss(y_val, predict)

15.0245916190666