In [None]:
#Use scikit learn imputer
#Scaler
#Treat data imbalance


## Read Files

In [213]:
import pandas as pd
import numpy as np

from sklearn.utils import resample

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [214]:
data = pd.read_csv("training.csv", sep =";", thousands=',')
print(data.shape)
print()
print(data.columns)

(3700, 19)

Index(['variable1', 'variable2', 'variable3', 'variable4', 'variable5',
       'variable6', 'variable7', 'variable8', 'variable9', 'variable10',
       'variable11', 'variable12', 'variable13', 'variable14', 'variable15',
       'variable17', 'variable18', 'variable19', 'classLabel'],
      dtype='object')


In [215]:
print(data.head(10))

  variable1  variable2   variable3 variable4 variable5 variable6 variable7  \
0         a     1792.0     0.00054         u         g         c         v   
1         b     1692.0     0.00335         y         p         k         v   
2         b     3125.0  1125.00000         u         g        ff        ff   
3         a     4817.0  1335.00000         u         g         i         o   
4         b     3233.0    35.00000         u         g         k         v   
5         a     3483.0   125.00000         y         p         i         h   
6         a     2617.0     0.00020         u         g         j         j   
7         b     2117.0     0.00875         y         p         c         h   
8         b     2892.0     0.00375         u         g         c         v   
9         b     1817.0  1025.00000         u         g         c         h   

   variable8 variable9 variable10  variable11 variable12 variable13  \
0        175         f          t           1          t          g   

In [216]:
data2 = pd.read_csv("validation.csv", sep =";", thousands=',')
print(data2.shape)
print()
print(data2.columns)

(200, 19)

Index(['variable1', 'variable2', 'variable3', 'variable4', 'variable5',
       'variable6', 'variable7', 'variable8', 'variable9', 'variable10',
       'variable11', 'variable12', 'variable13', 'variable14', 'variable15',
       'variable17', 'variable18', 'variable19', 'classLabel'],
      dtype='object')


## Preprocessing

In [217]:
#Merge both datasets for mean and median calculations to take both into consideration
data = data.append(data2)
print(data.shape)

(3900, 19)


### Missing values

In [218]:
#Check for missing values
print(data.isnull().sum())

variable1       42
variable2       42
variable3        0
variable4       66
variable5       66
variable6       69
variable7       69
variable8        0
variable9        0
variable10       0
variable11       0
variable12       0
variable13       0
variable14     103
variable15       0
variable17     103
variable18    2256
variable19       0
classLabel       0
dtype: int64


In [219]:
#Drop columns that are having maximum number of null values in them
# Set the limit, Drop columns using that limit
limit = len(data) * 0.8
data = data.dropna(axis=1, thresh=limit)

print(data.columns)

Index(['variable1', 'variable2', 'variable3', 'variable4', 'variable5',
       'variable6', 'variable7', 'variable8', 'variable9', 'variable10',
       'variable11', 'variable12', 'variable13', 'variable14', 'variable15',
       'variable17', 'variable19', 'classLabel'],
      dtype='object')


In [220]:
#Fill numerical NaNs

#The mean of the numerical column data is used to replace null values when the data is normally distributed. Median is used 
#if the data comprised of outliers. Mode is used when the data having more occurences of a particular value or more frequent 
#value.

#Therefore get Summary Statistics of the data
data.describe()

Unnamed: 0,variable2,variable3,variable8,variable11,variable14,variable15,variable17,variable19
count,3858.0,3900.0,3900.0,3900.0,3797.0,3900.0,3797.0,3900.0
mean,2703.777346,1211.693287,704.745641,4.09,164.625494,2165.322051,1646255.0,0.90359
std,1685.206694,3165.764751,1895.032011,6.651155,159.43067,8497.227982,1594307.0,0.295191
min,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1917.0,0.00083,8.0,0.0,0.0,0.0,0.0,1.0
50%,2558.0,55.0,71.0,1.0,120.0,100.0,1200000.0,1.0
75%,3667.0,476.0,375.0,6.0,280.0,1000.0,2800000.0,1.0
max,8025.0,26335.0,14415.0,67.0,2000.0,100000.0,20000000.0,1.0


In [221]:
#Variable2: Large gap between max and 75th percentile than min and 75th percentile (Most of the points are towards lower values),
#then data isn't evenly distributed, dont use mean. Use median.
#Variable3, 8, 11, 14, 15, 17: Most of the points are towards higher values.
#Variable19: use mean

data['variable2'] = data['variable2'].replace(np.nan, data['variable2'].median())
data['variable3'] = data['variable3'].replace(np.nan, data['variable3'].median())
data['variable8'] = data['variable8'].replace(np.nan, data['variable8'].median())
data['variable11'] = data['variable11'].replace(np.nan, data['variable11'].median())
data['variable14'] = data['variable14'].replace(np.nan, data['variable14'].median())
data['variable15'] = data['variable15'].replace(np.nan, data['variable15'].median())
data['variable17'] = data['variable17'].replace(np.nan, data['variable17'].median())
data['variable19'] = data['variable19'].replace(np.nan, data['variable19'].median())

In [222]:
#Re-check
print(data.isnull().sum())

variable1     42
variable2      0
variable3      0
variable4     66
variable5     66
variable6     69
variable7     69
variable8      0
variable9      0
variable10     0
variable11     0
variable12     0
variable13     0
variable14     0
variable15     0
variable17     0
variable19     0
classLabel     0
dtype: int64


In [223]:
#Categorical data
#Since count of missing values is small relative to whole training set, replace with most repeating value
data['variable1'] = data['variable1'].fillna(data['variable1'].value_counts().index[0])
data['variable4'] = data['variable4'].fillna(data['variable4'].value_counts().index[0])
data['variable5'] = data['variable5'].fillna(data['variable5'].value_counts().index[0])
data['variable6'] = data['variable6'].fillna(data['variable6'].value_counts().index[0])
data['variable7'] = data['variable7'].fillna(data['variable7'].value_counts().index[0])

In [224]:
#Re-check
print(data.isnull().sum())

variable1     0
variable2     0
variable3     0
variable4     0
variable5     0
variable6     0
variable7     0
variable8     0
variable9     0
variable10    0
variable11    0
variable12    0
variable13    0
variable14    0
variable15    0
variable17    0
variable19    0
classLabel    0
dtype: int64


In [225]:
print(data.head(10))

  variable1  variable2   variable3 variable4 variable5 variable6 variable7  \
0         a     1792.0     0.00054         u         g         c         v   
1         b     1692.0     0.00335         y         p         k         v   
2         b     3125.0  1125.00000         u         g        ff        ff   
3         a     4817.0  1335.00000         u         g         i         o   
4         b     3233.0    35.00000         u         g         k         v   
5         a     3483.0   125.00000         y         p         i         h   
6         a     2617.0     0.00020         u         g         j         j   
7         b     2117.0     0.00875         y         p         c         h   
8         b     2892.0     0.00375         u         g         c         v   
9         b     1817.0  1025.00000         u         g         c         h   

   variable8 variable9 variable10  variable11 variable12 variable13  \
0        175         f          t           1          t          g   

### Check for imbalance

In [226]:
data.loc[data['classLabel'] == 'no.']

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable19,classLabel
0,a,1792.0,0.00054,u,g,c,v,175,f,t,1,t,g,80.0,5,800000.0,0,no.
1,b,1692.0,0.00335,y,p,k,v,29,f,f,0,f,s,200.0,0,2000000.0,0,no.
2,b,3125.0,1125.00000,u,g,ff,ff,0,f,t,1,f,g,96.0,19,960000.0,0,no.
3,a,4817.0,1335.00000,u,g,i,o,335,f,f,0,f,g,0.0,120,0.0,0,no.
4,b,3233.0,35.00000,u,g,k,v,5,f,f,0,t,g,232.0,0,2320000.0,0,no.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,b,1742.0,65.00000,u,g,i,v,125,f,f,0,f,g,60.0,100,600000.0,1,no.
165,b,4117.0,1335.00000,u,g,d,v,165,f,f,0,f,g,168.0,0,1680000.0,0,no.
166,b,2725.0,0.00625,u,g,aa,v,455,t,f,0,t,g,200.0,0,2000000.0,1,no.
167,b,3633.0,379.00000,u,g,W,v,1165,t,f,0,t,g,200.0,0,2000000.0,0,no.


### Encoding categorical variables

In [227]:
#First split data into X and Y
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

print(Y)

0       no.
1       no.
2       no.
3       no.
4       no.
       ... 
195    yes.
196    yes.
197    yes.
198    yes.
199    yes.
Name: classLabel, Length: 3900, dtype: object


In [228]:
# One-hot encoding of categorical input
print(X.shape)
X = pd.get_dummies(X, columns=['variable1','variable4','variable5','variable6','variable7','variable9','variable10',
                                     'variable12','variable13'])
print()
print(X.shape)
print(X.head())

(3900, 17)

(3900, 48)
   variable2   variable3  variable8  variable11  variable14  variable15  \
0     1792.0     0.00054        175           1        80.0           5   
1     1692.0     0.00335         29           0       200.0           0   
2     3125.0  1125.00000          0           1        96.0          19   
3     4817.0  1335.00000        335           0         0.0         120   
4     3233.0    35.00000          5           0       232.0           0   

   variable17  variable19  variable1_a  variable1_b  ...  variable7_z  \
0    800000.0           0            1            0  ...            0   
1   2000000.0           0            0            1  ...            0   
2    960000.0           0            0            1  ...            0   
3         0.0           0            1            0  ...            0   
4   2320000.0           0            0            1  ...            0   

   variable9_f  variable9_t  variable10_f  variable10_t  variable12_f  \
0            1

In [229]:
#Label encoding 
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
labels = list(Y)
print(labels)
print(len(labels))
labels = le.fit_transform(labels)
print()
print(len(labels))
print(labels)

Y = pd.DataFrame(labels, columns=['classLabel'])


['no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.

In [230]:
#split data into train and validation
X_train = X.iloc[:-200, :]
X_valid = X.iloc[3700:3900, :]
Y_train = Y.iloc[:-200]
Y_valid = Y.iloc[3700:3900]

print(X.shape)
print(Y.shape)
print()
print(X_train.shape)
print(X_valid.shape)
print(Y_train.shape)
print(Y_valid.shape)

(3900, 48)
(3900, 1)

(3700, 48)
(200, 48)
(3700, 1)
(200, 1)


In [231]:
#Feature scaling for X
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_valid = sc_X.transform(X_valid) 


In [236]:
#upsample minority class (no.)

#combine them back for resampling
train_data = pd.concat([pd.DataFrame(X_train), pd.DataFrame(Y_train)], axis=1)
print(train_data)

# separate minority and majority classes
negative = train_data.loc[train_data['classLabel'] == 0]
positive = train_data.loc[train_data['classLabel'] == 1]
print(negative.shape)

# upsample minority, ample with replacement, match number in majority class
neg_upsampled = resample(negative, replace=True, n_samples=len(positive))

# combine majority and upsampled minority
upsampled = pd.concat([positive, neg_upsampled])
print(upsampled.shape)
                     
X_train = upsampled.iloc[:, :-1]
Y_train = upsampled.iloc[:, -1]
print(X_train.shape)
print(Y_train.shape)


            0         1         2         3         4         5         6  \
39   1.206365 -0.382679 -0.230793 -0.468173  1.112959 -0.155581  1.112959   
40   0.398651 -0.382678 -0.178992 -0.616329  0.898752 -0.252970  0.898752   
41  -1.577216 -0.355809 -0.365993 -0.468173 -0.477363 -0.028333 -0.477363   
42   0.368912 -0.349803 -0.339574 -0.616329  0.015961  3.334901  0.015961   
43  -1.589111 -0.382676 -0.319372 -0.616329 -0.010003  0.414968 -0.010003   
..        ...       ...       ...       ...       ...       ...       ...   
12  -0.523262 -0.382363 -0.287774 -0.616329  1.158397 -0.258023  1.158397   
32  -1.574837 -0.377937 -0.373245 -0.616329 -0.399470 -0.254922 -0.399470   
83   1.478775 -0.382679 -0.365475 -0.616329  0.119819 -0.257563  0.119819   
486  0.467645 -0.374776 -0.371691 -0.616329  0.249641 -0.258023  0.249641   
26  -1.577216  1.066708 -0.372727 -0.616329  0.509286 -0.258023  0.509286   

            7         8         9  ...        39        40        41  \
39 

## Classifying

### Logistic Regression

In [241]:
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

Y_pred  = classifier.predict(X_valid)
print(confusion_matrix(Y_valid , Y_pred))
print(classification_report(Y_valid, Y_pred))

[[53 54]
 [47 46]]
              precision    recall  f1-score   support

           0       0.53      0.50      0.51       107
           1       0.46      0.49      0.48        93

    accuracy                           0.49       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.50      0.49      0.50       200





### KNN

In [242]:
classifier = KNeighborsClassifier()

#Use gridsearch to find optimum k
#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(classifier, params_knn, cv=5)
#fit model to training data
knn_gs.fit(X_train, Y_train)

knn_best = knn_gs.best_estimator_
#check best n_neigbors value
print(knn_gs.best_params_)

{'n_neighbors': 1}


In [243]:
y_pred = knn_best.predict(X_valid)
print(confusion_matrix(Y_valid , Y_pred))
print(classification_report(Y_valid, Y_pred))

[[53 54]
 [47 46]]
              precision    recall  f1-score   support

           0       0.53      0.50      0.51       107
           1       0.46      0.49      0.48        93

    accuracy                           0.49       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.50      0.49      0.50       200



In [244]:
#No gridsearch
classifier = KNeighborsClassifier(n_neighbors = 11)
classifier.fit(X_train, Y_train)
y_pred = classifier.predict(X_valid)
print(confusion_matrix(Y_valid , Y_pred))
print(classification_report(Y_valid, Y_pred))

[[53 54]
 [47 46]]
              precision    recall  f1-score   support

           0       0.53      0.50      0.51       107
           1       0.46      0.49      0.48        93

    accuracy                           0.49       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.50      0.49      0.50       200



### Random Forest Classifier

In [245]:
#No gridsearch
classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy' , random_state = 0)
classifier.fit(X_train , Y_train) 
y_pred = classifier.predict(X_valid)
print(confusion_matrix(Y_valid , Y_pred))
print(classification_report(Y_valid, Y_pred))

[[53 54]
 [47 46]]
              precision    recall  f1-score   support

           0       0.53      0.50      0.51       107
           1       0.46      0.49      0.48        93

    accuracy                           0.49       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.50      0.49      0.50       200

