# 1. Importing Necessary Modules

In [35]:
# importing necessary modules for this notebook

import pandas as pd
from sklearn.preprocessing import scale #for scaling
from sklearn.model_selection import train_test_split # for train and test data split
from sklearn.linear_model import LogisticRegression # for Logistic Regression Classification model
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score,roc_curve # for evaluation of classification models
from sklearn.tree import DecisionTreeClassifier # for Decision Tree Classifier model
from sklearn.ensemble import RandomForestClassifier # for Random Forest Classifier model
from sklearn.tree import DecisionTreeClassifier # for Decision Tree Classifier model
from sklearn.neighbors import KNeighborsClassifier # for K Neighbor Classifier model
from sklearn.naive_bayes import GaussianNB # for Naive Bayes Classification model
from sklearn.svm import SVC # for svm Classification model
from sklearn.model_selection import GridSearchCV # for hyper parameter tuning using grid search method
from imblearn.over_sampling import SMOTE # for data balancing using smote method

# 2. Loading The Preprocessed Data

In [25]:
# Loading the preprocessed dataser "Preprocessed_Data"

data=pd.read_csv("Preprocessed_Data.csv")

# Dataset Features

data.shape

(583, 10)

In [26]:
data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,187,16,18,6.8,3.3,0.9,1
1,62,1,1.0,208,64,100,7.5,3.2,0.74,1
2,62,1,1.0,208,60,68,7.0,3.3,0.89,1
3,58,1,1.0,182,14,20,6.8,3.4,1.0,1
4,72,1,1.0,195,27,59,7.3,2.4,0.4,1


# 3. X and y split

In [27]:
# splitting the data into independent and target variables X and y repectively

# storing Target column "Dataset" in y
y=data['Dataset']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Dataset, dtype: int64

In [29]:
# storing independent columns in X
X=data.drop(columns=['Dataset'],axis=1)
X.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,65,0,0.7,187,16,18,6.8,3.3,0.9
1,62,1,1.0,208,64,100,7.5,3.2,0.74
2,62,1,1.0,208,60,68,7.0,3.3,0.89
3,58,1,1.0,182,14,20,6.8,3.4,1.0
4,72,1,1.0,195,27,59,7.3,2.4,0.4


# 4. Scaling the X values

In [39]:
# scaling the X values using standardization technique

X_scaled=pd.DataFrame(scale(X),columns=X.columns)
X_scaled.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,1.252098,-1.762281,-1.322873,-0.996681,-0.354665,-0.90554,0.287534,0.198969,-0.099436
1,1.066637,0.567446,0.72365,-0.241023,-0.091599,1.473568,0.976568,0.073157,-0.673765
2,1.066637,0.567446,0.72365,-0.241023,-0.113522,0.545135,0.484401,0.198969,-0.135332
3,0.819356,0.567446,0.72365,-1.1766,-0.365626,-0.847513,0.287534,0.324781,0.259519
4,1.684839,0.567446,0.72365,-0.708811,-0.294379,0.284014,0.779701,-0.93334,-1.894214


# 5. Test and Train Split

In [41]:
# splitting the data into train and test dataset into 7:3 ratio

X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.3,random_state=0)

In [42]:
X_train.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
271,-2.518935,0.567446,-0.640699,-0.241023,0.390688,-0.209216,-0.007767,0.073157,-0.099436
318,-0.417048,0.567446,0.72365,0.046847,0.538663,-0.209216,1.271869,1.708714,1.336386
552,0.015694,0.567446,-2.005047,-0.672828,-0.283418,-0.557378,-0.696801,-0.30428,0.259519
579,-0.293407,0.567446,-2.005047,-0.241023,-0.250535,-0.528364,-0.499934,0.073157,0.618475
196,0.942997,0.567446,0.775135,-0.241023,-0.157366,1.589622,-0.499934,-1.310776,-1.427572


In [44]:
X_test.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
246,0.633896,0.567446,0.775135,2.061935,-0.321782,0.864284,-0.401501,-0.555904,-0.817347
92,0.942997,0.567446,0.72365,2.997512,1.81563,-0.209216,0.878135,-0.178468,-1.176303
386,-0.417048,0.567446,0.72365,-0.241023,-0.305341,-0.818499,0.976568,1.331277,0.977431
186,0.942997,0.567446,0.72365,-0.241023,-0.009391,2.779175,1.271869,0.073157,-0.853243
389,1.684839,0.567446,-2.005047,-0.241023,-0.272457,-0.41231,-0.204634,0.073157,0.259519


In [46]:
y_train.head()

271    2
318    1
552    1
579    1
196    1
Name: Dataset, dtype: int64

In [55]:
y_test.head()

246    1
92     1
386    2
186    1
389    1
Name: Dataset, dtype: int64

# 6. Balancing the data

In [59]:
# Balncing the train data using smote method

smote=SMOTE()

X_train_smote,y_train_smote=smote.fit_resample(X_train,y_train)


# 7. Training and Testing the model

In [53]:
# The algorithms used in this model are

#  1.logistic regression
#  2.decision tree model
#  3.Random forest model
#  4.KNN
#  5.Naive Bayes
#  6.svm


# 7.1. Logistic Regression Algorithm 

In [60]:
#initializing the model

lr=LogisticRegression()


In [63]:
# hyper parameter tuning the model

parameters={
          "random_state":[0,1,42]
           }

gslr=GridSearchCV(estimator=lr,param_grid=parameters)

gslr.fit(X_train_smote,y_train_smote)

gslr.best_score_

0.6955526582645226

In [66]:
gslr.best_params_ # finding the best parameters for this model

{'random_state': 0}

In [68]:
lr1=LogisticRegression(random_state=0)

In [70]:
#Training the model

lr1.fit(X_train_smote,y_train_smote)
lr1_train_predict=lr1.predict(X_train_smote)
lr1_train_predict


array([2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2,
       2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2,
       2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1,
       2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2,
       2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1,
       2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2,
       1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2,
       1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2,
       1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2,

In [71]:
# Testing The model

lr1_test_predict=lr1.predict(X_test)
lr1_test_predict


array([1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2,
       2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2,
       1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2,
       1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2,
       1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
       2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 1,
       2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2],
      dtype=int64)

# 7.2. Decision Tree Classifier

In [74]:
#initializing the model

dt=DecisionTreeClassifier()

In [76]:
# hyper parameter tuning the model

parameters={
           "criterion":["gini", "entropy", "log_loss"],
           "random_state":[0,1,42]
           }

gsdt=GridSearchCV(estimator=dt,param_grid=parameters)

gsdt.fit(X_train_smote,y_train_smote)

gsdt.best_score_

0.7330291177748804

In [77]:
gsdt.best_params_ # finding the best parameters for this model

{'criterion': 'entropy', 'random_state': 0}

In [79]:
dt1=DecisionTreeClassifier(criterion='entropy',random_state=0)

In [80]:
#Training the model

dt1.fit(X_train_smote,y_train_smote)
dt1_train_predict=dt1.predict(X_train_smote)
dt1_train_predict


array([2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1,
       1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2,
       1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2,
       1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2,
       1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2,

In [82]:
# Testing The model

dt1_test_predict=dt1.predict(X_test)
dt1_test_predict


array([1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
       2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2,
       1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1,
       2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1,
       2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1,
       1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2],
      dtype=int64)

# 7.3. Random Forest Classifier

In [84]:
#initializing the model

rf=RandomForestClassifier()

In [85]:
# hyper parameter tuning the model

parameters={
           "criterion":["gini", "entropy", "log_loss"],
           "random_state":[0,1,42]
           }

gsrf=GridSearchCV(estimator=rf,param_grid=parameters)

gsrf.fit(X_train_smote,y_train_smote)

gsrf.best_score_

0.8112270027524264

In [86]:
gsrf.best_params_ # finding the best parameters for this model

{'criterion': 'gini', 'random_state': 0}

In [87]:
rf1=RandomForestClassifier(criterion='gini',random_state=0)

In [88]:
#Training the model

rf1.fit(X_train_smote,y_train_smote)
rf1_train_predict=rf1.predict(X_train_smote)
rf1_train_predict

array([2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1,
       1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2,
       1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2,
       1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2,
       1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2,

In [90]:
# Testing The model

rf1_test_predict=rf1.predict(X_test)
rf1_test_predict


array([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1,
       2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1,
       1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1],
      dtype=int64)

# 7.4 K Nearest Neighbors Classifier

In [92]:
#initializing the model

knn=KNeighborsClassifier()

In [93]:
# hyper parameter tuning the model

parameters={
           "n_neighbors":[5,7,9]
           }

gsknn=GridSearchCV(estimator=knn,param_grid=parameters)

gsknn.fit(X_train_smote,y_train_smote)

gsknn.best_score_

0.7414892075909025

In [94]:
gsknn.best_params_ # finding the best parameters for this model

{'n_neighbors': 5}

In [95]:
knn1=KNeighborsClassifier(n_neighbors=5)

In [96]:
#Training the model

knn1.fit(X_train_smote,y_train_smote)
knn1_train_predict=knn1.predict(X_train_smote)
knn1_train_predict

array([2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1,
       2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2,
       2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2,
       2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2,
       1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1,
       2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 2,
       1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2,
       1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2,
       1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2,
       2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2,
       1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2,

In [97]:
# Testing The model

knn1_test_predict=knn1.predict(X_test)
knn1_test_predict

array([1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1,
       1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2,
       1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 1,
       1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1,
       2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2,
       1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1,
       2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2],
      dtype=int64)

# 7.5. Naive Bayes Algorithm

In [99]:
#initializing the model

nb1=GaussianNB()

In [100]:
#Training the model

nb1.fit(X_train_smote,y_train_smote)
nb1_train_predict=nb1.predict(X_train_smote)
nb1_train_predict

array([1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2,
       2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2,
       1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2,
       2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2,
       2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2,
       2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2,
       1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2,
       1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1,
       2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2,

In [101]:
# Testing The model

nb1_test_predict=nb1.predict(X_test)
nb1_test_predict

array([1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2,
       2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2,
       1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2],
      dtype=int64)

# 7.6. Support Vector Machine

In [102]:
#initializing the model

svc=SVC()

In [103]:
# hyper parameter tuning the model

parameters={
           "kernel":['linear','rbf'],
           "random_state":[0,1,42]
           }

gssvc=GridSearchCV(estimator=svc,param_grid=parameters)

gssvc.fit(X_train_smote,y_train_smote)

gssvc.best_score_

0.7278574532811821

In [104]:
gssvc.best_params_ # finding the best parameters for this model

{'kernel': 'rbf', 'random_state': 0}

In [105]:
svc1=SVC(kernel='rbf', random_state= 0)

In [106]:
#Training the model

svc1.fit(X_train_smote,y_train_smote)
svc1_train_predict=svc1.predict(X_train_smote)
svc1_train_predict

array([2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2,
       2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2,
       1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1,
       2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2,
       2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1,
       1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1,
       2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1,
       1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2,
       1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2,
       1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2,
       1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2,

In [107]:
# Testing The model

svc1_test_predict=svc1.predict(X_test)
svc1_test_predict

array([1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2,
       1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1,
       1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 2,
       1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2,
       1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2,
       2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1,
       2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2],
      dtype=int64)

# 8. Evaluating the performance of the models

In [108]:
# The models are evaluated using
# 1. accuracy score
# 2. confusion matrix
# 3. classification report#

# 8.1. Accuracy score

In [113]:
# logistic regression Algorithm

print("1. logistic regression Algorithm\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,lr1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,lr1_test_predict))

1. logistic regression Algorithm

On Training data:  0.70578231292517
On Testing Data:  0.68


In [115]:
# Decision Tree Classifier

print("2. Decision Tree Classifier\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,dt1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,dt1_test_predict))

2. Decision Tree Classifier

On Training data:  1.0
On Testing Data:  0.6571428571428571


In [117]:
# Random Forest Classifier

print("3. Random Forest Classifier\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,rf1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,rf1_test_predict))

3. Random Forest Classifier

On Training data:  1.0
On Testing Data:  0.6971428571428572


In [119]:
# K Nearest Neighbors

print("4. K Nearest Neighbors\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,knn1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,knn1_test_predict))

4. K Nearest Neighbors

On Training data:  0.8333333333333334
On Testing Data:  0.6228571428571429


In [121]:
# Naive Bayes

print("5. Naive Bayes\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,nb1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,nb1_test_predict))

5. Naive Bayes

On Training data:  0.6479591836734694
On Testing Data:  0.52


In [122]:
# Support Vector Machine

print("6. Support Vector Machine\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,svc1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,svc1_test_predict))

6. Support Vector Machine

On Training data:  0.782312925170068
On Testing Data:  0.6457142857142857


# 8.2. confusion matrix

In [129]:
# logistic regression Algorithm

print("1. logistic regression Algorithm")
confusion_matrix(y_test,lr1_test_predict)

1. logistic regression Algorithm


array([[76, 46],
       [10, 43]], dtype=int64)

In [131]:
# Decision Tree Classifier

print("2. Decision Tree Classifier")
confusion_matrix(y_test,dt1_test_predict)

2. Decision Tree Classifier


array([[88, 34],
       [26, 27]], dtype=int64)

In [132]:
# Random Forest Classifier

print("3. Random Forest Classifier")
confusion_matrix(y_test,rf1_test_predict)

3. Random Forest Classifier


array([[100,  22],
       [ 31,  22]], dtype=int64)

In [133]:
# K Nearest Neighbors

print("4. K Nearest Neighbors")
confusion_matrix(y_test,knn1_test_predict)

4. K Nearest Neighbors


array([[78, 44],
       [22, 31]], dtype=int64)

In [134]:
# Naive Bayes

print("5. Naive Bayes")
confusion_matrix(y_test,nb1_test_predict)

5. Naive Bayes


array([[44, 78],
       [ 6, 47]], dtype=int64)

In [135]:
# Support Vector Machine

print("6. Support Vector Machine")
confusion_matrix(y_test,svc1_test_predict)

6. Support Vector Machine


array([[75, 47],
       [15, 38]], dtype=int64)

# 8.3. Classification Report

In [143]:
# logistic regression Algorithm

print("1. logistic regression Algorithm")
print(classification_report(y_test,lr1_test_predict))

1. logistic regression Algorithm
              precision    recall  f1-score   support

           1       0.88      0.62      0.73       122
           2       0.48      0.81      0.61        53

    accuracy                           0.68       175
   macro avg       0.68      0.72      0.67       175
weighted avg       0.76      0.68      0.69       175



In [144]:
# Decision Tree Classifier

print("2. Decision Tree Classifier")
print(classification_report(y_test,dt1_test_predict))

2. Decision Tree Classifier
              precision    recall  f1-score   support

           1       0.77      0.72      0.75       122
           2       0.44      0.51      0.47        53

    accuracy                           0.66       175
   macro avg       0.61      0.62      0.61       175
weighted avg       0.67      0.66      0.66       175



In [145]:
# Random Forest Classifier

print("3.  Random Forest Classifier")
print(classification_report(y_test,rf1_test_predict))

3.  Random Forest Classifier
              precision    recall  f1-score   support

           1       0.76      0.82      0.79       122
           2       0.50      0.42      0.45        53

    accuracy                           0.70       175
   macro avg       0.63      0.62      0.62       175
weighted avg       0.68      0.70      0.69       175



In [146]:
# K Nearest Neighbors

print("4. K Nearest Neighbors")
print(classification_report(y_test,knn1_test_predict))

4. K Nearest Neighbors
              precision    recall  f1-score   support

           1       0.78      0.64      0.70       122
           2       0.41      0.58      0.48        53

    accuracy                           0.62       175
   macro avg       0.60      0.61      0.59       175
weighted avg       0.67      0.62      0.64       175



In [147]:
# Naive Bayes

print("5. Naive Bayes")
print(classification_report(y_test,nb1_test_predict))

5. Naive Bayes
              precision    recall  f1-score   support

           1       0.88      0.36      0.51       122
           2       0.38      0.89      0.53        53

    accuracy                           0.52       175
   macro avg       0.63      0.62      0.52       175
weighted avg       0.73      0.52      0.52       175



In [148]:
# Support Vector Machine

print("6. Support Vector Machine")
print(classification_report(y_test,svc1_test_predict))

6. Support Vector Machine
              precision    recall  f1-score   support

           1       0.83      0.61      0.71       122
           2       0.45      0.72      0.55        53

    accuracy                           0.65       175
   macro avg       0.64      0.67      0.63       175
weighted avg       0.72      0.65      0.66       175



# 9. Selecting and saving the model

In [149]:
# From the performance of each model "Naive Bayes" gives the best performance
# so this algorithm is used to predict the result for our model

# Saving the model

from joblib import Parallel, delayed
import joblib

# save the model as a pickle in a file
joblib.dump(nb1, 'selected_model.pkl')

['selected_model.pkl']