# 1. Importing Necessary Modules

In [1]:
# importing necessary modules for this notebook

import pandas as pd
from sklearn.preprocessing import scale #for scaling
from sklearn.model_selection import train_test_split # for train and test data split
from sklearn.linear_model import LogisticRegression # for Logistic Regression Classification model
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score,roc_curve # for evaluation of classification models
from sklearn.tree import DecisionTreeClassifier # for Decision Tree Classifier model
from sklearn.ensemble import RandomForestClassifier # for Random Forest Classifier model
from sklearn.tree import DecisionTreeClassifier # for Decision Tree Classifier model
from sklearn.neighbors import KNeighborsClassifier # for K Neighbor Classifier model
from sklearn.naive_bayes import GaussianNB # for Naive Bayes Classification model
from sklearn.svm import SVC # for svm Classification model
from sklearn.model_selection import GridSearchCV # for hyper parameter tuning using grid search method
from imblearn.over_sampling import SMOTE # for data balancing using smote method

# 2. Loading The Preprocessed Data

In [2]:
# Loading the preprocessed dataset "Preprocessed_Data1"

data=pd.read_csv("Preprocessed_Data1.csv")

# Dataset Features

data.shape

(583, 11)

In [3]:
data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


# 3. X and y split

In [4]:
# splitting the data into independent and target variables X and y repectively

# storing Target column "Dataset" in y
y=data['Dataset']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Dataset, dtype: int64

In [5]:
# storing independent columns in X
X=data.drop(columns=['Dataset'],axis=1)
X.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4


# 4. Scaling the X values

In [6]:
# scaling the X values using normalization technique

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler().fit(X)
X_scaled=pd.DataFrame(data=scaler.transform(X), columns=X.columns)

X_scaled.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,0.709302,0.0,0.004021,0.0,0.060576,0.003015,0.001626,0.594203,0.521739,0.24
1,0.674419,1.0,0.140751,0.27551,0.310699,0.027136,0.018296,0.695652,0.5,0.176
2,0.674419,1.0,0.092493,0.204082,0.208598,0.025126,0.011791,0.623188,0.521739,0.236
3,0.627907,1.0,0.008043,0.015306,0.058134,0.00201,0.002033,0.594203,0.543478,0.28
4,0.790698,1.0,0.046917,0.096939,0.064485,0.008543,0.009961,0.666667,0.326087,0.04


# 5. Test and Train Split

In [7]:
# splitting the data into train and test dataset into 7:3 ratio

X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.3,random_state=0)

In [8]:
X_train.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
271,0.0,1.0,0.005362,0.005102,0.193942,0.071357,0.044928,0.550725,0.5,0.24
318,0.395349,1.0,0.044236,0.107143,0.074744,0.084925,0.045131,0.73913,0.782609,0.4
552,0.476744,1.0,0.002681,0.0,0.064973,0.009548,0.004066,0.449275,0.434783,0.28
579,0.418605,1.0,0.002681,0.0,0.017098,0.012563,0.004269,0.478261,0.5,0.32
196,0.651163,1.0,0.021448,0.05102,0.2936,0.021106,0.01911,0.478261,0.26087,0.092


In [9]:
X_test.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
246,0.593023,1.0,0.018767,0.454082,0.102101,0.00603,0.014027,0.492754,0.391304,0.16
92,0.651163,1.0,0.10992,0.19898,0.114802,0.20201,0.170766,0.681159,0.456522,0.12
386,0.395349,1.0,0.030831,0.066327,0.020518,0.007538,0.002236,0.695652,0.717391,0.36
186,0.651163,1.0,0.037534,0.086735,0.335613,0.034673,0.027445,0.73913,0.5,0.156
389,0.790698,1.0,0.002681,0.0,0.019052,0.010553,0.005082,0.521739,0.5,0.28


In [10]:
y_train.head()

271    2
318    1
552    1
579    1
196    1
Name: Dataset, dtype: int64

In [11]:
y_test.head()

246    1
92     1
386    2
186    1
389    1
Name: Dataset, dtype: int64

# 6. Balancing the data

In [12]:
# Balncing the train data using smote method

y_train.value_counts()


1    294
2    114
Name: Dataset, dtype: int64

In [13]:
smote=SMOTE()

X_train_smote,y_train_smote=smote.fit_resample(X_train,y_train)

y_train_smote.value_counts()

2    294
1    294
Name: Dataset, dtype: int64

# 7. Training and Testing the model

In [14]:
# The algorithms used in this model are

#  1.logistic regression
#  2.decision tree model
#  3.Random forest model
#  4.KNN
#  5.Naive Bayes
#  6.svm


# 7.1. Logistic Regression Algorithm 

In [15]:
#initializing the model

lr=LogisticRegression()


In [16]:
# hyper parameter tuning the model

parameters={
          "random_state":[0,1,42]
           }

gslr=GridSearchCV(estimator=lr,param_grid=parameters)

gslr.fit(X_train_smote,y_train_smote)

gslr.best_score_

0.6700130378096479

In [17]:
gslr.best_params_ # finding the best parameters for this model

{'random_state': 0}

In [18]:
lr1=LogisticRegression(random_state=0)

In [19]:
#Training the model

lr1.fit(X_train_smote,y_train_smote)
lr1_train_predict=lr1.predict(X_train_smote)
lr1_train_predict


array([2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1,
       2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1,
       2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2,
       2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1,
       2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2,
       1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1,
       1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1,

In [20]:
# Testing The model

lr1_test_predict=lr1.predict(X_test)
lr1_test_predict


array([1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2,
       2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2,
       1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1,
       1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2],
      dtype=int64)

# 7.2. Decision Tree Classifier

In [21]:
#initializing the model

dt=DecisionTreeClassifier()

In [22]:
# hyper parameter tuning the model

parameters={
           "criterion":["gini", "entropy", "log_loss"],
           "random_state":[0,1,42]
           }

gsdt=GridSearchCV(estimator=dt,param_grid=parameters)

gsdt.fit(X_train_smote,y_train_smote)

gsdt.best_score_

0.7687382297551789

In [23]:
gsdt.best_params_ # finding the best parameters for this model

{'criterion': 'entropy', 'random_state': 0}

In [24]:
dt1=DecisionTreeClassifier(criterion='entropy',random_state=0)

In [25]:
#Training the model

dt1.fit(X_train_smote,y_train_smote)
dt1_train_predict=dt1.predict(X_train_smote)
dt1_train_predict


array([2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1,
       1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2,
       1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2,
       1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2,
       1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2,

In [26]:
# Testing The model

dt1_test_predict=dt1.predict(X_test)
dt1_test_predict


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2,
       1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2,
       1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2,
       1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1,
       1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1,
       1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1],
      dtype=int64)

# 7.3. Random Forest Classifier

In [27]:
#initializing the model

rf=RandomForestClassifier()

In [28]:
# hyper parameter tuning the model

parameters={
           "criterion":["gini", "entropy", "log_loss"],
           "random_state":[0,1,42]
           }

gsrf=GridSearchCV(estimator=rf,param_grid=parameters)

gsrf.fit(X_train_smote,y_train_smote)

gsrf.best_score_

0.8248442706069824

In [29]:
gsrf.best_params_ # finding the best parameters for this model

{'criterion': 'entropy', 'random_state': 42}

In [30]:
rf1=RandomForestClassifier(criterion='entropy',random_state=42)

In [31]:
#Training the model

rf1.fit(X_train_smote,y_train_smote)
rf1_train_predict=rf1.predict(X_train_smote)
rf1_train_predict

array([2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1,
       1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2,
       1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2,
       1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2,
       1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2,

In [32]:
# Testing The model

rf1_test_predict=rf1.predict(X_test)
rf1_test_predict


array([1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1,
       2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1],
      dtype=int64)

# 7.4 K Nearest Neighbors Classifier

In [33]:
#initializing the model

knn=KNeighborsClassifier()

In [34]:
# hyper parameter tuning the model

parameters={
           "n_neighbors":[5,7,9]
           }

gsknn=GridSearchCV(estimator=knn,param_grid=parameters)

gsknn.fit(X_train_smote,y_train_smote)

gsknn.best_score_

0.6888019701579023

In [35]:
gsknn.best_params_ # finding the best parameters for this model

{'n_neighbors': 5}

In [36]:
knn1=KNeighborsClassifier(n_neighbors=5)

In [37]:
#Training the model

knn1.fit(X_train_smote,y_train_smote)
knn1_train_predict=knn1.predict(X_train_smote)
knn1_train_predict

array([2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2,
       2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1,
       2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 2,
       1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1,
       2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
       2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2,
       1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2,
       2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2,
       1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1,
       2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,

In [38]:
# Testing The model

knn1_test_predict=knn1.predict(X_test)
knn1_test_predict

array([1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 2,
       1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2,
       1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1,
       2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1,
       2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2],
      dtype=int64)

# 7.5. Naive Bayes Algorithm

In [39]:
#initializing the model

nb1=GaussianNB()

In [40]:
#Training the model

nb1.fit(X_train_smote,y_train_smote)
nb1_train_predict=nb1.predict(X_train_smote)
nb1_train_predict

array([1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2,
       1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2,
       2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2,
       2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2,
       2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2,
       2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2,
       2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2,
       2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2,
       1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1,
       2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2,

In [41]:
# Testing The model

nb1_test_predict=nb1.predict(X_test)
nb1_test_predict

array([1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2,
       2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2,
       1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2,
       1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2],
      dtype=int64)

# 7.6. Support Vector Machine

In [42]:
#initializing the model

svc=SVC()

In [43]:
# hyper parameter tuning the model

parameters={
           "kernel":['linear','rbf'],
           "random_state":[0,1,42]
           }

gssvc=GridSearchCV(estimator=svc,param_grid=parameters)

gssvc.fit(X_train_smote,y_train_smote)

gssvc.best_score_

0.6479067072287411

In [44]:
gssvc.best_params_ # finding the best parameters for this model

{'kernel': 'rbf', 'random_state': 0}

In [45]:
svc1=SVC(kernel='rbf', random_state= 0)

In [46]:
#Training the model

svc1.fit(X_train_smote,y_train_smote)
svc1_train_predict=svc1.predict(X_train_smote)
svc1_train_predict

array([2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1,
       2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2,
       2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1,
       2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2,
       1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2,
       1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2,
       2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2,
       1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1,
       2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2,

In [47]:
# Testing The model

svc1_test_predict=svc1.predict(X_test)
svc1_test_predict

array([1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2,
       2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2,
       1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2,
       2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1,
       1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2],
      dtype=int64)

# 8. Evaluating the performance of the models

In [48]:
# The models are evaluated using
# 1. accuracy score
# 2. confusion matrix
# 3. classification report#

# 8.1. Accuracy score

In [49]:
# logistic regression Algorithm

print("1. logistic regression Algorithm\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,lr1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,lr1_test_predict))

1. logistic regression Algorithm

On Training data:  0.6887755102040817
On Testing Data:  0.6857142857142857


In [50]:
# Decision Tree Classifier

print("2. Decision Tree Classifier\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,dt1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,dt1_test_predict))

2. Decision Tree Classifier

On Training data:  1.0
On Testing Data:  0.6


In [51]:
# Random Forest Classifier

print("3. Random Forest Classifier\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,rf1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,rf1_test_predict))

3. Random Forest Classifier

On Training data:  1.0
On Testing Data:  0.6971428571428572


In [52]:
# K Nearest Neighbors

print("4. K Nearest Neighbors\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,knn1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,knn1_test_predict))

4. K Nearest Neighbors

On Training data:  0.8248299319727891
On Testing Data:  0.6228571428571429


In [53]:
# Naive Bayes

print("5. Naive Bayes\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,nb1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,nb1_test_predict))

5. Naive Bayes

On Training data:  0.6649659863945578
On Testing Data:  0.5714285714285714


In [54]:
# Support Vector Machine

print("6. Support Vector Machine\n")
# On Training data
print("On Training data: ",accuracy_score(y_train_smote,svc1_train_predict))

# On Testing Data
print("On Testing Data: ",accuracy_score(y_test,svc1_test_predict))

6. Support Vector Machine

On Training data:  0.6904761904761905
On Testing Data:  0.6285714285714286


# 8.2. confusion matrix

In [55]:
# logistic regression Algorithm

print("1. logistic regression Algorithm")
confusion_matrix(y_test,lr1_test_predict)

1. logistic regression Algorithm


array([[78, 44],
       [11, 42]], dtype=int64)

In [56]:
# Decision Tree Classifier

print("2. Decision Tree Classifier")
confusion_matrix(y_test,dt1_test_predict)

2. Decision Tree Classifier


array([[86, 36],
       [34, 19]], dtype=int64)

In [57]:
# Random Forest Classifier

print("3. Random Forest Classifier")
confusion_matrix(y_test,rf1_test_predict)

3. Random Forest Classifier


array([[100,  22],
       [ 31,  22]], dtype=int64)

In [58]:
# K Nearest Neighbors

print("4. K Nearest Neighbors")
confusion_matrix(y_test,knn1_test_predict)

4. K Nearest Neighbors


array([[79, 43],
       [23, 30]], dtype=int64)

In [59]:
# Naive Bayes

print("5. Naive Bayes")
confusion_matrix(y_test,nb1_test_predict)

5. Naive Bayes


array([[47, 75],
       [ 0, 53]], dtype=int64)

In [60]:
# Support Vector Machine

print("6. Support Vector Machine")
confusion_matrix(y_test,svc1_test_predict)

6. Support Vector Machine


array([[61, 61],
       [ 4, 49]], dtype=int64)

# 8.3. Classification Report

In [61]:
# logistic regression Algorithm

print("1. logistic regression Algorithm")
print(classification_report(y_test,lr1_test_predict))

1. logistic regression Algorithm
              precision    recall  f1-score   support

           1       0.88      0.64      0.74       122
           2       0.49      0.79      0.60        53

    accuracy                           0.69       175
   macro avg       0.68      0.72      0.67       175
weighted avg       0.76      0.69      0.70       175



In [62]:
# Decision Tree Classifier

print("2. Decision Tree Classifier")
print(classification_report(y_test,dt1_test_predict))

2. Decision Tree Classifier
              precision    recall  f1-score   support

           1       0.72      0.70      0.71       122
           2       0.35      0.36      0.35        53

    accuracy                           0.60       175
   macro avg       0.53      0.53      0.53       175
weighted avg       0.60      0.60      0.60       175



In [63]:
# Random Forest Classifier

print("3.  Random Forest Classifier")
print(classification_report(y_test,rf1_test_predict))

3.  Random Forest Classifier
              precision    recall  f1-score   support

           1       0.76      0.82      0.79       122
           2       0.50      0.42      0.45        53

    accuracy                           0.70       175
   macro avg       0.63      0.62      0.62       175
weighted avg       0.68      0.70      0.69       175



In [64]:
# K Nearest Neighbors

print("4. K Nearest Neighbors")
print(classification_report(y_test,knn1_test_predict))

4. K Nearest Neighbors
              precision    recall  f1-score   support

           1       0.77      0.65      0.71       122
           2       0.41      0.57      0.48        53

    accuracy                           0.62       175
   macro avg       0.59      0.61      0.59       175
weighted avg       0.66      0.62      0.64       175



In [65]:
# Naive Bayes

print("5. Naive Bayes")
print(classification_report(y_test,nb1_test_predict))

5. Naive Bayes
              precision    recall  f1-score   support

           1       1.00      0.39      0.56       122
           2       0.41      1.00      0.59        53

    accuracy                           0.57       175
   macro avg       0.71      0.69      0.57       175
weighted avg       0.82      0.57      0.57       175



In [66]:
# Support Vector Machine

print("6. Support Vector Machine")
print(classification_report(y_test,svc1_test_predict))

6. Support Vector Machine
              precision    recall  f1-score   support

           1       0.94      0.50      0.65       122
           2       0.45      0.92      0.60        53

    accuracy                           0.63       175
   macro avg       0.69      0.71      0.63       175
weighted avg       0.79      0.63      0.64       175



# 9. Selecting and saving the model

In [67]:
# From the performance of each model "Naive Bayes" gives the best performance
# so this algorithm is used to predict the result for our model

# Saving the model

from joblib import Parallel, delayed
import joblib

# save the model as a pickle in a file
joblib.dump(nb1, 'selected_model1.pkl')

['selected_model1.pkl']