# Classification Algorithms and Model Evaluation

In this notebook, we will cover:

* Logistic Regression
* Confusion Matrix
* Precision, Recall, Accuracy, F1 Score
* ROC AUC Curve
* Deciding Binary Classifier threshold
* KNN

Importing all necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

Setting global seed of notebook

### Task 1: Load Data from 'titanic_clean.csv'

In [2]:
#write code here
data = pd.read_csv('titanic_clean.csv')
df = data.copy()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title,GrpSize,FareCat,AgeCat
0,1,0,3,male,1,0,S,Mr,couple,0-10,16-32
1,2,1,1,female,1,0,C,Mrs,couple,70-100,32-48
2,3,1,3,female,0,0,S,Miss,solo,0-10,16-32
3,4,1,1,female,1,0,S,Mrs,couple,40-70,32-48
4,138,0,1,male,1,0,S,Mr,couple,40-70,32-48


### One Hot encoding for categorical varaibles

In [3]:
df_OneHot=pd.get_dummies(df,columns=['Pclass','Sex','Embarked','Title','GrpSize','FareCat','AgeCat'])
df_OneHot.head()

Unnamed: 0,PassengerId,Survived,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,...,FareCat_100+,FareCat_25-40,FareCat_40-70,FareCat_70-100,FareCat_Oct-25,AgeCat_0-16,AgeCat_16-32,AgeCat_32-48,AgeCat_48-64,AgeCat_64+
0,1,0,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,2,1,1,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,1,0,0
2,3,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,4,1,1,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
4,138,0,1,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0


In [4]:
df_OneHot.shape

(891, 32)

In [5]:
df=df_OneHot.copy()

### Task 2: Create Independent and Dependent Variables

In [6]:
#write code here
X = df.drop(['PassengerId','Survived'], axis=1)
Y = df['Survived']

### Task 3: Train Test n Split the data

In [7]:
# Import the library
from sklearn.model_selection import train_test_split

In [8]:
#Write the code here
xtrain, xtest, ytrain, ytest = train_test_split(X,Y, test_size=0.30, random_state=100)
print(xtrain.shape, ytrain.shape)
print(xtest.shape, ytest.shape)

(623, 30) (623,)
(268, 30) (268,)


In [9]:
xtrain.head()

Unnamed: 0,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,...,FareCat_100+,FareCat_25-40,FareCat_40-70,FareCat_70-100,FareCat_Oct-25,AgeCat_0-16,AgeCat_16-32,AgeCat_32-48,AgeCat_48-64,AgeCat_64+
69,1,0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
85,4,1,0,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
794,0,0,0,1,0,1,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
161,1,0,1,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,1,0,0
815,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [10]:
ytrain.head()

69     0
85     0
794    1
161    0
815    0
Name: Survived, dtype: int64

# 1. Logistic Regression

### Creating Model & Training

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
lr_model = LogisticRegression(random_state=100)

In [13]:
lr_model.fit(xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=100, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Evaluation 

In [14]:
pred = lr_model.predict(xtest)

In [15]:
pred[0:9]

array([0, 0, 1, 0, 0, 1, 1, 0, 0], dtype=int64)

Predicting prabability of **0** and **1**

In [16]:
pred_prb = lr_model.predict_proba(xtest)

In [17]:
pred_prb[0:9,0:9]

array([[0.89649692, 0.10350308],
       [0.8452458 , 0.1547542 ],
       [0.478133  , 0.521867  ],
       [0.81454842, 0.18545158],
       [0.57841495, 0.42158505],
       [0.46508821, 0.53491179],
       [0.03565726, 0.96434274],
       [0.52851382, 0.47148618],
       [0.86700973, 0.13299027]])

First value in Numpy array is probability of **0** and second is probability of **1**

Only predicting and extracting probability values of **1**

In [18]:
lr_pred_prb = lr_model.predict_proba(xtest)[:,1]

In [19]:
lr_pred_prb

array([0.10350308, 0.1547542 , 0.521867  , 0.18545158, 0.42158505,
       0.53491179, 0.96434274, 0.47148618, 0.13299027, 0.1214114 ,
       0.10350308, 0.10350308, 0.1934902 , 0.13010479, 0.19583434,
       0.18545158, 0.95211531, 0.11793283, 0.94657764, 0.03411047,
       0.10350308, 0.65302095, 0.97139129, 0.43786706, 0.10350308,
       0.16094368, 0.36711735, 0.41788508, 0.35738721, 0.88037761,
       0.73938717, 0.16939221, 0.96838607, 0.78285758, 0.92088489,
       0.32377819, 0.15625644, 0.18545158, 0.81205472, 0.15625644,
       0.3101409 , 0.50698397, 0.65302095, 0.18545158, 0.10350308,
       0.89067359, 0.10350308, 0.96574897, 0.5623926 , 0.6294708 ,
       0.56975266, 0.71296888, 0.80570548, 0.12625178, 0.26277429,
       0.88008365, 0.78774703, 0.14087049, 0.18545158, 0.10259746,
       0.18545158, 0.73148121, 0.76201091, 0.13010479, 0.10350308,
       0.00909933, 0.95023936, 0.01429503, 0.73261385, 0.10350308,
       0.064324  , 0.64642098, 0.10350308, 0.92891781, 0.73154

### Comparison of Predicted and Actual

In [20]:
xtest.head()

Unnamed: 0,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,...,FareCat_100+,FareCat_25-40,FareCat_40-70,FareCat_70-100,FareCat_Oct-25,AgeCat_0-16,AgeCat_16-32,AgeCat_32-48,AgeCat_48-64,AgeCat_64+
205,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
44,4,2,0,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
821,0,0,1,0,0,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0
458,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,1,0,1,0,0,0
795,1,0,1,0,0,0,1,0,0,1,...,0,0,1,0,0,0,1,0,0,0


In [21]:
xt = xtest.copy()
xt['pred'] = pred
xt['pred_probability'] = lr_pred_prb
xt['actual'] = ytest
xt.head(20)

Unnamed: 0,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,...,FareCat_70-100,FareCat_Oct-25,AgeCat_0-16,AgeCat_16-32,AgeCat_32-48,AgeCat_48-64,AgeCat_64+,pred,pred_probability,actual
205,0,0,0,0,1,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0.103503,0
44,4,2,0,0,1,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0.154754,0
821,0,0,1,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,1,0.521867,0
458,0,0,0,1,0,0,1,0,0,1,...,0,1,0,1,0,0,0,0,0.185452,0
795,1,0,1,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0.421585,0
118,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,1,0.534912,0
424,1,0,1,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0.964343,1
678,1,1,1,0,0,0,1,1,0,0,...,1,0,0,0,0,1,0,0,0.471486,1
269,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0.13299,0
229,0,0,0,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0.121411,0


### Confusion Matrix Play ground

In [22]:
from sklearn.metrics import confusion_matrix

In [23]:
confusion_matrix(ytest, pred)

array([[144,  28],
       [ 14,  82]], dtype=int64)

In [24]:
confusion_matrix(ytest, pred).ravel()

array([144,  28,  14,  82], dtype=int64)

In [25]:
tn, fp, fn, tp = confusion_matrix(ytest, pred).ravel()
conf_matrix=pd.DataFrame({"pred_Survived":[tp,fp],"pred_Not Survived":[fn,tn]},index=["Survived","Not Survived"])
conf_matrix

Unnamed: 0,pred_Survived,pred_Not Survived
Survived,82,14
Not Survived,28,144


### Accuracy

In [26]:
ytrain.value_counts()

0    377
1    246
Name: Survived, dtype: int64

In [27]:
243/(380+243)

0.3900481540930979

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
accuracy = (tp + tn) / (tp + fp + tn + fn)
print("Accuracy: {}".format(accuracy))

Accuracy: 0.8432835820895522


In [30]:
accuracy_lr = accuracy_score(ytest,pred)
print("Accuracy by built-in function: {}".format(accuracy_lr))

Accuracy by built-in function: 0.8432835820895522


### Precision

In [31]:
from sklearn.metrics import precision_score

In [32]:
precision_1 = tp / (tp + fp)
print("Precision for 1: {}".format(precision_1))
precision_0 = tn / (tn + fn)
print("Precision for 0: {}".format(precision_0))

Precision for 1: 0.7454545454545455
Precision for 0: 0.9113924050632911


In [33]:
precision_lr = precision_score(ytest,pred)
print("Precision by built-in function: {}".format(precision_lr))

Precision by built-in function: 0.7454545454545455


### Recall

In [34]:
from sklearn.metrics import recall_score

In [35]:
recall_1 = tp / (tp + fn)
print("Recall for 1: {}".format(recall_1))
recall_0 = tn / (tn + fp)
print("Recall for 0: {}".format(recall_0))

Recall for 1: 0.8541666666666666
Recall for 0: 0.8372093023255814


In [36]:
recall_lr = recall_score(ytest,pred)
print("Recall by built-in function: {}".format(recall_lr))

Recall by built-in function: 0.8541666666666666


### F1 Score

In [37]:
from sklearn.metrics import f1_score

In [38]:
f1_1 = (2 * precision_1 * recall_1) / (precision_1 + recall_1)
print("F1 Score for 1: {}".format(f1_1))
f1_0 = (2 * precision_0 * recall_0) / (precision_0 + recall_0)
print("F1 Score for 0: {}".format(f1_0))

F1 Score for 1: 0.7961165048543689
F1 Score for 0: 0.8727272727272727


In [39]:
f1_lr=f1_score(ytest,pred)
print("F1 Score by built-in function: {}".format(f1_lr))

F1 Score by built-in function: 0.7961165048543689


### Class Distribution in Training Data

In [40]:
ytrain.value_counts()

0    377
1    246
Name: Survived, dtype: int64

### Classification Report

In [41]:
from sklearn.metrics import classification_report

In [42]:
#get report of precision recall and f1 score
print(classification_report(ytest,pred))
#macro avg 

              precision    recall  f1-score   support

           0       0.91      0.84      0.87       172
           1       0.75      0.85      0.80        96

    accuracy                           0.84       268
   macro avg       0.83      0.85      0.83       268
weighted avg       0.85      0.84      0.85       268



In [43]:
import helper_confusion_matrix as helper
helper.conf_matrix(ytest,lr_pred_prb, threshold=0.4)

              pred_Survived  pred_Not Survived
Survived                 84                 12
Not Survived             39                133
Accuracy: 80.970
Precision : 68.293
Recall: 87.500
F1 Score : 76.712
TPR: 87.500  FPR: 22.674


### ROC AUC Curve

_Receiver Operating Characteristic_ & _Area Under Curve_

In [None]:
tpr = recall_lr
fpr = fp / (fp + tn)

In [None]:
tpr, fpr

In [None]:
fpr = 1 - recall_0
tpr, fpr

Defining function to plot ROC AUC Curve

In [None]:
from sklearn.metrics import auc,roc_curve,roc_auc_score

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    plt.figure(figsize=(8,6))
    plt.title('ROC Curve')
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.005, 1, 0, 1.005])
    plt.xticks(np.arange(0,1, 0.05), rotation=90)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc='best')

Getting TPR, FPR values for each threshold on ROC AUC Curve

In [None]:
fpr,tpr,threshold=roc_curve(ytest,lr_pred_prb)

Calculating AUC score from ytest and predicted probabilities

In [None]:
auc_lr=roc_auc_score(ytest,lr_pred_prb)
auc_lr

Plotting AUC ROC Curve

In [None]:
sns.set_context('poster')
plot_roc_curve(fpr,tpr,label='AUC = %0.3f'% auc_lr)

## Model Complexity

In [None]:
from sklearn.preprocessing import PolynomialFeatures
acc_train=[]
acc_test=[]

for i in range(1,6):
    poly_reg = PolynomialFeatures(degree=i)
    
    X_tr_poly,X_tst_poly= poly_reg.fit_transform(xtrain),poly_reg.fit_transform(xtest)
    
    lr_poly = LogisticRegression(random_state=25)
    lr_poly.fit(X_tr_poly, ytrain)
   
    y_tr_predicted,y_tst_predict = lr_poly.predict(X_tr_poly),lr_poly.predict(X_tst_poly)
   
    acc_train.append(accuracy_score(ytrain, y_tr_predicted))
    acc_test.append(accuracy_score(ytest, y_tst_predict))
    

In [None]:
plt.figure(figsize=(18,5))
sns.set_context('poster')

sns.lineplot(x=list(range(1,6)), y=acc_train, label='Training')

sns.lineplot(x=list(range(1,6)), y=acc_test, label='Testing')

# 2. K Nearest Neighbors (KNN)

### Task 4: Create Independent and Dependent Variables

In [None]:
#write code here
X = df.drop(['PassengerId','Survived'], axis=1)
Y = df['Survived']

### Task 5: Train test and split the dataset

In [None]:
#write code here
xtrain, xtest, ytrain, ytest = train_test_split(X,Y, test_size=0.3, random_state=25, shuffle=True)
print(xtrain.shape, ytrain.shape)
print(xtest.shape, ytest.shape)

For KNN, we need to stadardize data first

In [None]:
from sklearn.preprocessing import StandardScaler 

In [None]:
scaler = StandardScaler()  
scaler.fit(xtrain)
X_train_=scaler.transform(xtrain)
X_test_=scaler.transform(xtest)
X_train=pd.DataFrame(data=X_train_, columns=xtrain.columns)
X_test=pd.DataFrame(data=X_test_, columns=xtest.columns)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
clf_knn = KNeighborsClassifier(n_neighbors=10)

In [None]:
clf_knn.fit(X_train,ytrain)

In [None]:
knn_pred=clf_knn.predict(X_test)
knn_pred_prb=clf_knn.predict_proba(X_test)[:,1]

In [None]:
accuracy_train=accuracy_score(ytrain,clf_knn.predict(X_train))
print("Accuracy Train: {}".format(accuracy_train))

In [None]:
accuracy_knn = accuracy_score(ytest,knn_pred)
print("Accuracy : {}".format(accuracy_knn))

In [None]:
print(classification_report(ytest,knn_pred))

In [None]:
fpr,tpr,threshold=roc_curve(ytest,knn_pred_prb)

In [None]:
auc_knn=roc_auc_score(ytest,knn_pred_prb)
auc_knn

In [None]:
sns.set_context('poster')
plot_roc_curve(fpr,tpr,label='AUC = %0.3f'% auc_knn)

### Model Complexity

In [None]:
accuracy_train=[]
accuracy_test=[]
for i in range(1,26):
    cnn_model = KNeighborsClassifier(n_neighbors=i)
    cnn_model.fit(X_train,ytrain)
   
    knn_pred=cnn_model.predict(X_test)
    knn_pred_prb=cnn_model.predict_proba(X_test)[:,1]
   
    accuracy_train.append(accuracy_score(ytrain,cnn_model.predict(X_train)))
    accuracy_test.append(accuracy_score(ytest,knn_pred))



In [None]:
plt.figure(figsize=(18,5))
sns.set_context('poster')

sns.lineplot(x=list(range(1,26)), y=accuracy_train, label='Training')

sns.lineplot(x=list(range(1,26)), y=accuracy_test, label='Testing')
plt.xlabel('Number of Neighbours')
plt.ylabel('Accuracy Score')

# 3. Submission on Kaggle

In [None]:
list(zip(accuracy_train, accuracy_test))

### Task 6: Import test data

In [None]:
#write code here
test = pd.read_csv('competition_clean.csv')
df_test = test.copy()
df_test.head()

### Task 7: Do One Hot encoding of test data

In [None]:
#write code here
df_OneHot= pd.get_dummies(df_test, columns=['Pclass','Sex','Embarked','Title','GrpSize','FareCat','AgeCat'])
df_OneHot.head()
df_test=df_OneHot.copy()

### Task 8: Separate Passenger ID for submission

In [None]:
#Write code here
PassengerID=df_test['PassengerId']
df_test.drop(['PassengerId'], axis=1, inplace=True)

In [None]:
df_test.shape

### Task 9: Do prediction through final model

In [None]:
#write code here
pred_final=clf_knn.predict(df_test)

#### Creating Data Frame for submission

In [None]:
submission=pd.DataFrame({'PassengerId':PassengerID,'Survived':pred_final})

In [None]:
submission.head()

### Task 10: Export the dataset into csv file

In [None]:
# Write code here
submission.to_csv('my_submission v1.0.csv', index=False)