# Assignment – Model Selection

# 1. Load the dataset into python environment

In [1]:
import pandas as pd
import numpy as np

In [2]:
##Load the dataset
titanic_data=pd.read_csv(r"C:\dsa\titanic_dataset  (1).csv")
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#Basic information
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
#shape of dataset
titanic_data.shape

(891, 12)

In [5]:
# Statistical analysis
titanic_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Preprocessing

In [6]:
# 1.Remove the unneccessary columns

titanic_data.PassengerId.nunique()

891

In [15]:
titanic_data.drop(['PassengerId'],axis=1,inplace=True)

In [16]:
# 2.Check for null values
titanic_data.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [17]:
# Imputation

mean_age = titanic_data.Age.mean()
mode_cabin = titanic_data.Cabin.mode()
mode_embarked = titanic_data.Embarked.mode()

print('mean_age: ',mean_age)
print('mode_cabin: ',mode_cabin)
print('mode_embarked: ',mode_embarked)

mean_age:  29.69911764705882
mode_cabin:  0    G6
Name: Cabin, dtype: object
mode_embarked:  0    S
Name: Embarked, dtype: object


In [18]:
titanic_data['Age'] = titanic_data['Age'].fillna(29.69911764705882)
titanic_data['Cabin'] = titanic_data['Cabin'].fillna('G6')
titanic_data['Embarked'] = titanic_data['Embarked'].fillna('S')

titanic_data.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [19]:
# 3.Check for duplicates
titanic_data[titanic_data.duplicated()]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [13]:
# 4.Check for Outliers

def outliers(titanic_data,col,threshold):
    z_score = ((col-col.mean())/col.std())
    outlier_val = (abs(z_score)>threshold)
    outlier_data = titanic_data[outlier_val]
    return outlier_data

In [20]:
titanic_data.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [21]:
age_outlier = outliers(titanic_data,titanic_data.Age,2.5)
fare_outlier = outliers(titanic_data,titanic_data.Fare,2.5)


In [22]:
# Remove outliers

def remove_outliers(data,column,threshold):
    z_scr = (column-column.mean())/column.std()
    non_outliers = (abs(z_scr)<=threshold)
    df_no_outliers = data[non_outliers]
    return df_no_outliers

In [23]:
cleaned_age = remove_outliers(titanic_data,titanic_data.Age,2.5)
data = remove_outliers(cleaned_age,titanic_data.Fare,2.5)

  df_no_outliers = data[non_outliers]


In [24]:
titanic_data.shape

(891, 11)

In [25]:
# 5.Encoding

titanic_data = pd.get_dummies(titanic_data)

# Creating feature and target variables

In [27]:
X = titanic_data.drop(['Survived'],axis=1)
y = titanic_data['Survived']

print(X.shape)
print(y.shape)

(891, 1729)
(891,)


In [32]:
#Scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = scaler.fit_transform(X)

# Creating Training and Testing dataset

In [33]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state=42)

# Building the models

In [38]:

# 1. K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

model_1 = KNeighborsClassifier()
knn_model = model_1.fit(X_train,y_train)
knn_pred = knn_model.predict(X_test)

In [39]:
# 2. Support Vector Machine
from sklearn.svm import SVC

model_2 = SVC()
sv_classifier = model_2.fit(X_train,y_train)
svc_pred = model_2.predict(X_test)

# Model Evaluation

In [40]:
from sklearn.metrics import accuracy_score,classification_report

knn_accuracy = accuracy_score(y_test,knn_pred)
knn_report = classification_report(y_test,knn_pred)
print('KNN Accuracy :\n',knn_accuracy)
print('KNN Classification Report :\n',knn_report)

svc_accuracy = accuracy_score(y_test,svc_pred)
svc_report = classification_report(y_test,svc_pred)
print('SVM Accuracy :\n',svc_accuracy)
print('SVM Classification Report :\n',svc_report)

KNN Accuracy :
 0.4304932735426009
KNN Classification Report :
               precision    recall  f1-score   support

           0       0.60      0.16      0.25       134
           1       0.40      0.84      0.54        89

    accuracy                           0.43       223
   macro avg       0.50      0.50      0.40       223
weighted avg       0.52      0.43      0.37       223

SVM Accuracy :
 0.6322869955156951
SVM Classification Report :
               precision    recall  f1-score   support

           0       0.62      1.00      0.77       134
           1       1.00      0.08      0.15        89

    accuracy                           0.63       223
   macro avg       0.81      0.54      0.46       223
weighted avg       0.77      0.63      0.52       223



# Cross Validation

In [44]:
# 1.KFold

from sklearn.model_selection import KFold,StratifiedKFold,cross_val_score

kfold = KFold(n_splits=5,shuffle=True,random_state=0)

knn_score = cross_val_score(knn_model,X,y,cv=kfold)
svc_score = cross_val_score(sv_classifier,X,y,cv=kfold)
mean_knn_scr = knn_score.mean()
mean_svc_scr = svc_score.mean()

print('KNN Mean Cross Validation Score :',mean_knn_scr)
print('SVM Mean Cross Validation Score :',mean_svc_scr)

KNN Mean Cross Validation Score : 0.4209089197162764
SVM Mean Cross Validation Score : 0.6442156801205198


In [45]:
# 2.Stratified KFold

skfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=0)

knn_strat_score = cross_val_score(knn_model,X,y,cv=skfold)
svc_strat_score = cross_val_score(sv_classifier,X,y,cv=skfold)
avg_knn_scr = knn_strat_score.mean()
avg_svc_scr = svc_strat_score.mean()

print('KNN Mean Cross Validation Score :',avg_knn_scr)
print('SVM Mean Cross Validation Score :',avg_svc_scr)

KNN Mean Cross Validation Score : 0.41303119703722296
SVM Mean Cross Validation Score : 0.6487226162827192


In [None]:
##Inference : Using Holdout,KFold and Stratified KFold techniques,the accuracy scores of Support Vector Machine is higher than that of K-Nearest Neighbors.So,its clear that the Support Vector Machine has a better performance than the K-Nearest Neighbors.