<h1 style="text-align:center">Titanic Dataset Model Testing</h1>



<h2>Importing necessary libraries</h2>

In [1]:
#importing libraries
import numpy
import pandas as pd
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB


from sklearn.model_selection import GridSearchCV

<h2>Importing the testing Titanic dataset</h2>

In [2]:
# Load the Titanic test file; the bare last expression renders a rich
# table preview instead of the plain-text dump produced by print().
dataset = pd.read_csv('test.csv')
dataset.head()

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


<h2>Preprocessing</h2>
This section is about data preprocessing

In [3]:
# Column dtypes and non-null counts, to see which columns have missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [4]:
# Count the missing entries in every column.
dataset.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

<h3>Filling the missing value of Age in dataset:</h3>

In [7]:
# Median age per (Sex, Pclass) group. Selecting 'Age' before aggregating keeps
# the median away from the non-numeric columns (Name, Ticket, ...), which
# newer pandas versions refuse to aggregate.
age_by_pclass_sex = dataset.groupby(['Sex', 'Pclass'])['Age'].median()

for pclass in range(1, 4):
    for sex in ['female', 'male']:
        print('Median age of Pclass {} {}s: {}'.format(pclass, sex, age_by_pclass_sex.loc[(sex, pclass)]))

print('Median age of all passengers: {}'.format(dataset['Age'].median()))

# Fill each missing age with the median of the passenger's own (Sex, Pclass)
# group. transform() returns a Series indexed like `dataset`, so the
# assignment aligns row-for-row (apply() may return a group-keyed index).
dataset['Age'] = dataset.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))

Median age of Pclass 1 females: 41.0
Median age of Pclass 1 males: 42.0
Median age of Pclass 2 females: 24.0
Median age of Pclass 2 males: 28.0
Median age of Pclass 3 females: 22.0
Median age of Pclass 3 males: 24.0
Median age of all passengers: 27.0


In [8]:
dataset['Age'].isnull().sum()


0

<h3>Creating a new column, Alone: whether the passenger was travelling alone</h3>

In [9]:
# Alone is 1 when the passenger has no siblings/spouse and no parents/children
# aboard (SibSp + Parch is not greater than 0), otherwise 0.
dataset['Alone'] = (
    (dataset['SibSp'] + dataset['Parch'])
    .gt(0)
    .map({True: 0, False: 1})
)
dataset.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Alone,Married
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,1
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S,1,0
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,1,0
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S,0,0
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,1,1
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,0,0


<h3>Creating a new column i.e Married or unmarried</h3>

In [10]:
def married(name):
    """Return 1 if the name contains the title 'Master.' or 'Mrs.', else 0.

    NOTE(review): 'Master.' is counted as married here; confirm this matches
    the preprocessing used when the model was trained before changing it.
    """
    married_titles = ('Master.', 'Mrs.')
    return int(any(title in name for title in married_titles))

# Derive the Married flag from each passenger's name.
dataset['Married'] = dataset['Name'].map(married)
dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Alone,Married
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,1


<h3>dropping unnecessary columns</h3>

In [11]:
# Remove the columns that are not fed to the model. A single non-inplace call
# drops them together, so a missing column raises before anything is removed
# and the frame is never left half-modified.
dataset = dataset.drop(
    columns=['Fare', 'SibSp', 'Parch', 'PassengerId', 'Name', 'Cabin', 'Ticket']
)



In [12]:
# Check which columns remain.
dataset.head()


Unnamed: 0,Pclass,Sex,Age,Embarked,Alone,Married
0,3,male,34.5,Q,1,0
1,3,female,47.0,S,0,1
2,2,male,62.0,Q,1,0
3,3,male,27.0,S,1,0
4,3,female,22.0,S,0,1


In [13]:
def embarked(emb):
    """Encode an embarkation port letter as an integer code.

    'S' -> 1, 'C' -> 2, 'Q' -> 3. Any other value, including a missing
    entry, falls back to 1 (the code for 'S').
    """
    for code, port in enumerate(('S', 'C', 'Q'), start=1):
        if emb == port:
            return code
    # Unknown or null port: treated as 'S' (presumably the most common port).
    return 1

# Replace the port letters with their integer codes.
dataset['Embarked'] = dataset['Embarked'].map(embarked)

In [14]:
# Confirm Embarked now holds integer codes.
dataset.head()

Unnamed: 0,Pclass,Sex,Age,Embarked,Alone,Married
0,3,male,34.5,3,1,0
1,3,female,47.0,1,0,1
2,2,male,62.0,3,1,0
3,3,male,27.0,1,1,0
4,3,female,22.0,1,0,1


Encoding Sex as binary (female = 1, male = 0) so the model can consume it


In [15]:
# Binary-encode Sex: 'female' -> 1, every other value -> 0.
dataset['Sex'] = dataset['Sex'].eq('female').astype('int64')

In [16]:
# One-hot encode the integer port codes. Pinning the columns to [1, 2, 3]
# keeps the feature layout fixed even if a port is absent from this file, and
# dtype='uint8' yields 0/1 indicators whatever the installed pandas defaults to.
embarked_df = (
    pd.get_dummies(dataset['Embarked'], dtype='uint8')
    .reindex(columns=[1, 2, 3], fill_value=0)
)
dataset = pd.concat([dataset, embarked_df], axis=1)

# The original coded column is redundant once the indicators exist.
dataset = dataset.drop(columns='Embarked')
dataset.head()

Unnamed: 0,Pclass,Sex,Age,Alone,Married,1,2,3
0,3,0,34.5,1,0,0,0,1
1,3,1,47.0,0,1,1,0,0
2,2,0,62.0,1,0,0,0,1
3,3,0,27.0,1,0,1,0,0
4,3,1,22.0,0,1,1,0,0


Since feature names can't be integers, rename the dummy columns 1, 2, 3 to port_1, port_2, port_3 (1 = S, 2 = C, 3 = Q)

In [17]:
# Give the integer-named dummy columns string names: 1 -> port_1, etc.
dataset.rename(
    columns={code: f"port_{code}" for code in range(1, 4)},
    inplace=True,
)

In [18]:
# Confirm the dummy columns now carry string names.
dataset.head()


Unnamed: 0,Pclass,Sex,Age,Alone,Married,port_1,port_2,port_3
0,3,0,34.5,1,0,0,0,1
1,3,1,47.0,0,1,1,0,0
2,2,0,62.0,1,0,0,0,1
3,3,0,27.0,1,0,1,0,0
4,3,1,22.0,0,1,1,0,0


<h2>importing model</h2>

In [19]:
# Load the pre-trained model (a decision tree, going by the file name).
# The context manager closes the file handle once unpickling is done.
# NOTE: pickle.load can execute arbitrary code; only load model files that
# come from a trusted source.
with open('decision_tree_acc_84.pickle', 'rb') as dtc_pickle:
    dtree_model = pickle.load(dtc_pickle)

In [20]:
# x_test is another name for the same DataFrame object (no copy is made).
x_test = dataset

In [21]:
# Sanity check: (rows, feature columns) about to be fed to the model.
x_test.shape

(418, 8)

In [22]:
# Predict a label for every row of x_test with the loaded model.
predict = dtree_model.predict(x_test)

In [23]:
# Print the full array of predicted labels.
print(predict)

[0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 0 0 1 0 0 1]
