In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the Titanic training set from the working directory and preview it.
titanic_csv = 'titanic_train.csv'
df = pd.read_csv(titanic_csv)
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# (n_rows, n_cols) of the freshly loaded frame.
df.shape
# rows = 891, cols = 12

(891, 12)

#### Handling null values

In [7]:
# Count missing values per column to decide how each one is handled.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [10]:
# First ten ages; row 5 is one of the missing entries.
df['Age'].iloc[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [13]:
# Compare mean and median of Age (NaNs are skipped) before choosing an imputation value.
print(df['Age'].mean(), df['Age'].median(), sep='\n')

29.69911764705882
28.0


In [14]:
# Fill missing ages with the median age.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call triggers a warning in recent
# pandas and, with Copy-on-Write enabled, leaves `df` unchanged.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [15]:
# Re-check missing values: Age should now report 0.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [16]:
# Frequency of each embarkation port; missing entries are excluded from the counts.
df['Embarked'].value_counts(dropna=True)

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [17]:
# Fill missing ports with the most frequent value.
# mode() returns every value tied for most frequent; .max() only matters as a
# tie-breaker and picks the alphabetically last one.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call triggers a warning in recent
# pandas and, with Copy-on-Write enabled, leaves `df` unchanged.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode().max())

In [19]:
# Shape check after imputation: filling values must not add or remove rows/columns.
df.shape

(891, 12)

In [20]:
# Fraction of rows with no Cabin value, computed from the data rather than
# from hand-copied counts, so it stays correct if the input file changes.
# The mean of the boolean mask is (missing rows) / (total rows).
float(df['Cabin'].isna().mean())

0.7710437710437711

In [18]:
# Re-check missing values: only Cabin should still have gaps.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [21]:
# Cabin is missing for most rows, so drop the column rather than impute it,
# then confirm that no missing values remain.
df.drop(columns=['Cabin'], inplace=True)
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [22]:
# Preview the frame after removing Cabin.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


#### Columns that are not significant for prediction
1) PassengerId<br>
2) Name<br>
3) Ticket<br>

In [23]:
# Remove the columns listed above as not significant, then show what is left.
insignificant_cols = ['PassengerId', 'Name', 'Ticket']
df.drop(columns=insignificant_cols, inplace=True)
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [24]:
# Preview the reduced feature set.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [25]:
# Inspect dtypes to spot non-numeric columns that must be encoded before modelling.
df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [27]:
from sklearn.preprocessing import LabelEncoder

In [29]:
# Encoder that maps each distinct label to an integer code.
# NOTE: this single instance is re-fit for every column encoded below, so
# afterwards it only holds the classes of the last column it was fit on.
lb = LabelEncoder()

In [30]:
# Replace the two text columns with integer codes. The shared encoder is
# re-fit on each column in turn, so it ends up fitted on 'Embarked'.
for column in ('Sex', 'Embarked'):
    df[column] = lb.fit_transform(df[column])

In [31]:
# Confirm that every column is numeric after label encoding.
df.dtypes

Survived      int64
Pclass        int64
Sex           int32
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked      int32
dtype: object

In [32]:
# Separate the target (Survived) from the feature matrix and report both shapes.
y = df['Survived']
x = df.drop(columns='Survived')
print(x.shape, y.shape, sep='\n')

(891, 7)
(891,)


#### Split the data into training and test sets

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
# Hold out 25% of the rows for testing.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each Restart & Run All; without it the results change per run.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(668, 7)
(223, 7)
(668,)
(223,)


In [37]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [43]:
def gen_model(model,x_train,x_test,y_train,y_test):
    """Fit `model` on the training split and print an evaluation report.

    Printed, in order: the model's score on the training data, its score on
    the test data, the raw test predictions, the confusion matrix, the
    accuracy score and the classification report (the last four are all
    computed on the test split).

    Parameters
    ----------
    model : estimator exposing `fit`, `score` and `predict`.
    x_train, y_train : features and labels used for fitting.
    x_test, y_test : held-out features and labels used for evaluation.

    Returns
    -------
    None. `model` is fitted in place; results are only printed.
    """
    model.fit(x_train, y_train)

    # Report the score on each split, training first.
    for label, features, target in (('Training Score', x_train, y_train),
                                    ('Testing Score', x_test, y_test)):
        print(label, model.score(features, target))

    predictions = model.predict(x_test)
    print(predictions)

    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

    # NOTE(review): for scikit-learn classifiers `score` is presumably mean
    # accuracy, so this line likely repeats the 'Testing Score' value - confirm.
    print('Accuracy Score', accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [45]:
# Candidate classifiers to compare, one entry per model family.
# The tree-based models are randomised, so random_state is fixed to make their
# scores reproducible across runs.
cls_models = [
    LogisticRegression(max_iter=1000),
    KNeighborsClassifier(n_neighbors=15),
    DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_split=15,
                           random_state=42),
    RandomForestClassifier(n_estimators=90, criterion='entropy', max_depth=10,
                           min_samples_split=15, random_state=42),
    SVC(kernel='linear', C=1),
]

In [46]:
# Fit and report every candidate on the same split, separated by a rule of asterisks.
separator = '*' * 70
for clf in cls_models:
    gen_model(clf, x_train, x_test, y_train, y_test)
    print(separator)

Training Score 0.8053892215568862
Testing Score 0.7443946188340808
[0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 1 1 1 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 1 0
 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 1 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 1
 1 1 1 0 0 1 0 0 1 1 0 1 0 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0
 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0
 1]
[[115  36]
 [ 21  51]]
Accuracy Score 0.7443946188340808
              precision    recall  f1-score   support

           0       0.85      0.76      0.80       151
           1       0.59      0.71      0.64        72

    accuracy                           0.74       223
   macro avg       0.72      0.73      0.72       223
weighted avg       0.76      0.74      0.75       223

**********************************************************************
Training Score 0.7260479