In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Preprocessing Data

In [5]:
mode_embarked = df['Embarked'].mode()[0]
mode_embarked

'S'

In [6]:
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

In [9]:
median_age = df['Age'].median()
median_age

28.0

In [10]:
df['Age'] = df['Age'].fillna(median_age)

In [12]:
mean_fare = df['Fare'].median()
mean_fare

14.4542

In [14]:
df.drop('Cabin', axis=1, inplace=True)
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

# Convert Data to Numberic

In [17]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [19]:
embarked = df[['Embarked']]
dummies = pd.get_dummies(embarked)

In [20]:
dummies.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [22]:
df = pd.merge(df.reset_index(), dummies.reset_index())

In [23]:
df = df.drop(['index', 'Embarked'], axis=1)

In [24]:
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, 
                 "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,"Countess": 3,
                 "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3, "Mme": 3,"Capt": 3,"Sir": 3 }

df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand = False)
df['Title'] = df['Title'].map(title_mapping)
df.head()

Unnamed: 0,level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_C,Embarked_Q,Embarked_S,Title
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,0,1,0
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,1,0,0,2
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,0,1,1
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,0,0,1,2
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,0,1,0


In [32]:
sex_mapping = {'male': 0, 'female': 1}

df['Sex'] = df['Sex'].map(sex_mapping)

In [33]:
x = df.drop(["Survived", "PassengerId", 'Name', "Ticket", 'level_0'], axis=1)
y = df['Survived']


In [34]:
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Title
0,3,0,22.0,1,0,7.25,0,0,1,0
1,1,1,38.0,1,0,71.2833,1,0,0,2
2,3,1,26.0,0,0,7.925,0,0,1,1
3,1,1,35.0,1,0,53.1,0,0,1,2
4,3,0,35.0,0,0,8.05,0,0,1,0


In [38]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)


In [39]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Title
140,3,1,28.0,0,2,15.2458,1,0,0,2
439,2,0,31.0,0,0,10.5,0,0,1,0
817,2,0,31.0,1,1,37.0042,1,0,0,0
378,3,0,20.0,0,0,4.0125,1,0,0,0
491,3,0,21.0,0,0,7.25,0,0,1,0


# Model

In [40]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}
gnb = GaussianNB()
grd = GridSearchCV(gnb, param_grid, cv=5)
grd.fit(x_train, y_train)

print(grd.best_params_)
print(grd.best_score_)

{'var_smoothing': 1e-09}
0.8034078597458878


In [41]:
from sklearn.metrics import accuracy_score

gnb = GaussianNB(var_smoothing= 1e-09)
gnb.fit(x_train, y_train)
gnb_pred = gnb.predict(x_test)

print('Accuracy Score: ', round(accuracy_score(y_test, gnb_pred)* 100, 2), '%')

Accuracy Score:  78.21 %


In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}

clf = DecisionTreeClassifier()
grd = GridSearchCV(clf, param_grid, cv=5)
grd.fit(x_train, y_train)

print(grd.best_params_)
print(grd.best_score_)



{'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10}
0.8342755835713582


In [43]:
clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                             max_features='log2', min_samples_leaf=1, min_samples_split=10)
clf.fit(x_train, y_train)
clf_pred = clf.predict(x_test)

print('Accuracy Score: ', round(accuracy_score(y_test, clf_pred)* 100, 2), '%')


Accuracy Score:  77.65 %


In [44]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}

rnf = RandomForestClassifier()
grd = GridSearchCV(rnf, param_grid, cv=5)
grd.fit(x_train, y_train)

print(grd.best_params_)
print(grd.best_score_)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


{'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}
0.8412390426474934


In [45]:
rnf = RandomForestClassifier(class_weight=None, criterion='entropy', max_depth=None,
                             max_features='auto', min_samples_leaf=4, min_samples_split=5, n_estimators=100)
rnf.fit(x_train,y_train)

clf_pred = clf.predict(x_test)
print('Accuracy Score: ', round(accuracy_score(y_test, clf_pred)* 100, 2), '%')

  warn(


Accuracy Score:  77.65 %


In [46]:
from sklearn import svm

param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

svc = svm.SVC()
grd = GridSearchCV(svc, param_grid, cv=5)
grd.fit(x_train, y_train)

print(grd.best_params_)
print(grd.best_score_)