# Kaggle - Titanic

* Target : predict Survived

## Import package

In [None]:
# for data cleaning
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
# for model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

## Load the data

### Read csv

In [None]:
# Load the Kaggle Titanic splits. The /content/ paths are the upload
# location this notebook reads from; adjust them when running elsewhere.
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

In [None]:
# Display the raw training frame as the cell's rich output.
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


* **survival** : Survival
* **pclass** : Ticket class(1 = 1st, 2 = 2nd, 3 = 3rd)
* **sex** : Sex
* **Age** : Age in years
* **sibsp** : Number of siblings / spouses aboard the Titanic
* **parch** : Number of parents / children aboard the Titanic
* **ticket** : Ticket number
* **fare** : Passenger fare
* **cabin** : Cabin number
* **embarked** : Port of Embarkation(C = Cherbourg, Q = Queenstown, S = Southampton)

## Exploratory Data Analysis

### Describing numerical features by summary

In [None]:
# Print column dtypes and non-null counts to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Describing categorical features by frequency



In [None]:
# Frequency of each Sex value; missing values are excluded (dropna=True).
print(train['Sex'].value_counts(dropna=True))

male      577
female    314
Name: Sex, dtype: int64


In [None]:
# Frequency of each embarkation port; missing values are excluded (dropna=True).
print(train['Embarked'].value_counts(dropna=True))

S    644
C    168
Q     77
Name: Embarked, dtype: int64


### Describing categorical features by pivoting against survival

In [None]:
# Mean of Survived (i.e. survival rate) per ticket class, best first.
(
    train[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [None]:
# Mean of Survived (i.e. survival rate) per sex, best first.
(
    train[["Sex", "Survived"]]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [None]:
# Mean of Survived (i.e. survival rate) per sibling/spouse count, best first.
(
    train[["SibSp", "Survived"]]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [None]:
# Mean of Survived (i.e. survival rate) per parent/child count, best first.
(
    train[["Parch", "Survived"]]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


### Data exploration by visualizing data

In [None]:
# Overall survival: share of passengers per Survived value (0 / 1).
survival_share = train["Survived"].value_counts(normalize=True)
deceased = survival_share[0]
survived = survival_share[1]

x0 = ['not survived', 'survived']
y0 = [deceased, survived]

layout = go.Layout(
    title='Distribution of survival',
    autosize=False,
    width=300,
    height=400,
)
data = [go.Bar(x=x0, y=y0)]
fig = go.Figure(data=data, layout=layout)
fig.show()

* About 61.6% of passengers did not survive.

In [None]:
# Survival by Sex: passenger counts per sex, coloured by Survived.
fig1 = px.histogram(
    train,
    x="Sex",
    color='Survived',
    title='Survival by Sex',
    text_auto=True,
    width=300,
)
fig1.show()

* Females had a higher survival rate than males.

In [None]:
# Survival by Age: age histogram, coloured by Survived.
fig2 = px.histogram(
    train,
    x='Age',
    color="Survived",
    title='Survival by Age',
    text_auto=True,
    width=1000,
)
fig2.show()

* Most passengers are in 15-35 age range.
* Infants (Age <=6) had high survival rate.
* The oldest passenger (Age = 80) survived.

In [None]:
# Survival by ticket class. Pclass is cast to str before plotting.
pclass_labels = train["Pclass"].astype('str')
fig3 = px.histogram(
    train,
    x=pclass_labels,
    color='Survived',
    title='Survival by Class',
    text_auto=True,
    width=300,
)
fig3.show()

* Passengers in the upper class were more likely to have survived.

In [None]:
# Survival by Fare: one horizontal box per outcome.
n_surv = train[train['Survived'] == 0]
surv = train[train['Survived'] == 1]

trace0 = go.Box(name="not survived", x=n_surv['Fare'])
trace1 = go.Box(name="survived", x=surv['Fare'])
fare_by_survival_data = [trace0, trace1]

layout = go.Layout(
    title="Survival by Fare",
    xaxis=dict(title='Fare'),
    width=800,
    height=400,
)
fig4 = go.Figure(data=fare_by_survival_data, layout=layout)
fig4.show()

* Passengers who paid higher fares were more likely to have survived.

In [None]:
# Survival by embarkation port: counts per port, coloured by Survived.
fig5 = px.histogram(
    train,
    x=train['Embarked'],
    color='Survived',
    title='Survival by Embarked',
    width=350,
)
fig5.show()

* Most passengers embarked at Southampton, but they had the lowest survival rate (about 34%); passengers who embarked at Cherbourg had the highest (about 55%).

In [None]:
# Survival by siblings/spouses aboard: split passengers by outcome first.
n_surv_sib = train[train['Survived'] == 0]
surv_sib = train[train['Survived'] == 1]

In [None]:
# SibSp histograms for survivors, non-survivors and all passengers.
# Each trace is placed in its own subplot via row/col: without them every
# trace lands on the first subplot and the remaining panel stays empty.
sibsp_panels = [
    ('Survived', surv_sib['SibSp']),
    ('Not Survived', n_surv_sib['SibSp']),
    ('All', train['SibSp']),
]

fig6 = make_subplots(rows=1, cols=len(sibsp_panels))

for col_idx, (label, sibsp_values) in enumerate(sibsp_panels, start=1):
    fig6.add_trace(go.Histogram(x=sibsp_values, name=label),
                   row=1, col=col_idx)

fig6.update_layout(title_text='Survival by #Siblings Aboard',
                   legend=dict(orientation='h'))

fig6.show()

* Passengers with one or two siblings/spouses aboard were the most likely to survive; survival rates fell sharply for those with three or more.

In [None]:
# Survival of children vs adults
def is_child(age, threshold=14):
    """Flag whether an age counts as a child.

    The default cutoff of 14 comes from the survival-by-age figure, which
    shows a gap below that age.

    Parameters
    ----------
    age : float
        Age in years; may be NaN when the age is unknown.
    threshold : float, default 14
        Ages strictly below this value are treated as children.

    Returns
    -------
    float
        1.0 if ``age < threshold``, 0.0 if ``age >= threshold``, and NaN
        when neither comparison holds (i.e. ``age`` is NaN).
    """
    if age < threshold:
        return 1.0
    if age >= threshold:
        return 0.0
    # Both comparisons are False only for NaN: keep "unknown" as NaN.
    return float('nan')
# Apply is_child to every value of 'Age' and store the result in a new
# 'Child' column (1.0 = child, 0.0 = adult, NaN where Age is missing).
train['Child'] = train['Age'].apply(is_child)

In [None]:
# Share of each Survived value among children and among adults.
child_flag = train['Child']
children = train.loc[child_flag == 1, 'Survived'].value_counts(normalize=True)
adult = train.loc[child_flag == 0, 'Survived'].value_counts(normalize=True)

# Plot survival of children vs adults (label 1 = survived).
x0 = ['children', 'adult']
y0 = [children[1], adult[1]]

layout = go.Layout(
    title='Children vs Adults',
    yaxis=dict(title='Survival Rates'),
    autosize=False,
    width=300,
    height=400,
)
data = [go.Bar(x=x0, y=y0)]
fig7 = go.Figure(data=data, layout=layout)
fig7.show()

* Consistent with a "children first" evacuation policy, children had a higher survival rate than adults.

## Data cleaning

### Feature dropping

In [None]:
# Drop the Ticket and Cabin columns from the training frame
# (Cabin is populated for only 204 of 891 rows per train.info()).
train = train.drop(['Ticket', 'Cabin'], axis=1)

In [None]:
# Check the result: Ticket and Cabin should be gone, Child should be present.
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,0.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0.0
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0.0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0.0


### Create new feature

* Check whether Name and PassengerId are related to survival before dropping them.

#### Deal with feature : Name 

In [None]:
# Extract the title from Name: the run of letters that follows a space and
# ends with a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid string escape sequence.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Cross-tabulate titles against sex to see which titles are rare.
pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [None]:
# Recode titles: collapse uncommon titles into 'Rare' and fold the
# variants Mlle/Ms into 'Miss' and Mme into 'Mrs'.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
title_recode = {rare: 'Rare' for rare in rare_titles}
title_recode.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
train['Title'] = train['Title'].replace(title_recode)

# Survival rate per recoded title.
(
    train[['Title', 'Survived']]
    .groupby(['Title'], as_index=False)
    .mean()
)

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [None]:
# Drop Name (its title has been extracted into 'Title') and PassengerId
# from the training frame.
train = train.drop(['Name', 'PassengerId'], axis=1)

#### Deal with feature : SibSp & Parch

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger.
train['FamilySize'] = train['SibSp'].add(train['Parch']).add(1)

# Survival rate per family size, best first.
(
    train[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [None]:
# IsAlone = 1 when the passenger has no family aboard (FamilySize == 1), else 0.
train['IsAlone'] = (train['FamilySize'] == 1).astype('int64')

# Survival rate for passengers travelling alone vs with family.
(
    train[['IsAlone', 'Survived']]
    .groupby(['IsAlone'], as_index=False)
    .mean()
)

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [None]:
# Drop the columns that were only used to derive IsAlone (Parch, SibSp,
# FamilySize) and the Child flag that was only used for exploration.
train = train.drop(['Parch', 'SibSp', 'FamilySize', 'Child'], axis=1)

### Converting categorical feature to numeric

#### Sex to numeric

In [None]:
# Encode Sex as an integer: female -> 1, male -> 0.
# NOTE: not idempotent - re-running this cell on the already-encoded column
# maps every value to NaN, and astype(int) then raises.
train['Sex'] = train['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

#### Embarked to numeric

In [None]:
# Fill missing Embarked values with the most frequent port.
freq_port = train['Embarked'].dropna().mode().iloc[0]
train['Embarked'] = train['Embarked'].fillna(freq_port)

# Survival rate per embarkation port, best first.
(
    train[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [None]:
# Encode the embarkation port as an integer: S -> 0, C -> 1, Q -> 2.
# Missing values were filled in the previous cell, so astype(int) is safe here.
train['Embarked'] = train['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

#### Title to numeric

In [None]:
# Encode the recoded titles as integers; any title not in the mapping
# becomes NaN from .map() and is then filled with 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

encoded_titles = train['Title'].map(title_mapping)
train['Title'] = encoded_titles.fillna(0)

### Deal with missing data

In [None]:
# Per-column flag: True where the column still contains at least one NaN.
train.isna().any() # There is missing data in feature Age

Survived    False
Pclass      False
Sex         False
Age          True
Fare        False
Embarked    False
Title       False
IsAlone     False
dtype: bool

In [None]:
# Fill missing ages with the mean age.
# Assign the result back instead of calling fillna(inplace=True) on
# train['Age']: the in-place form operates on an intermediate object, which
# warns on recent pandas and leaves `train` unchanged under Copy-on-Write.
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [None]:
# Display the cleaned, fully numeric training frame.
train

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,0,22.000000,7.2500,0,1,0
1,1,1,1,38.000000,71.2833,1,3,0
2,1,3,1,26.000000,7.9250,0,2,1
3,1,1,1,35.000000,53.1000,0,3,0
4,0,3,0,35.000000,8.0500,0,1,1
...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,13.0000,0,5,1
887,1,1,1,19.000000,30.0000,0,2,1
888,0,3,1,29.699118,23.4500,0,2,0
889,1,1,0,26.000000,30.0000,1,1,1


## Deal with test data

In [None]:
# Drop the same columns from the test frame as were dropped from train.
test = test.drop(['Ticket', 'Cabin'], axis=1)

### Deal with feature: Name

In [None]:
# Extract the title from Name (letters after a space, ending with a period).
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid string escape sequence.
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Recode titles exactly as for the training frame: uncommon titles become
# 'Rare', Mlle/Ms become 'Miss', Mme becomes 'Mrs'.
test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

In [None]:
# Drop Name and PassengerId from the test features.
# PassengerId is still needed for the submission file; the notebook re-reads
# the raw test CSV later to recover it.
test = test.drop(['Name', 'PassengerId'], axis=1)

### Deal with feature: SibSp & Parch

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger.
test['FamilySize'] = test['SibSp'].add(test['Parch']).add(1)
# IsAlone = 1 when the passenger has no family aboard, else 0.
test['IsAlone'] = (test['FamilySize'] == 1).astype('int64')

In [None]:
# Drop the columns that were only used to derive IsAlone.
test = test.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)

### Sex to numeric

In [None]:
# Encode Sex with the same mapping as the training frame: female -> 1, male -> 0.
test['Sex'] = test['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

### Embarked to numeric

In [None]:
# Same port encoding as the training frame: S -> 0, C -> 1, Q -> 2.
# Unlike train, no fillna precedes this line, so astype(int) raises if any
# Embarked value in the test set is missing.
test['Embarked'] = test['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

### Title to numeric

In [None]:
# Encode titles with the same mapping as the training frame; any title not
# in the mapping becomes NaN from .map() and is then filled with 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

encoded_test_titles = test['Title'].map(title_mapping)
test['Title'] = encoded_test_titles.fillna(0)

### Deal with missing data

In [None]:
# Fill missing Age and Fare values with the respective test-set column mean.
# Assign the results back instead of calling fillna(inplace=True) on a
# selected column: the in-place form operates on an intermediate object, which
# warns on recent pandas and leaves `test` unchanged under Copy-on-Write.
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
# Display the cleaned, fully numeric test frame.
test

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,3,0,34.50000,7.8292,2,1,1
1,3,1,47.00000,7.0000,0,3,0
2,2,0,62.00000,9.6875,2,1,1
3,3,0,27.00000,8.6625,0,1,1
4,3,1,22.00000,12.2875,0,3,0
...,...,...,...,...,...,...,...
413,3,0,30.27259,8.0500,0,1,1
414,1,1,39.00000,108.9000,1,5,1
415,3,0,38.50000,7.2500,0,1,1
416,3,0,30.27259,8.0500,0,1,1


## Model and Predict
+ Logistic Regression
+ KNN or k-Nearest Neighbors
+ Support Vector Machines
+ Naive Bayes classifier
+ Decision Tree
+ Random Forest
+ Perceptron
+ Stochastic Gradient Descent

### Check train / test data

In [None]:
# Show both frames together to confirm they share the same feature columns
# (train additionally carries the Survived target).
display(train, test)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,0,22.000000,7.2500,0,1,0
1,1,1,1,38.000000,71.2833,1,3,0
2,1,3,1,26.000000,7.9250,0,2,1
3,1,1,1,35.000000,53.1000,0,3,0
4,0,3,0,35.000000,8.0500,0,1,1
...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,13.0000,0,5,1
887,1,1,1,19.000000,30.0000,0,2,1
888,0,3,1,29.699118,23.4500,0,2,0
889,1,1,0,26.000000,30.0000,1,1,1


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,3,0,34.50000,7.8292,2,1,1
1,3,1,47.00000,7.0000,0,3,0
2,2,0,62.00000,9.6875,2,1,1
3,3,0,27.00000,8.6625,0,1,1
4,3,1,22.00000,12.2875,0,3,0
...,...,...,...,...,...,...,...
413,3,0,30.27259,8.0500,0,1,1
414,1,1,39.00000,108.9000,1,5,1
415,3,0,38.50000,7.2500,0,1,1
416,3,0,30.27259,8.0500,0,1,1


### Split train data

In [None]:
# Separate the target from the features; the test frame has no target.
Y_train = train["Survived"]
X_train = train.drop(columns="Survived")
X_test = test.copy()

# Sanity check of the resulting shapes.
X_train.shape, Y_train.shape, X_test.shape

((891, 7), (891,), (418, 7))

### Logistic Regression

In [None]:
# Fit logistic regression and predict the test set.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Accuracy on the training data itself (in percent), not a held-out score.
train_accuracy_log = logreg.score(X_train, Y_train)
acc_log = round(train_accuracy_log * 100, 2)
acc_log

80.92

In [None]:
# Pair each feature with its fitted logistic-regression coefficient.
# The column is labelled "Correlation", but the values are model
# coefficients taken from logreg.coef_, not correlation coefficients.
feature_names = train.columns.delete(0)  # every column except the first (Survived)
coeff_df = pd.DataFrame({'Feature': feature_names})
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,Sex,2.204556
5,Title,0.394833
4,Embarked,0.318628
6,IsAlone,0.293328
3,Fare,0.001225
2,Age,-0.032276
0,Pclass,-1.113237


### KNN or k-Nearest Neighbors

In [None]:
# Fit a 3-nearest-neighbours classifier and predict the test set.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

# Accuracy on the training data itself (in percent).
train_accuracy_knn = knn.score(X_train, Y_train)
acc_knn = round(train_accuracy_knn * 100, 2)
acc_knn

84.96

### Support Vector Machines

In [None]:
# Fit a support vector classifier with default settings and predict the test set.
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

# Accuracy on the training data itself (in percent).
train_accuracy_svc = svc.score(X_train, Y_train)
acc_svc = round(train_accuracy_svc * 100, 2)
acc_svc

68.57

### Naive Bayes classifier

In [None]:
# Fit Gaussian naive Bayes and predict the test set.
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)

# Accuracy on the training data itself (in percent).
train_accuracy_gaussian = gaussian.score(X_train, Y_train)
acc_gaussian = round(train_accuracy_gaussian * 100, 2)
acc_gaussian

77.67

### Decision Tree

In [None]:
# Fit a decision tree and predict the test set.
# random_state is fixed so that re-running the notebook gives the same tree.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
# Accuracy on the training data itself (in percent); an unpruned tree can
# nearly memorise the training set, so this is not a generalisation estimate.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

98.43

### Random Forest

In [None]:
# Fit a 100-tree random forest and predict the test set.
# random_state is fixed so that the model, and therefore the submission
# built from it, is reproducible across runs.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# Accuracy on the training data itself (in percent), not a held-out score.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

98.43

### Perceptron

In [None]:
# Fit a perceptron with default settings and predict the test set.
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)

# Accuracy on the training data itself (in percent).
train_accuracy_perceptron = perceptron.score(X_train, Y_train)
acc_perceptron = round(train_accuracy_perceptron * 100, 2)
acc_perceptron

69.47

### Stochastic Gradient Descent

In [None]:
# Fit a linear classifier trained with stochastic gradient descent and
# predict the test set.
# random_state is fixed so that the reported score is reproducible across runs.
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
# Accuracy on the training data itself (in percent).
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

78.11

## Model evaluation

In [None]:
# Collect each model's training accuracy and rank the models by it.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Decent', acc_sgd),
    ('Decision Tree', acc_decision_tree),
]
model_names, model_accuracies = zip(*model_scores)
models = pd.DataFrame({
    'Model': list(model_names),
    'Score': list(model_accuracies),
})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,98.43
7,Decision Tree,98.43
1,KNN,84.96
2,Logistic Regression,80.92
6,Stochastic Gradient Decent,78.11
4,Naive Bayes,77.67
5,Perceptron,69.47
0,Support Vector Machines,68.57


In [None]:
# Re-read the raw test CSV: PassengerId was dropped from `test` during
# preprocessing and is needed for the submission file. The model features
# were already copied into X_test, so overwriting `test` does not affect them.
test = pd.read_csv('/content/test.csv')

In [None]:
# Y_pred was overwritten by each later model, so predict again with the
# random forest before building the submission frame.
Y_pred = random_forest.predict(X_test)

# One row per test passenger: the original id plus the predicted label.
submission = test[["PassengerId"]].assign(Survived=Y_pred)

In [None]:
# Display the submission frame (PassengerId, Survived) before writing it out.
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Write the submission to submission.csv in the working directory, without
# the DataFrame index column.
submission.to_csv('submission.csv', index=False)