# Titanic - Machine Learning from Disaster

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Import Libraries

In [2]:
# A. Data Cleaning
# data info
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.linear_model import LinearRegression
from math import sqrt
from sklearn.metrics import mean_squared_error

# The Data

In [3]:
titanic = pd.read_csv('train.csv')
titanic.info()
titanic.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Cleaning

In [4]:
# explore the age in 3 classes.
# ready to fillna
titanic.groupby(['Pclass'])[['Age']].mean()

Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,38.233441
2,29.87763
3,25.14062


In [5]:
# first rule pd.is null
# second rule pclass ==1, null 的column就return 38.2
# cols[] 就是得有东西装数据

def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        
        if Pclass == 1:
            return 38.2
        elif Pclass == 2:
            return 29.8
        else:
            return 25.1
    else:
        return Age

In [6]:
# fill na
titanic['Age'] = titanic[['Age', 'Pclass']].apply(impute_age,axis=1)

In [7]:
titanic.drop('Cabin',axis=1,inplace=True)

In [8]:
titanic.dropna(inplace=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,25.1,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C


## Converting Categorical features into dummy variable using pandas.

In [9]:
#Getting the Sex column into a machine learning algorithm column

sex = pd.get_dummies(titanic['Sex'],drop_first=True) 
pclass = pd.get_dummies(titanic['Pclass'],drop_first=True)
embark = pd.get_dummies(titanic['Embarked'],drop_first=True)

In [10]:
# Add the new columns to our data frame
titanic = pd.concat([titanic,sex,embark,pclass],axis=1)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,male,Q,S,2,3
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,0,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,0,1,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0,0,1,0,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1,0,1,0,1


In [11]:
# Age set 
# '0-11' for children, '12-18' for teenagers, '19-55' for adults, '56-80' elderly
# Age set 越多，score就越高
Ageset = pd.cut(titanic['Age'], [0,11,18,55,80], labels = ['0', '1','2','3'])
titanic['Ageset'] = Ageset
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,male,Q,S,2,3,Ageset
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,0,1,0,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0,0,0,0,0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,0,1,0,1,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0,0,1,0,0,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1,0,1,0,1,2


In [12]:
# add fare
# based on the fare distribution
# '0-10' for 1 whom is root class, '11=80' for 2 whom is medium family, 
# '81-200' for 3 whom is welloff family, '201-600' for 4 whom is veri rich

Fareset = pd.cut(titanic['Fare'], [-1,10,80,200,600], labels = ['0','1','2','3'])
titanic['Fareset'] = Fareset
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,male,Q,S,2,3,Ageset,Fareset
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,0,1,0,1,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0,0,0,0,0,2,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,0,1,0,1,2,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0,0,1,0,0,2,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1,0,1,0,1,2,0


In [13]:
# Now we don't need the Embarked column since we already encoded it as Q and S and we don't need
# the Sex column since we encoded it with the male column. 
# We don't need the Name and Ticket columns since we don't have enough information there for us to use it for machine learning.

titanic.drop(['Sex','Embarked','Name','Ticket','Pclass','Fare','Age','PassengerId'],axis=1,inplace=True)
titanic.head()

Unnamed: 0,Survived,SibSp,Parch,male,Q,S,2,3,Ageset,Fareset
0,0,1,0,1,0,1,0,1,2,0
1,1,1,0,0,0,0,0,0,2,1
2,1,0,0,0,0,1,0,1,2,0
3,1,1,0,0,0,1,0,0,2,1
4,0,0,0,1,0,1,0,1,2,0


# Data Modeling

In [14]:
X = titanic.drop('Survived',axis=1).values
y = titanic['Survived']

In [15]:
# train test split 是方法之一

from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

### 1. Logistic Regression

In [17]:
#Train and Predict
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
logmodel.score(X_train, y_train)

LogisticRegression()

0.8073836276083467

### 2. Decision Tree model

In [18]:
# Decision Tree Classification
clf = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
clf.fit(X_train,y_train)
clf.score(X_train,y_train)

DecisionTreeClassifier(max_depth=5, random_state=0)

0.8443017656500803

In [19]:
# 纯粹用来看treemap的
from sklearn.datasets import load_iris
import graphviz 

dot_data = tree.export_graphviz(clf, out_file=None, 
                     
                     filled=True, rounded=True,  
                     special_characters=True)  
graph = graphviz.Source(dot_data).view()


### 3. Gradient Boosting model

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

clfg = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=5, random_state=0)
clfg.fit(X_train,y_train)
clfg.score(X_train,y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=5, random_state=0)

0.8715890850722311

### 4. Random Forest model

In [20]:
from sklearn.ensemble import RandomForestClassifier

clfr = RandomForestClassifier(n_estimators=100,max_depth=5, random_state=0)
clfr.fit(X_train,y_train)
clfr.score(X_train,y_train)

RandomForestClassifier(max_depth=5, random_state=0)

0.841091492776886

In [27]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2


In [37]:
from xgboost import XGBClassifier
import xgboost as xgb
clfXGB = XGBClassifier(max_depth=5, random_state=0)


clfXGB.fit(X_train,y_train)
clfXGB.score(X_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

0.8683788121990369

# Predict the test data

In [22]:
titanic_test = pd.read_csv('test.csv')
titanic_test.info()
titanic_test.head()
zhaomingege = titanic_test['PassengerId']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [23]:
titanic_test.groupby(['Pclass'])[['Age']].mean()

Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,40.918367
2,28.7775
3,24.027945


In [24]:
def impute_age_test(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        
        if Pclass == 1:
            return 40.9
        elif Pclass == 2:
            return 28.8
        else:
            return 24.0
    else:
        return Age

In [25]:
titanic_test['Age'] = titanic_test[['Age', 'Pclass']].apply(impute_age_test,axis=1)
titanic_test.drop('Cabin',axis=1,inplace=True)
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].mean())

In [26]:
sex = pd.get_dummies(titanic_test['Sex'],drop_first=True) 
pclass = pd.get_dummies(titanic_test['Pclass'],drop_first=True)
embark = pd.get_dummies(titanic_test['Embarked'],drop_first=True)

titanic_test = pd.concat([titanic_test,sex,embark,pclass],axis=1)
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,male,Q,S,2,3
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,1,1,0,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,0,0,1,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,1,1,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,1,0,1,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,0,0,1,0,1


In [27]:
Ageset = pd.cut(titanic_test['Age'], [0,11,18,55,80], labels = ['0', '1','2','3'])
titanic_test['Ageset'] = Ageset

In [28]:
Fareset = pd.cut(titanic_test['Fare'], [-1,10,80,200,600], labels = ['0','1','2','3'])
titanic_test['Fareset'] = Fareset

In [29]:
titanic_test.drop(['Sex','Embarked','Name','Ticket','Pclass','Fare','PassengerId','Age'],axis=1,inplace=True)
titanic_test.head()

Unnamed: 0,SibSp,Parch,male,Q,S,2,3,Ageset,Fareset
0,0,0,1,1,0,0,1,2,0
1,1,0,0,0,1,0,1,2,0
2,0,0,1,1,0,1,0,3,0
3,0,0,1,0,1,0,1,2,0
4,1,1,0,0,1,0,1,2,1


In [30]:
X_test = titanic_test.values
X_test

array([[0, 0, 1, ..., 1, '2', '0'],
       [1, 0, 0, ..., 1, '2', '0'],
       [0, 0, 1, ..., 0, '3', '0'],
       ...,
       [0, 0, 1, ..., 1, '2', '0'],
       [0, 0, 1, ..., 1, '2', '0'],
       [1, 1, 1, ..., 1, '2', '1']], dtype=object)

In [31]:
y_pred = clfXGB.predict(X_test)

In [32]:
res = []
for index, pred in enumerate(y_pred):
    pid = list(zhaomingege)[index]
    res.append({'PassengerId': pid, 'Survived': pred})

In [33]:
import csv

with open('wozuishuai.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['PassengerId', 'Survived'])
    writer.writeheader()
    for x in res:
        writer.writerow(x)

22

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

7

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

8

In [34]:
len(y_pred)

418

In [35]:
len(list(zhaomingege))

418