In [1]:
#This project is based on Kaggle notebook created by Manav Sehgal. 
#Titanic Data Science Solutions(hereinafter referred to as REF1)
#URL: https://www.kaggle.com/startupsci/titanic-data-science-solutions
#Import packages
#Standard library
import os
import warnings
from pathlib import Path

#Tools
import numpy as np #Linear Algebra
import pandas as pd #Data processing, CSV file I/O 
warnings.filterwarnings('ignore')

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#Models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier

#Feature Selection
from sklearn.feature_selection import RFE

In [2]:
#Read raw data from csv files.
#The data directory can be overridden with the TITANIC_DATA_DIR environment variable;
#it falls back to the original location so existing setups keep working.
DATA_DIR = Path(os.environ.get('TITANIC_DATA_DIR', '/Users/joy/Downloads/mp2'))
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
#Combine the training set and test set so every cleaning step is applied to both.
#sort=True keeps the alphabetical column order shown in the outputs below,
#whatever the default of the installed pandas version is.
full = pd.concat([train, test], ignore_index=True, sort=True)
#Save these values for the models
y_train = train['Survived']
id_test = test['PassengerId']

In [3]:
#Let us have a look at the data information:
#column names, non-null counts and dtypes of the combined frame.
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [4]:
#Number of missing entries per feature (isna is the pandas alias of isnull)
full.isna().sum()

Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

In [5]:
#Cabin is missing 1014 values and Age 263, while Embarked and Fare are missing only a few.
#Our reference drops Cabin from the beginning because so much of it is missing.
#Here we fill its missing values instead and let feature selection decide whether to keep it.
#Preview the first five rows.
full.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [6]:
#Ticket values are irregular and hard to interpret, so we decided not to use them. 
#PassengerId is just a row identifier with no predictive meaning, so it can also be removed.
#As for Name, it contains each passenger's title (Mr, Mrs, Miss, ...), which we use below.

In [7]:
#Remove the identifier and ticket columns, which are not used as features
full = full.drop(columns=['PassengerId', 'Ticket'])

In [8]:
#To fill the missing values in Age, which is an important feature from our subjective view,
#we will use information in Name feature to estimate.
#A regex captures the title: the run of letters that follows a space and ends with a period.
#Raw string so that \. reaches the regex engine without an invalid-escape warning.
#Make new column Title to store the value. (REF1)
full['Title'] = full['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [9]:
#Count passengers per Title, split by Sex (REF1)
pd.crosstab(index=full['Title'], columns=full['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,4
Countess,1,0
Don,0,1
Dona,1,0
Dr,1,7
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,61


In [10]:
#There are some Titles that have too few examples. We combine them into 'Rare'.
#Also, some Titles have similar meaning. For example, Mlle and Miss both denote unmarried women.
#'Don' and 'Dona' belong to the rare group, so they need no separate mapping to 'Mr'/'Miss'.
rare_titles = ['Capt','Col','Countess','Don','Dona','Jonkheer','Lady','Major','Rev','Sir']
title_synonyms = {'Mme': 'Mrs', 'Mlle': 'Miss', 'Ms': 'Miss'}
full['Title'] = full['Title'].replace(rare_titles, 'Rare').replace(title_synonyms)

In [11]:
#Encode each title as an integer code (1..6) in place of the string value
title_order = ['Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Rare']
title_mapping = {title: code for code, title in enumerate(title_order, start=1)}
full['Title'] = full['Title'].map(title_mapping)

In [12]:
#Mean Age per Title code, youngest group first
age_by_title = full.loc[:, ['Age', 'Title']].groupby('Title', as_index=False).mean()
age_by_title.sort_values('Age')

Unnamed: 0,Title,Age
3,4,5.482642
1,2,21.824366
0,1,32.252151
2,3,36.918129
4,5,43.571429
5,6,45.714286


In [13]:
#Impute each missing Age with the median Age of passengers sharing the same Title
def _fill_with_median(ages):
    """Return ``ages`` with NaNs replaced by the median of the non-missing values."""
    return ages.fillna(ages.median())

full['Age'] = full.groupby('Title')['Age'].transform(_fill_with_median)

In [14]:
#Title and Name are not needed any more; remove both columns.
full = full.drop(columns=['Title', 'Name'])

In [15]:
#Next, the missing Embarked values.
#Start by tabulating the Embarked values against Survived.
pd.crosstab(index=full['Embarked'], columns=full['Survived'])

Survived,0.0,1.0
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,75,93
Q,47,30
S,427,217


In [16]:
#Encode the three Embarked codes as integers first.
embarked_mapping = dict(S=1, C=2, Q=3)
full['Embarked'] = full['Embarked'].map(embarked_mapping)

In [17]:
#Impute the missing Fare with the median Fare of the passenger's Pclass
fare_by_class = full.groupby('Pclass')['Fare']
full['Fare'] = fare_by_class.transform(lambda fares: fares.fillna(fares.median()))

In [18]:
#Then fill the missing Embarked.
#People who embarked at the same port and travelled in the same class may have paid a similar fare,
#so Fare is bucketed into four bands and Embarked is imputed within each band.
#Band edges are right-inclusive: <=7.91 -> 0, (7.91, 14.45] -> 1, (14.45, 31] -> 2, >31 -> 3.
fare_edges = [-np.inf, 7.91, 14.45, 31, np.inf]
full['FareBand'] = pd.cut(full['Fare'], bins=fare_edges, labels=False)

def _fill_with_mode(codes):
    """Replace NaNs with the most frequent code; leave the group untouched if it has none."""
    modes = codes.mode()
    return codes if modes.empty else codes.fillna(modes.iloc[0])

#Embarked codes are nominal labels, so impute with the most frequent port in the band:
#a median of the codes has no meaning and can even fall between two ports (e.g. 1.5).
full['Embarked'] = full.groupby('FareBand')['Embarked'].transform(_fill_with_mode)

In [19]:
#For Cabin only the letter is analysed: take the first capital letter found in each value
cabin_letter = full['Cabin'].str.extract('([A-Z])', expand=False)
full['CabinLetter'] = cabin_letter
pd.crosstab(index=full['CabinLetter'], columns=full['Sex'])

Sex,female,male
CabinLetter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,4,18
B,36,29
C,46,48
D,23,23
E,19,22
F,8,13
G,5,0
T,0,1


In [20]:
#Number the cabin letters (A=0 ... G=6, T=7) and overwrite Cabin with these integer codes.
cabin_mapping = {letter: code for code, letter in enumerate('ABCDEFGT')}
full['Cabin'] = full['CabinLetter'].map(cabin_mapping)

In [21]:
#Impute Cabin with the median code of passengers in the same (Pclass, FareBand) group,
#then discard the helper FareBand column.
def _median_filled(codes):
    """Return ``codes`` with NaNs replaced by the median of the non-missing values."""
    return codes.fillna(codes.median())

full['Cabin'] = full.groupby(['Pclass', 'FareBand'])['Cabin'].transform(_median_filled)
full = full.drop(columns='FareBand')

In [22]:
#Count the Cabin values that are still missing after the grouped fill
full['Cabin'].isna().sum()

59

In [23]:
#Some Cabin values are still missing; fall back to the median code of the Pclass alone
cabin_by_class = full.groupby('Pclass')['Cabin']
full['Cabin'] = cabin_by_class.transform(lambda codes: codes.fillna(codes.median()))

In [24]:
#CabinLetter is no longer needed now that Cabin holds the numeric codes
full = full.drop(columns='CabinLetter')

In [25]:
#Encode Sex numerically so that RFE can be applied: female -> 0, male -> 1
sex_mapping = dict(female=0, male=1)
full['Sex'] = full['Sex'].map(sex_mapping)

In [26]:
#Preview the first five rows of the cleaned frame
full.head(n=5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived
0,22.0,5.0,1.0,7.25,0,3,1,1,0.0
1,38.0,2.0,2.0,71.2833,0,1,0,1,1.0
2,26.0,4.0,1.0,7.925,0,3,0,0,1.0
3,35.0,2.0,1.0,53.1,0,1,0,1,1.0
4,35.0,4.0,1.0,8.05,0,3,1,0,0.0


In [27]:
#Survived is the prediction target, so remove it from the feature frame
full = full.drop(columns='Survived')

In [28]:
#Split the cleaned frame back into training and test rows.
#The training rows come first in `full`, so the split point is the size of the training set
#rather than a hard-coded row count.
n_train = len(train)
x_train = full.iloc[:n_train, :]
x_test = full.iloc[n_train:, :]

In [29]:
#Preview the feature frame after removing the target column
full.head(n=5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,Pclass,Sex,SibSp
0,22.0,5.0,1.0,7.25,0,3,1,1
1,38.0,2.0,2.0,71.2833,0,1,0,1
2,26.0,4.0,1.0,7.925,0,3,0,0
3,35.0,2.0,1.0,53.1,0,1,0,1
4,35.0,4.0,1.0,8.05,0,3,1,0


In [30]:
#We use RFE to do feature selection.
#First we introduce all methods.
#Decision trees and random forests are randomised, so their seed is fixed
#to make the rankings and scores reproducible from run to run.
RANDOM_STATE = 42

# Decision Tree
decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Random Forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# Logistic Regression
logreg = LogisticRegression()

# Perceptron
perceptron = Perceptron()

In [31]:
#Column labels of the feature frame, in order
names = full.columns.tolist()
#Rank the features for each method with RFE (rank 1 = the single feature RFE keeps)
#DT
rfe_dt = RFE(decision_tree, n_features_to_select=1)
rfe_dt.fit(x_train, y_train)
dt_ranking = [(round(rank, 4), name) for rank, name in zip(rfe_dt.ranking_, names)]
print('Decision Tree Feature Ranking:')
print(sorted(dt_ranking))

Decision Tree Feature Ranking:
[(1, 'Fare'), (2, 'Sex'), (3, 'Age'), (4, 'Pclass'), (5, 'SibSp'), (6, 'Cabin'), (7, 'Parch'), (8, 'Embarked')]


In [32]:
#RF: rank the features by recursive elimination with the random forest
rfe_rf = RFE(random_forest, n_features_to_select=1)
rfe_rf.fit(x_train, y_train)
rf_ranking = [(round(rank, 4), name) for rank, name in zip(rfe_rf.ranking_, names)]
print('Random Forest Feature Ranking:')
print(sorted(rf_ranking))

Random Forest Feature Ranking:
[(1, 'Fare'), (2, 'Age'), (3, 'Sex'), (4, 'Pclass'), (5, 'Cabin'), (6, 'SibSp'), (7, 'Parch'), (8, 'Embarked')]


In [33]:
#LG: rank the features by recursive elimination with logistic regression
rfe_lg = RFE(logreg, n_features_to_select=1)
rfe_lg.fit(x_train, y_train)
lg_ranking = [(round(rank, 4), name) for rank, name in zip(rfe_lg.ranking_, names)]
print('Logistic Regression Feature Ranking:')
print(sorted(lg_ranking))

Logistic Regression Feature Ranking:
[(1, 'Sex'), (2, 'Pclass'), (3, 'Embarked'), (4, 'SibSp'), (5, 'Cabin'), (6, 'Age'), (7, 'Parch'), (8, 'Fare')]


In [34]:
#PER: rank the features by recursive elimination with the perceptron
rfe_per = RFE(perceptron, n_features_to_select=1)
rfe_per.fit(x_train, y_train)
per_ranking = [(round(rank, 4), name) for rank, name in zip(rfe_per.ranking_, names)]
print('Perceptron Feature Ranking:')
print(sorted(per_ranking))

Perceptron Feature Ranking:
[(1, 'Sex'), (2, 'Pclass'), (3, 'SibSp'), (4, 'Parch'), (5, 'Embarked'), (6, 'Fare'), (7, 'Age'), (8, 'Cabin')]


In [35]:
#Features Embarked, Fare and Cabin are dropped; the remaining ones are used for modelling.
#NOTE(review): Fare is ranked first by the Decision Tree and Random Forest RFE runs above -
#confirm that dropping it is intended.
final = full.drop(columns=['Fare', 'Cabin', 'Embarked'])

In [36]:
#Preview the selected features (first five rows)
final.head()

Unnamed: 0,Age,Parch,Pclass,Sex,SibSp
0,22.0,0,3,1,1
1,38.0,0,1,0,1
2,26.0,0,3,0,0
3,35.0,0,1,0,1
4,35.0,0,3,1,0


In [37]:
#One-hot encode every column. get_dummies only expands string/categorical columns by default,
#so all values are cast to str first.
#NOTE(review): this also expands the continuous Age column into one indicator per distinct
#age value (Age_0.17, Age_0.33, ...) - consider binning Age or keeping it numeric.
final = pd.get_dummies(final.astype(str))

In [38]:
#Inspect the first five rows of the one-hot encoded frame
final.head()

Unnamed: 0,Age_0.17,Age_0.33,Age_0.42,Age_0.67,Age_0.75,Age_0.83,Age_0.92,Age_1.0,Age_10.0,Age_11.0,...,Pclass_3,Sex_0,Sex_1,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8
0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0


In [39]:
#Split the encoded frame back into training and test rows.
#The training rows come first, so the split point is the size of the training set
#rather than a hard-coded row count.
n_train = len(train)
x_train = final.iloc[:n_train, :]
x_test = final.iloc[n_train:, :]

In [40]:
#Logistic Regression: train, predict the test set, then report accuracy on the training data (%)
logreg.fit(x_train, y_train)
lg_pred = logreg.predict(x_test)
lg_train_score = logreg.score(x_train, y_train)
acc_log = round(100 * lg_train_score, 2)
acc_log

83.61

In [41]:
#Perceptron: train, predict the test set, then report accuracy on the training data (%)
perceptron.fit(x_train, y_train)
per_pred = perceptron.predict(x_test)
per_train_score = perceptron.score(x_train, y_train)
acc_perceptron = round(100 * per_train_score, 2)
acc_perceptron

83.5

In [42]:
#Fit and predict using Random Forest
random_forest.fit(x_train, y_train)
rf_pred = random_forest.predict(x_test)
#Accuracy on the training data, in percent (the model is scored once and the value displayed)
acc_random_forest = round(random_forest.score(x_train, y_train) * 100, 2)
acc_random_forest

92.03

In [43]:
#Decision Tree: train, predict the test set, then report accuracy on the training data (%)
decision_tree.fit(x_train, y_train)
dt_pred = decision_tree.predict(x_test)
dt_train_score = decision_tree.score(x_train, y_train)
acc_decision_tree = round(100 * dt_train_score, 2)
acc_decision_tree

92.03

In [44]:
#Export the Logistic Regression predictions to a csv file (PassengerId, Survived)
submission = id_test.to_frame().assign(Survived=lg_pred)
submission.to_csv('/Users/joy/Downloads/mp2/submissionlg.csv', index=False)

In [45]:
#Export the Random Forest predictions to a csv file (PassengerId, Survived)
submission = id_test.to_frame().assign(Survived=rf_pred)
submission.to_csv('/Users/joy/Downloads/mp2/submissionrf.csv', index=False)

In [46]:
#Export the Decision Tree predictions to a csv file (PassengerId, Survived)
submission = id_test.to_frame().assign(Survived=dt_pred)
submission.to_csv('/Users/joy/Downloads/mp2/submissiondt.csv', index=False)

In [47]:
#Export the Perceptron predictions to a csv file (PassengerId, Survived)
submission = id_test.to_frame().assign(Survived=per_pred)
submission.to_csv('/Users/joy/Downloads/mp2/submissionper.csv', index=False)

In [48]:
#On the training set, Decision Tree and Random Forest have the best scores (these are training accuracies, so they overstate how well the models generalise).
#However, on the test set (the submitted predictions), Logistic Regression got the best accuracy. 
#The best score of these four submissions is 0.76076.