In [524]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import math

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

### Data Dictionary
Variable	Definition	Key
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	# of siblings / spouses aboard the Titanic	
parch	# of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
Variable Notes
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [525]:
train_df = pd.read_csv('/home/mathewlech/sandbox_local/input/titanic/train.csv')
test_df = pd.read_csv('/home/mathewlech/sandbox_local/input/titanic/test.csv')
combine = [train_df, test_df]
transformed_Sex = False
transformed_Embark = False

age_buckets = 20
fare_buckets = 10

#train_df.info()
print("--"*40)
#test_df.info()

--------------------------------------------------------------------------------


* 77% of cabin is null probably not much use for this column
* passengerID doesn't intrinsically give us any real information
* Ticket doesnt seemt to provide insight


### Data manipulations

In [526]:
if 'Cabin' in train_df:
    train_df = train_df.drop(['Cabin'], axis=1)
if 'Cabin' in test_df:
    test_df = test_df.drop(['Cabin'], axis=1)       
    
if 'Ticket' in train_df:
    train_df = train_df.drop(['Ticket'], axis=1)
if 'Ticket' in test_df:
    test_df = test_df.drop(['Ticket'], axis=1)    
    
combine = [train_df, test_df]    

In [527]:
freq_port = train_df.Embarked.dropna().mode()[0]

In [528]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int) 

In [529]:
train_df['FamilySize']  = 0
test_df['FamilySize']  = 0
combine = [train_df, test_df]
for dataset in combine:        
        dataset['FamilySize'] = (dataset['Parch'] + dataset['SibSp']).astype(int) + 1        

In [530]:
train_df['IsAlone'] = 0
test_df['IsAlone'] = 0
train_df.loc[ (train_df.FamilySize == 1), 'IsAlone'] = 1
test_df.loc[ (train_df.FamilySize == 1), 'IsAlone'] = 1

if 'Parch' in test_df:
    test_df = test_df.drop(['Parch'], axis=1)  
if 'SibSp' in test_df:
    test_df = test_df.drop(['SibSp'], axis=1)      

if 'Parch' in train_df:
    train_df = train_df.drop(['Parch'], axis=1)  
if 'SibSp' in train_df:
    train_df = train_df.drop(['SibSp'], axis=1)      

if 'FamilySize' in train_df:
    train_df = train_df.drop(['FamilySize'], axis=1)  
    
if 'FamilySize' in test_df:
    test_df = test_df.drop(['FamilySize'], axis=1)   
    
combine = [train_df, test_df]    

In [531]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

In [532]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

In [533]:
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
combine = [train_df, test_df] 

### Age band Feature

In [534]:
guess_ages = np.zeros((2,3))
guess_ages

array([[0., 0., 0.],
       [0., 0., 0.]])

In [535]:
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                  (dataset['Pclass'] == j+1)]['Age'].dropna()

            # age_mean = guess_df.mean()
            # age_std = guess_df.std()
            # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

In [536]:
train_df['AgeBand'] = pd.cut(train_df['Age'], age_buckets,include_lowest=True).astype(str).str.strip('()[]')
age_band_buckets = train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)
age_band_buckets = age_band_buckets['AgeBand'].str.split(',', expand=True)
age_band_buckets.rename(columns={ age_band_buckets.columns[0]: "Min" , age_band_buckets.columns[1]: "Max"}, inplace=True)
age_band_buckets = age_band_buckets.dropna()
age_band_buckets.Min = age_band_buckets.Min.astype(float)
age_band_buckets.Max = age_band_buckets.Max.astype(float)
age_band_buckets.sort_values(by='Min', ascending=True,inplace=True)
#age_band_buckets.head(100).sort_values(by='Min', ascending=True)

In [537]:
train_df['AgeInt'] = -1
test_df['AgeInt'] = -1
combine = [train_df, test_df] 
a_rowcount = age_band_buckets.shape[0] -1
a_iterplace = 0
for index, row in age_band_buckets.sort_values(by='Min', ascending=True).iterrows():
    minval =  row['Min']
    maxval = row['Max']
    
    if a_iterplace == a_rowcount:#need to handle max value truncation        
        maxval += 1        
        
    a_iterplace +=1
    for dataset in combine:                         
        dataset.loc[(dataset['Age'] >= minval) & (dataset['Age'] <= maxval), 'AgeInt'] = index        

### Fare band feature

In [538]:
train_df['FareBand'] = pd.qcut(train_df['Fare'], fare_buckets).astype(str).str.strip('()[]')
Fare_band_buckets = train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)
Fare_band_buckets = Fare_band_buckets['FareBand'].str.split(',', expand=True)
Fare_band_buckets.rename(columns={ Fare_band_buckets.columns[0]: "Min" , Fare_band_buckets.columns[1]: "Max"}, inplace=True)
Fare_band_buckets = Fare_band_buckets.dropna()
Fare_band_buckets.Min = Fare_band_buckets.Min.astype(float)
Fare_band_buckets.Max = Fare_band_buckets.Max.astype(float)
Fare_band_buckets.sort_values(by='Min', ascending=True,inplace=True)
#Fare_band_buckets.head(100).sort_values(by='Min', ascending=True)

In [539]:
train_df['FareInt'] = -1
test_df['FareInt'] = -1
combine = [train_df, test_df] 
f_rowcount = Fare_band_buckets.shape[0] -1
f_iterplace = 0
for index, row in Fare_band_buckets.sort_values(by='Min', ascending=True).iterrows():
    minval =  row['Min']
    maxval = row['Max']
    
    if f_iterplace == f_rowcount:#need to handle max value truncation        
        maxval += 1        
        
    f_iterplace +=1
    for dataset in combine:                         
        dataset.loc[(dataset['Fare'] >= minval) & (dataset['Fare'] <= maxval), 'FareInt'] = index        

In [540]:
if 'FareBand' in test_df:
    test_df = test_df.drop(['FareBand'], axis=1)  
if 'SibSp' in test_df:
    test_df = test_df.drop(['SibSp'], axis=1)      

if 'FareBand' in train_df:
    train_df = train_df.drop(['FareBand'], axis=1)  
if 'AgeBand' in train_df:
    train_df = train_df.drop(['AgeBand'], axis=1)    
    
if 'Fare' in train_df:
    train_df = train_df.drop(['Fare'], axis=1)  
if 'Age' in train_df:
    train_df = train_df.drop(['Age'], axis=1)   
    
if 'Fare' in test_df:
    test_df = test_df.drop(['Fare'], axis=1)  
if 'Age' in test_df:
    test_df = test_df.drop(['Age'], axis=1)    
    
combine = [train_df, test_df] 

### use name to generate a title

In [541]:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

train_df['Title'].value_counts() 

Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: Title, dtype: int64

In [542]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

In [543]:
if 'Name' in test_df:
    test_df = test_df.drop(['Name'], axis=1)  
if 'Name' in train_df:
    train_df = train_df.drop(['Name'], axis=1)    
    
combine = [train_df, test_df] 

In [544]:
#train_df.info()
print("--"*40)
#test_df.info()

--------------------------------------------------------------------------------


In [545]:
#colormap = plt.cm.RdBu
#plt.figure(figsize=(14,12))
#plt.title('Pearson Correlation of Features', y=1.05, size=15)
#sns.heatmap(train_df[["AgeInt", "Pclass","FareInt","Survived","Sex",'IsAlone','Title']].astype(float).corr(),linewidths=0.1,vmax=1.0, 
#            square=True, cmap=colormap, linecolor='white', annot=True)

In [546]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Embarked,IsAlone,AgeInt,FareInt,Title
0,892,3,0,2,0,6,6,1
1,893,3,1,0,0,10,0,3
2,894,2,0,2,1,14,9,1
3,895,3,0,0,0,4,9,1
4,896,3,1,0,1,3,1,3


## Machine Learning

In [561]:
X_train = train_df.drop("Survived", axis=1)
X_train = X_train.drop("PassengerId", axis=1)
Y_train = train_df["Survived"]

X_test = test_df
X_test = X_test.drop("PassengerId", axis=1)

X_train.shape, Y_train.shape, X_test.shape

((891, 7), (891,), (418, 7))

In [556]:
mlp = MLPClassifier(solver='lbfgs', alpha=1e-3,
                     hidden_layer_sizes=(5, 2), random_state=1)

logreg = LogisticRegression()
ridge = RidgeClassifier()
svc = SVC()
knn = KNeighborsClassifier(n_neighbors = 3)
perceptron = Perceptron(alpha=1e-5)
gaussian = GaussianNB()
linear_svc = LinearSVC()
sgd = SGDClassifier()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier(n_estimators=100)

models ={}
models["logreg"] = logreg
models["ridge"] = ridge
models["svc"] = svc
models["knn"] = knn
models["perceptron"] = perceptron
models["mlp"] = mlp
models["gaussian"] = gaussian
models["linear_svc"] = linear_svc
models["sgd"] = sgd
models["decision_tree"] = decision_tree
models["random_forest"] = random_forest

In [557]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import cross_val_score
scores_df = pd.DataFrame()
#allcoeffs = pd.DataFrame(columns=['Feature','Model','Correlation'])
for name, model in models.items():
    model.fit(X_train, Y_train)
    X_Pred = model.predict(X_train)
        
    sc = round(model.score(X_train, Y_train) * 100, 2)    
    precision, recall, fscore, support = score(Y_train, X_Pred)

    #coeff_df = pd.DataFrame(X_train.columns.delete(0))
    #coeff_df['Model'] = name
    #coeff_df.rename(columns={ coeff_df.columns[0]: "Feature" }, inplace=True)
    #coeff_df["Correlation"] = pd.Series(model.coef_[0])    
    
    #allcoeffs = allcoeffs.append(coeff_df)
    
    cvscore = cross_val_score(model, X_train, Y_train,
                                scoring='accuracy', cv=4)
    
    cv_mean = round(cvscore.mean() * 100, 2)   
    cv_stddev = round(cvscore.std() * 100 * 2, 2)   
    scores_df = scores_df.append({
                                    'Model':name, 
                                    'Score':sc,
                                    'CV_Mean': cv_mean,
                                    'CV_SDEV': cv_stddev,
                                    'Precision': precision,
                                    'recall': recall,
                                    'fscore': fscore,
                                    'support': support                                    
                                
    }, ignore_index=True)



In [558]:
scores_df = scores_df[['Model', 'Score', 'CV_Mean','CV_SDEV', 'Precision','recall','fscore','support',]]
scores_df.sort_values(by='Score', ascending=False,inplace=True)
scores_df.head(100)

Unnamed: 0,Model,Score,CV_Mean,CV_SDEV,Precision,recall,fscore,support
9,decision_tree,92.59,80.02,2.99,"[0.9086294416243654, 0.96]","[0.9781420765027322, 0.8421052631578947]","[0.9421052631578947, 0.8971962616822429]","[549, 342]"
10,random_forest,92.59,82.16,1.54,"[0.9142367066895368, 0.948051948051948]","[0.970856102003643, 0.8538011695906432]","[0.941696113074205, 0.8984615384615385]","[549, 342]"
2,svc,87.21,81.15,6.68,"[0.8717948717948718, 0.8725490196078431]","[0.9289617486338798, 0.7807017543859649]","[0.8994708994708994, 0.8240740740740741]","[549, 342]"
3,knn,86.76,78.45,3.29,"[0.8983364140480592, 0.82]","[0.8852459016393442, 0.8391812865497076]","[0.8917431192660552, 0.8294797687861271]","[549, 342]"
5,mlp,83.39,80.92,5.55,"[0.8325041459369817, 0.8368055555555556]","[0.9143897996357013, 0.7046783625730995]","[0.8715277777777778, 0.7650793650793651]","[549, 342]"
0,logreg,79.57,79.01,3.05,"[0.8259325044404974, 0.7439024390243902]","[0.8469945355191257, 0.7134502923976608]","[0.8363309352517986, 0.7283582089552239]","[549, 342]"
1,ridge,79.57,79.57,3.05,"[0.8259325044404974, 0.7439024390243902]","[0.8469945355191257, 0.7134502923976608]","[0.8363309352517986, 0.7283582089552239]","[549, 342]"
7,linear_svc,79.57,79.23,3.89,"[0.8247787610619469, 0.745398773006135]","[0.848816029143898, 0.7105263157894737]","[0.8366247755834829, 0.7275449101796406]","[549, 342]"
6,gaussian,78.45,77.89,4.09,"[0.8413001912045889, 0.7038043478260869]","[0.8014571948998178, 0.7573099415204678]","[0.8208955223880596, 0.7295774647887323]","[549, 342]"
4,perceptron,77.33,73.18,5.3,"[0.8699360341151386, 0.6658767772511849]","[0.7431693989071039, 0.8216374269005848]","[0.8015717092337918, 0.7356020942408377]","[549, 342]"


In [559]:
scores_df.sort_values(by='CV_Mean', ascending=False,inplace=True)
scores_df.head(100)

Unnamed: 0,Model,Score,CV_Mean,CV_SDEV,Precision,recall,fscore,support
10,random_forest,92.59,82.16,1.54,"[0.9142367066895368, 0.948051948051948]","[0.970856102003643, 0.8538011695906432]","[0.941696113074205, 0.8984615384615385]","[549, 342]"
2,svc,87.21,81.15,6.68,"[0.8717948717948718, 0.8725490196078431]","[0.9289617486338798, 0.7807017543859649]","[0.8994708994708994, 0.8240740740740741]","[549, 342]"
5,mlp,83.39,80.92,5.55,"[0.8325041459369817, 0.8368055555555556]","[0.9143897996357013, 0.7046783625730995]","[0.8715277777777778, 0.7650793650793651]","[549, 342]"
9,decision_tree,92.59,80.02,2.99,"[0.9086294416243654, 0.96]","[0.9781420765027322, 0.8421052631578947]","[0.9421052631578947, 0.8971962616822429]","[549, 342]"
1,ridge,79.57,79.57,3.05,"[0.8259325044404974, 0.7439024390243902]","[0.8469945355191257, 0.7134502923976608]","[0.8363309352517986, 0.7283582089552239]","[549, 342]"
7,linear_svc,79.57,79.23,3.89,"[0.8247787610619469, 0.745398773006135]","[0.848816029143898, 0.7105263157894737]","[0.8366247755834829, 0.7275449101796406]","[549, 342]"
0,logreg,79.57,79.01,3.05,"[0.8259325044404974, 0.7439024390243902]","[0.8469945355191257, 0.7134502923976608]","[0.8363309352517986, 0.7283582089552239]","[549, 342]"
3,knn,86.76,78.45,3.29,"[0.8983364140480592, 0.82]","[0.8852459016393442, 0.8391812865497076]","[0.8917431192660552, 0.8294797687861271]","[549, 342]"
6,gaussian,78.45,77.89,4.09,"[0.8413001912045889, 0.7038043478260869]","[0.8014571948998178, 0.7573099415204678]","[0.8208955223880596, 0.7295774647887323]","[549, 342]"
8,sgd,74.52,73.52,11.48,"[0.8779342723004695, 0.6236559139784946]","[0.6812386156648452, 0.847953216374269]","[0.7671794871794871, 0.7187112763320942]","[549, 342]"


In [562]:
# Random Forest is the best algo to use for prediction
Y_pred = random_forest.predict(X_test)
X_test = test_df

submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })


In [564]:
submission.to_csv('/home/mathewlech/sandbox_local/output/titanic/submission.csv', index=False)
submission.head(500)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0
