In [1]:
# Importing Python libraries

# General tools:
import math, os, sys # standart python libraries
import numpy as np
import pandas as pd # for dataframes
import itertools # combinatorics toolkit
import time # for obtaining computation execution times
from scipy import interp # interpolation function

# Data Pre-Processing:
from sklearn.preprocessing import StandardScaler # for standardizing data
from collections import Counter # object class for counting element occurences

# Machine Learning Classifiers:
from xgboost import XGBClassifier # xgboost classifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron # Linear classifiers
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier # Decision tree classifiers
from sklearn.svm import SVC # Support-vector machine classifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # LDA classifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier # Nearest-Neighbors classifier

# Feature and model selection:
from sklearn.model_selection import StratifiedKFold # train/test splitting tool for cross-validation
from sklearn.model_selection import GridSearchCV # hyperparameter optimization tool via exhaustive search
from sklearn.model_selection import cross_val_score # automates cross-validated scoring
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc # scoring metrics
from sklearn.feature_selection import RFE # recursive feature elimination
from sklearn.model_selection import learning_curve # learning-curve generation for bias-variance tradeoff
from sklearn.model_selection import validation_curve # for fine-tunning hyperparameters
from sklearn.pipeline import Pipeline

# Plotting:
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

from matplotlib import rcParams

# Manage Warnings:
# NOTE(review): the blanket 'ignore' filter silences every warning category for the rest
# of the session, including deprecation and chained-assignment warnings raised by the
# libraries used below. Consider narrowing it to specific categories.
import warnings
warnings.filterwarnings ('ignore')

# Ensure Jupyter Notebook plots the figures in-line:
%matplotlib inline
rcParams['figure.figsize'] = 5, 4 # default figure size as (width, height) in inches
sns.set_style('whitegrid') # seaborn style applied to the plots that follow

In [2]:
# Importing the data

# Folder holding the Kaggle Titanic CSV files. It can be overridden with the
# TITANIC_DATA_DIR environment variable so the notebook also runs on other machines;
# when the variable is unset the original local folder is used.
DATA_DIR = os.environ.get('TITANIC_DATA_DIR',
                          'C:\\Users\\PC-Home\\OneDrive\\Documentos\\Projetos\\Kaggle - Titanic')

df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Combined dataset: training rows followed by test rows. The row index is not reset, so
# index labels repeat between the two parts; use PassengerId to identify a passenger.
dataset = pd.concat([df_train, df_test]) # combined dataset
test_Ids = df_test['PassengerId'] # identifiers for test set (besides survived = NaN)

# Second, independent concatenation kept as an unmodified copy of the combined data
# (the feature-engineering cells add columns to `dataset` only).
dataset_original = pd.concat([df_train, df_test])

df_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Preview the combined (train + test) frame. Survived is shown as a float here, unlike in
# df_train, because the test rows carry no label (NaN) once both frames are concatenated.
dataset_original.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Data Dictionary

A few notes below about the meaning of the features in the raw dataset:

Survived: 0 = No (deceased), 1 = Yes (survived).
Pclass: Passenger ticket class; 1 = 1st (upper class), 2 = 2nd (middle class), 3 = 3rd (lower class).
SibSp: Passenger's total number of siblings (including step-siblings) and spouses (legal) aboard the Titanic.
Parch: Passenger's total number of parents or children (including stepchildren) aboard the Titanic.
Embarked: Port of Embarkation, where C = Cherbourg, Q = Queenstown, S = Southampton.
Age: Ages under 1 are given as fractions; if the age is estimated, it is in the form of xx.5.

In [4]:
# Analysing data completeness

def _percent_missing(frame):
    """Return, for every column of `frame`, the percentage of rows that are null."""
    return frame.isnull().sum() / frame.shape[0] * 100

# Size of each frame
print('Traning Set Dataframe Shape: ', df_train.shape)
print('Test Set Dataframe Shape: ', df_test.shape)
print('\nTotal numver of entries in our dataset: ', len(dataset))

# Share of missing values per column, for each split and for the combined data
percent_missing_train = _percent_missing(df_train)
percent_missing_test = _percent_missing(df_test)
percent_missing_total = _percent_missing(dataset)

# One row per column name, one column per data split
missing_value_df = pd.DataFrame({
    'percent_missing_train': percent_missing_train,
    'percent_missing_test': percent_missing_test,
    'percent_missing_total': percent_missing_total,
})

missing_value_df



Traning Set Dataframe Shape:  (891, 12)
Test Set Dataframe Shape:  (418, 11)

Total numver of entries in our dataset:  1309


Unnamed: 0,percent_missing_train,percent_missing_test,percent_missing_total
Age,19.86532,20.574163,20.091673
Cabin,77.104377,78.229665,77.463713
Embarked,0.224467,0.0,0.152788
Fare,0.0,0.239234,0.076394
Name,0.0,0.0,0.0
Parch,0.0,0.0,0.0
PassengerId,0.0,0.0,0.0
Pclass,0.0,0.0,0.0
Sex,0.0,0.0,0.0
SibSp,0.0,0.0,0.0


In [5]:
# Browse the training passengers alphabetically by Name. Names start with the surname, so
# passengers sharing a surname end up on adjacent rows.
df_train.sort_values('Name')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
845,846,0,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.5500,,S
746,747,0,3,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.2500,,S
279,280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.2500,,S
308,309,0,2,"Abelson, Mr. Samuel",male,30.0,1,0,P/PP 3381,24.0000,,C
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0000,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
286,287,1,3,"de Mulder, Mr. Theodore",male,30.0,0,0,345774,9.5000,,S
282,283,0,3,"de Pelsmaeker, Mr. Alfons",male,16.0,0,0,345778,9.5000,,S
361,362,0,2,"del Carlo, Mr. Sebastiano",male,29.0,1,0,SC/PARIS 2167,27.7208,,C
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5000,,S


In [6]:
# Feature Engineering

# FamilySize: the passenger themselves plus all listed relatives aboard
dataset['FamilySize'] = 1 + dataset['SibSp'] + dataset['Parch']

# Names follow the pattern "Surname, Title. Given names". Splitting once at ',' isolates
# the surname (piece 0); splitting the remainder (piece 1) at '.' isolates the title.
name_pieces = dataset['Name'].str.split(',', expand=True)
dataset['Surname'] = name_pieces[0]
# NOTE(review): the text after the comma starts with a space, so Title values presumably
# keep a leading blank (' Mr') - confirm, and strip if later cells compare against 'Mr'.
dataset['Title'] = name_pieces[1].str.split('.', expand=True)[0]

# IsChild: 1 for passengers younger than 16, 0 otherwise (a missing Age also yields 0)
younger_than_16 = dataset['Age'] < 16
dataset['IsChild'] = np.where(younger_than_16, 1, 0)

# We can save this for handling or viewing with external software
# dataset.to_csv('C:\\Users\\PC-Home\\OneDrive\\Documentos\\Projetos\\Kaggle - Titanic\\combined_newvars_v1.csv')

# Show the source columns next to the derived ones to check the new definitions
preview_columns = ['Name', 'Surname', 'Title', 'SibSp', 'Parch', 'FamilySize', 'Age', 'IsChild']
dataset[preview_columns].head(10)

Unnamed: 0,Name,Surname,Title,SibSp,Parch,FamilySize,Age,IsChild
0,"Braund, Mr. Owen Harris",Braund,Mr,1,0,2,22.0,0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Cumings,Mrs,1,0,2,38.0,0
2,"Heikkinen, Miss. Laina",Heikkinen,Miss,0,0,1,26.0,0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Futrelle,Mrs,1,0,2,35.0,0
4,"Allen, Mr. William Henry",Allen,Mr,0,0,1,35.0,0
5,"Moran, Mr. James",Moran,Mr,0,0,1,,0
6,"McCarthy, Mr. Timothy J",McCarthy,Mr,0,0,1,54.0,0
7,"Palsson, Master. Gosta Leonard",Palsson,Master,3,1,5,2.0,1
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",Johnson,Mrs,0,2,3,27.0,0
9,"Nasser, Mrs. Nicholas (Adele Achem)",Nasser,Mrs,1,0,2,14.0,1


In [11]:
# Grouping families and travellers

def _group_type(num_names, group_size):
    """Label a ticket group from its number of distinct surnames and its head count."""
    if group_size <= 1:
        return 'IsAlone'
    if num_names == 1:
        return 'Family'      # everyone on the ticket shares one surname
    if num_names == group_size:
        return 'NonFamily'   # every surname on the ticket is different
    return 'Mixed'           # some surnames repeat, some do not


# Per-ticket lookup tables for GroupID, GroupType, GroupSize, GroupNumSurvived and GroupNumPerished
ticket_to_group_id = {}
ticket_to_group_type = {}
ticket_to_group_size = {}
ticket_to_group_num_survived = {}
ticket_to_group_num_perished = {}

# Passengers sharing a ticket form one group; ids are handed out from 1 in groupby order
for group_id, (ticket, group) in enumerate(dataset.groupby('Ticket'), start=1):
    surnames = group['Surname'].values
    outcomes = group['Survived']

    ticket_to_group_id[ticket] = group_id
    ticket_to_group_size[ticket] = len(surnames)
    # set() keeps only the unique surnames
    ticket_to_group_type[ticket] = _group_type(len(set(surnames)), len(surnames))
    # Only rows with a known outcome are counted; rows with Survived = NaN match neither test
    ticket_to_group_num_survived[ticket] = outcomes[outcomes == 1].count()
    ticket_to_group_num_perished[ticket] = outcomes[outcomes == 0].count()

# Turn each lookup table into a column by mapping it over the passenger's ticket
ticket_lookups = {
    'GroupID': ticket_to_group_id,
    'GroupSize': ticket_to_group_size,
    'GroupType': ticket_to_group_type,
    'GroupNumSurvived': ticket_to_group_num_survived,
    'GroupNumPerished': ticket_to_group_num_perished,
}
for column, lookup in ticket_lookups.items():
    dataset[column] = dataset['Ticket'].map(lookup)

# (survived - perished) / size lies in [-1, 1]; shift and halve it to land in [0, 1].
# A group without any known outcome therefore scores 0.5.
net_survivors = dataset['GroupNumSurvived'] - dataset['GroupNumPerished']
dataset['GroupSurvivalIndex'] = (net_survivors / dataset['GroupSize'] + 1) / 2


# Let's print the first 4 group entries to check that our grouping was successful
break_point = 4
feature_list = ['Surname', 'SibSp', 'Parch', 'FamilySize', 'Ticket', 'GroupID', 'GroupType', 'GroupSize', 'Cabin', 'Embarked']
print ('Printing Sample Data Entries to Verify Grouping:\n')
for ticket, group in itertools.islice(dataset.groupby('Ticket'), break_point):
    print ('\n', group[feature_list])

# Let's also check that GroupNumSurvived and GroupNumPerished were created accurately
feature_list = ['GroupID', 'GroupSize', 'Survived', 'GroupNumSurvived', 'GroupNumPerished', 'GroupSurvivalIndex']
dataset[feature_list].sort_values(by=['GroupID']).head(15)


Printing Sample Data Entries to Verify Grouping:


     Surname  SibSp  Parch  FamilySize  Ticket  GroupID  GroupType  GroupSize  \
257  Cherry      0      0           1  110152        1  NonFamily          3   
504  Maioni      0      0           1  110152        1  NonFamily          3   
759  Rothes      0      0           1  110152        1  NonFamily          3   

    Cabin Embarked  
257   B77        S  
504   B79        S  
759   B77        S  

      Surname  SibSp  Parch  FamilySize  Ticket  GroupID GroupType  GroupSize  \
262  Taussig      1      1           3  110413        2    Family          3   
558  Taussig      1      1           3  110413        2    Family          3   
585  Taussig      0      2           3  110413        2    Family          3   

    Cabin Embarked  
262   E67        S  
558   E67        S  
585   E68        S  

       Surname  SibSp  Parch  FamilySize  Ticket  GroupID  GroupType  \
110    Porter      0      0           1  110465        3  NonFa

Unnamed: 0,GroupID,GroupSize,Survived,GroupNumSurvived,GroupNumPerished,GroupSurvivalIndex
504,1,3,1.0,3,0,1.0
257,1,3,1.0,3,0,1.0
759,1,3,1.0,3,0,1.0
585,2,3,1.0,2,1,0.666667
262,2,3,0.0,2,1,0.666667
558,2,3,1.0,2,1,0.666667
110,3,2,0.0,0,2,0.0
475,3,2,0.0,0,2,0.0
335,4,1,,0,0,0.5
158,5,1,,0,0,0.5


In [12]:
# Checking for inconsistences

# Passengers who list no relatives (FamilySize = 1) although their ticket group was labelled 'Family'
lists_no_relatives = dataset['FamilySize'] == 1
labelled_family = dataset['GroupType'] == 'Family'
data_reduced = dataset[lists_no_relatives & labelled_family]

# nri = 'NumRelatives inconsistency'
nri_passenger_ids = data_reduced['PassengerId'].values
nri_unique_surnames = set(data_reduced['Surname'].values)

# Report how often this happens
print ('Number of nri Passengers: ', len(nri_passenger_ids))
print ('Numeber of Unique nri Surnames: ', len(nri_unique_surnames))

# There are only a handful of occurrences, so list every one of them, ordered by name
data_reduced = data_reduced.sort_values('Name')
nri_columns = ['Name', 'Ticket', 'Fare', 'Pclass', 'Parch', 'SibSp', 'GroupID', 'GroupSize', 'GroupType']
data_reduced[nri_columns].head(len(nri_passenger_ids))


Number of nri Passengers:  7
Numeber of Unique nri Surnames:  4


Unnamed: 0,Name,Ticket,Fare,Pclass,Parch,SibSp,GroupID,GroupSize,GroupType
83,"Carrau, Mr. Francisco M",113059,47.1,1,0,0,36,2,Family
403,"Carrau, Mr. Jose Pedro",113059,47.1,1,0,0,36,2,Family
538,"Risien, Mr. Samuel Beard",364498,14.5,3,0,0,588,2,Family
382,"Risien, Mrs. Samuel (Emma)",364498,14.5,3,0,0,588,2,Family
362,"Ware, Mrs. John James (Florence Louise Long)",CA 31352,21.0,2,0,0,777,2,Family
120,"Watt, Miss. Bertha J",C.A. 33595,15.75,2,0,0,765,2,Family
161,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Mi...",C.A. 33595,15.75,2,0,0,765,2,Family


In [13]:
# Check for cases where FamilySize > 1 but GroupType = NonFamily
# (passengers who list relatives aboard, yet share their ticket only with other surnames)
has_listed_relatives = dataset['FamilySize'] > 1
labelled_non_family = dataset['GroupType'] == 'NonFamily'
data_reduced = dataset[has_listed_relatives & labelled_non_family]

# ngwr = 'not group with relatives'
ngwr_passenger_ids = data_reduced['PassengerId'].values
ngwr_unique_surnames = set(data_reduced['Surname'].values)

# How many occurences?
print ('Number of ngwr Passengers: ', len(ngwr_passenger_ids))
print ('Numeber of Unique ngwr Surnames: ', len(ngwr_unique_surnames))

# Show every flagged passenger, ordered by group. The row count is taken from the number
# of flagged passengers rather than the number of unique surnames: two flagged passengers
# from different groups may share a surname, which would otherwise cut the listing short.
feature_list = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Pclass', 'Parch', 'SibSp', 'GroupID', 'GroupSize', 'GroupType']
data_reduced[feature_list].sort_values('GroupID').head(len(ngwr_passenger_ids))


Number of ngwr Passengers:  17
Numeber of Unique ngwr Surnames:  17


Unnamed: 0,PassengerId,Name,Ticket,Fare,Pclass,Parch,SibSp,GroupID,GroupSize,GroupType
166,167,"Chibnall, Mrs. (Edith Martha Bowerman)",113505,55.0,1,1,0,39,2,NonFamily
356,357,"Bowerman, Miss. Elsie Edith",113505,55.0,1,1,0,39,2,NonFamily
879,880,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",11767,83.1583,1,1,0,76,3,NonFamily
150,1042,"Earnshaw, Mrs. Boulton (Olive Potter)",11767,83.1583,1,1,0,76,3,NonFamily
571,572,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",11769,51.4792,1,0,2,77,2,NonFamily
356,1248,"Brown, Mrs. John Murray (Caroline Lane Lamson)",11769,51.4792,1,0,2,77,2,NonFamily
34,926,"Mock, Mr. Philipp Edmund",13236,57.75,1,0,1,92,2,NonFamily
122,1014,"Schabert, Mrs. Paul (Emma Mock)",13236,57.75,1,0,1,92,2,NonFamily
275,276,"Andrews, Miss. Kornelia Theodosia",13502,77.9583,1,0,1,93,3,NonFamily
765,766,"Hogeboom, Mrs. John C (Anna Andrews)",13502,77.9583,1,0,1,93,3,NonFamily


In [14]:
# Manually correcting some mislabeled group types
# note: if group size is greater than the number of listed names above, we assign to Mixed
# Both lists hold PassengerId values (not row-index labels).
passenger_ids_toFamily = [167, 357, 572, 1248, 926, 1014, 260, 881, 592, 497]
# 276 is the PassengerId of "Andrews, Miss. Kornelia Theodosia"; the value 275 used
# previously is her row-index label and belongs to a different passenger.
passenger_ids_toMixed = [880, 1042, 276, 766]

# Assign through a single .loc call. Chained indexing (dataset['GroupType'][mask] = ...)
# may write to a temporary copy and is a silent no-op under pandas Copy-on-Write.
# The masks are passed as plain boolean arrays so the assignment is purely positional,
# which matters because `dataset` has repeated index labels.
to_family = dataset['PassengerId'].isin(passenger_ids_toFamily).values
to_mixed = dataset['PassengerId'].isin(passenger_ids_toMixed).values
dataset.loc[to_family, 'GroupType'] = 'Family'
dataset.loc[to_mixed, 'GroupType'] = 'Mixed'

# for verification:

# feature_list = ['PassengerId', 'Name', 'GroupID', 'GroupSize', 'GroupType']
# dataset[feature_list][dataset['PassengerId'].isin(passenger_ids_toFamily)].sort_values('GroupID').head(len(passenger_ids_toFamily))
# dataset[feature_list][dataset['PassengerId'].isin(passenger_ids_toMixed)].sort_values('GroupID').head(len(passenger_ids_toMixed))




In [16]:
# LargeGroup: 1 when the passenger's ticket group has more than 4 members, 0 otherwise
dataset['LargeGroup'] = np.where(dataset['GroupSize'] > 4, 1, 0)

In [17]:
# creation of Age bins

# pd.cut uses right-closed intervals by default, so the bins are
# (0, 15], (15, 30], (30, 40], (40, 59] and (59, 90]; a missing Age, or one outside
# (0, 90], gets a missing AgeBin.
# NOTE(review): an age of 15.5 falls in '16-30' here while IsChild (Age < 16) is 1 for
# the same passenger - confirm that this difference is intended.
bin_thresholds = [0, 15, 30, 40, 59, 90]
bin_labels = ['0-15', '16-30', '31-40', '41-59', '60+']
dataset['AgeBin'] = pd.cut(dataset['Age'], bins=bin_thresholds, labels=bin_labels)

In [40]:
# split the fare based on GroupSize; express as fare-per-passenger on a shared ticket

# Vectorised column division: it yields the same values as a row-wise
# dataset.apply(..., axis=1) but avoids calling a Python function once per row.
dataset['SplitFare'] = dataset['Fare'] / dataset['GroupSize']

# Map to log10 scale; the +1 offset maps a fare of 0 to 0 instead of -inf
dataset['log10Fare'] = np.log10(dataset['Fare'].values + 1)
dataset['log10SplitFare'] = np.log10(dataset['SplitFare'].values + 1)

# Verify the new feature definitions (only the last expression of a cell is displayed,
# so all new columns are checked in this single preview)
feature_list = ['GroupSize', 'Fare', 'SplitFare', 'log10Fare', 'log10SplitFare']
dataset[feature_list].head()


Unnamed: 0,GroupSize,Fare,SplitFare,log10Fare,log10SplitFare
0,1,7.25,7.25,0.916454,0.916454
1,2,71.2833,35.64165,1.859038,1.563975
2,1,7.925,7.925,0.950608,0.950608
3,2,53.1,26.55,1.733197,1.440122
4,1,8.05,8.05,0.956649,0.956649
