In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# reading the data 
training = pd.read_csv('/kaggle/input/titanic/train.csv')
test  = pd.read_csv('/kaggle/input/titanic/test.csv')
training['train_test'] = 1
test['train_test'] = 0
test['survived'] = np.NaN
all_data = pd.concat([training, test])

%matplotlib inline
all_data.columns

# Project planning

In [None]:
# Steps to be taken for analysis
# Histograms, boxplot
# Value counts
# Missing data
# correlation between metrics
# Explore interesting themes
  # survival due to wealth
  # or due to the location
  # Age scatterplot with ticket price
  # Young and wealthy variable
  # Total spent
# Feature engineering
# Preprocess data or use a transformer
    # Use label for train and test
# Scaling?
# Model baseline
# Model comparison with CV

# Light data exploration


## a) For numerical variables
 ###  * Make histograms to understand distributions
 ### * Correlation plots
 ### * Pivot table comparing survival rate across numeric variables

## b) For categorical variables
### - Make bar chart to understand balance of classes
### - Make pivot table to understand relationship with survival

In [None]:
# Quick look at each column's data type and non-null count in the training set.
# `training` is the frame read from train.csv earlier in the notebook.
training.info()

In [None]:
# Summary statistics of the training set via DataFrame.describe(), to get a
# feel for the central tendency and spread of each column it reports on.
training.describe()

In [None]:
# Build two views of the training frame: one with the numeric columns, one with
# the categorical columns (the label 'Survived' is kept with the categoricals).
numeric_columns = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_columns = ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']
df_num = training[numeric_columns]
df_cat = training[categorical_columns]

In [None]:
# One histogram per numeric column, each drawn in its own figure and titled
# with the column name.
import matplotlib.pyplot as plt
for column_name, column_values in df_num.items():
    plt.hist(column_values)
    plt.title(column_name)
    plt.show()

Only the age variable looks approximately normally distributed. We may want to normalize the other variables.

In [None]:
# Pairwise correlations between the numeric columns: printed as a table first,
# then drawn as a heatmap.
import seaborn as sns
numeric_corr = df_num.corr()
print(numeric_corr)
sns.heatmap(numeric_corr)

In [None]:
# Average of each numeric column ("Age", "SibSp", "Fare", "Parch") within the
# survivors and the non-survivors (pivot_table aggregates with the mean by default).
training.pivot_table(index='Survived', values=['Age', 'SibSp', 'Fare', 'Parch'])

In [None]:
# Bar chart of the value counts of every categorical column, one figure each.
for col in df_cat.columns:
    # Count once instead of twice per column.
    counts = df_cat[col].value_counts()
    # seaborn >= 0.12 only accepts x/y as keyword arguments; passing them
    # positionally raises a TypeError. Plain arrays are passed for y so no
    # index alignment between x and y can take place.
    sns.barplot(x=counts.index, y=counts.values).set_title(col)
    plt.show()

In [None]:
# Passenger counts by survival outcome, broken down by each categorical column
# of interest. 'Ticket' is only used as the column being counted.
for category in ('Pclass', 'Sex', 'Embarked'):
    print(pd.pivot_table(training, index='Survived', columns=category, values='Ticket', aggfunc='count'))
    print()

## Feature engineering
### 1. Cabin - Let's check whether holding a ticket that covers multiple cabins impacted survival
### 2. Do different ticket types impact the survival rate?
### 3. Is a person's title related to the survival rate?

In [None]:
# The bar plots above suggest the raw 'Ticket' and 'Cabin' columns carry a lot
# of noise, so features are derived from them instead.

def _count_cabins(cabin):
    """Return the number of space-separated cabin entries (0 when the value is missing)."""
    if pd.isna(cabin):
        return 0
    return len(cabin.split(' '))

# Number of cabins listed for each passenger.
training['Cabin_multiple'] = training.Cabin.apply(_count_cabins)

training['Cabin_multiple'].value_counts()

In [None]:
# Passenger counts by survival outcome for each number of cabins.
training.pivot_table(index='Survived', columns='Cabin_multiple', values='Ticket', aggfunc='count')

In [None]:
# Cabin category = first character of the Cabin value. A missing cabin is
# stringified to 'nan', so missing values form their own category 'n'.

def _cabin_letter(cabin):
    """Return the first character of the stringified cabin value."""
    return str(cabin)[0]

training['Cabin_adv'] = training['Cabin'].map(_cabin_letter)
training.head(5)

In [None]:
# Size of each cabin category, then passenger counts by survival outcome per
# category ('Name' is only used as the column being counted).
cabin_category_counts = training['Cabin_adv'].value_counts()
print(cabin_category_counts)
survival_by_cabin = training.pivot_table(index='Survived', columns='Cabin_adv', values='Name', aggfunc='count')
print(survival_by_cabin)
print()

In [None]:
# Display the raw Ticket column to see what the ticket strings look like
# before deriving features from them in the next cell.
training['Ticket']

In [None]:
# Split tickets into purely numeric ones and ones carrying a text prefix.

def _is_numeric_ticket(ticket):
    """Return 1 when the whole ticket string is numeric, otherwise 0."""
    return int(ticket.isnumeric())

def _ticket_prefix(ticket):
    """Return the ticket's prefix, lower-cased with '.' and '/' removed, or 0 when there is none.

    The prefix is everything before the last space-separated token.
    """
    prefix_parts = ticket.split(' ')[:-1]
    if not prefix_parts:
        return 0
    return ''.join(prefix_parts).replace('.', '').replace('/', '').lower()

training['numeric_ticket'] = training.Ticket.apply(_is_numeric_ticket)
training['non_numeric_ticket'] = training.Ticket.apply(_ticket_prefix)

In [None]:
# Show the first four rows to check the two ticket columns added in the previous cell.
training.head(4)

In [None]:
# Show every row of the output below. The fully-qualified option name is
# required: the bare pattern "max_rows" matches more than one pandas option
# (e.g. display.max_rows and styler.render.max_rows) and raises OptionError.
pd.set_option("display.max_rows", None)

# Frequency of each ticket prefix (0 means the ticket has no prefix).
training['non_numeric_ticket'].value_counts()


In [None]:
# Let's see if the ticket had an impact on the survival rate.
# Only the last expression of a cell is displayed, so the first table is
# printed explicitly; otherwise it is computed and silently thrown away.
print(pd.pivot_table(training, index = 'Survived', columns = 'numeric_ticket', values = 'Ticket', aggfunc = 'count'))
print()
pd.pivot_table(training, index='Survived', columns = 'non_numeric_ticket', values = 'Ticket', aggfunc = 'count')

In [None]:
# Now let's check the impact of a person's title (Mr., Mrs., Master, Miss) on their survival rate.
# Use the fully-qualified option name: the bare pattern "Max_rows" matches
# several pandas options and raises OptionError on recent pandas versions.
pd.set_option("display.max_rows", None)
training.Name.head(10)

In [None]:
# Extract the title from the name: the text between the first comma and the
# following period, e.g. "Braund, Mr. Owen Harris" -> "Mr".
training['title_name'] = training['Name'].apply(lambda x : x.split(',')[1].split('.')[0].strip())

# Print the title frequencies explicitly; as a bare expression in the middle of
# the cell the result was discarded and never shown.
print(training.title_name.value_counts())

# Passenger counts by survival outcome for each title.
pd.pivot_table(training, index ='Survived', columns = 'title_name', values ='Ticket', aggfunc = 'count' )

## Data preprocessing
### 1. drop null value
### 2. include relevant features
### 3. one-hot encode the data
### 4. imputing
### 5. scaling 
### 6. Standard scaling

In [None]:
# Number of rows in the test frame.
len(test)

In [None]:
# Rebuild the combined train+test frame and compute every engineered feature on
# it, so the test rows receive the same columns as the training rows.

def _cabin_count(cabin):
    """Number of space-separated cabin entries (0 when the value is missing)."""
    return 0 if pd.isna(cabin) else len(cabin.split(' '))

def _cabin_initial(cabin):
    """First character of the stringified cabin value ('n' for a missing cabin)."""
    return str(cabin)[0]

def _numeric_ticket_flag(ticket):
    """1 when the whole ticket string is numeric, otherwise 0."""
    return int(ticket.isnumeric())

def _ticket_text_prefix(ticket):
    """Ticket prefix, lower-cased with '.' and '/' removed, or 0 when there is none."""
    leading_tokens = ticket.split(' ')[:-1]
    if len(leading_tokens) == 0:
        return 0
    return ''.join(leading_tokens).replace('.', '').replace('/', '').lower()

def _title_of(name):
    """Text between the first comma and the following period of a passenger name."""
    return name.split(',')[1].split('.')[0].strip()

all_data = pd.concat([training, test])
all_data['Cabin_multiple'] = all_data['Cabin'].apply(_cabin_count)
all_data['Cabin_adv'] = all_data['Cabin'].apply(_cabin_initial)
all_data['numeric_ticket'] = all_data['Ticket'].apply(_numeric_ticket_flag)
all_data['non_numeric_ticket'] = all_data['Ticket'].apply(_ticket_text_prefix)
all_data['name_title'] = all_data['Name'].apply(_title_of)


In [None]:
# Impute missing values using statistics computed on the training set only.
# Age: mean of the training ages.
all_data.Age = all_data.Age.fillna(training.Age.mean())
# Fare: median of the training fares. This must be the median of the Fare
# column; `training.median()` is a median over the whole frame, which fails on
# the non-numeric columns in pandas 2.x and otherwise returns a Series that
# fillna would align on the index instead of filling with one value.
all_data.Fare = all_data.Fare.fillna(training.Fare.median())

In [None]:
# How many rows are missing 'Embarked'? Printed explicitly, because a bare
# expression that is not the last statement of a cell is never displayed.
print(all_data.Embarked.isnull().value_counts())

# Drop the rows without an embarkation port. Rebinding instead of mutating in
# place gives the same resulting frame.
all_data = all_data.dropna(subset=['Embarked'])



In [None]:
# Histogram of Fare over the combined train+test frame.
all_data.Fare.hist()

In [None]:
# Log-transform the fare (after adding 1, so a fare of 0 maps to 0) to pull its
# distribution closer to a normal one, then plot the transformed values.
shifted_fare = all_data['Fare'] + 1
all_data['norm_fare'] = np.log(shifted_fare)
all_data['norm_fare'].hist()

In [None]:
# Convert Pclass (not Fare) to strings so that pd.get_dummies() in the next
# cell treats the passenger class as a category and one-hot encodes it.
all_data.Pclass = all_data.Pclass.astype(str)

In [None]:
# Select the modelling columns and one-hot encode the categorical ones.
# 'train_test' is carried along so the frame can be split again afterwards.
model_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'norm_fare', 'Embarked',
    'Cabin_adv', 'Cabin_multiple', 'numeric_ticket', 'name_title', 'train_test',
]
all_dummies = pd.get_dummies(all_data[model_columns])
all_dummies.head(5)

In [None]:
# Standardise the continuous features on a copy, so the unscaled dummies in
# `all_dummies` stay available.
from sklearn.preprocessing import StandardScaler

continuous_features = ['Age', 'SibSp', 'Parch', 'norm_fare']
SS = StandardScaler()
all_dummies_scaled = all_dummies.copy()
all_dummies_scaled[continuous_features] = SS.fit_transform(all_dummies_scaled[continuous_features])


In [None]:
# Split the scaled frame back into training and test features using the
# 'train_test' flag (1 = training row, 0 = test row), dropping the flag itself.
X_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(['train_test'], axis = 1)
X_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(['train_test'], axis = 1)

# Training labels. Concatenating with the unlabeled test rows turned the
# 'Survived' column into floats (NaN needs a float dtype); cast the training
# labels back to int so the classifiers learn and predict integer classes 0/1.
y_train = all_data.loc[all_data.train_test == 1, 'Survived'].astype(int)
# y_train.shape

We will test the following models: Naive Bayes, logistic regression, decision tree, k-nearest neighbors, random forest, support vector classifier, extreme gradient boosting (XGBoost), and a voting classifier.


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [None]:
# Baseline model: Gaussian Naive Bayes, scored with 5-fold cross-validation.
# The mean of the fold scores is printed.
gnb = GaussianNB()
gnb_scores = cross_val_score(gnb, X_train_scaled, y_train, cv=5)
print(gnb_scores.mean())

In [None]:
# Logistic regression (iteration cap raised to 2000), 5-fold CV mean score.
lr = LogisticRegression(max_iter=2000)
lr_scores = cross_val_score(lr, X_train_scaled, y_train, cv=5)
print(lr_scores.mean())

In [None]:
# Decision tree with a fixed seed, 5-fold CV mean score.
dt = tree.DecisionTreeClassifier(random_state=1)
dt_scores = cross_val_score(dt, X_train_scaled, y_train, cv=5)
print(dt_scores.mean())

In [None]:
# K-nearest neighbors with default settings, 5-fold CV mean score.
knn = KNeighborsClassifier()
knn_scores = cross_val_score(knn, X_train_scaled, y_train, cv=5)
print(knn_scores.mean())

In [None]:
# Random forest, 5-fold CV mean score.
# The seed is fixed (as for the decision tree and XGBoost baselines) so the
# printed score is the same on every "Restart & Run All".
rf = RandomForestClassifier(random_state = 1)
cv = cross_val_score(rf, X_train_scaled, y_train, cv = 5)
print(cv.mean())

In [None]:
# Support vector classifier, 5-fold CV mean score. probability=True makes
# predict_proba available, which the soft-voting ensemble further down needs.
svc = SVC(probability=True)
svc_scores = cross_val_score(svc, X_train_scaled, y_train, cv=5)
print(svc_scores.mean())

In [None]:
# XGBoost classifier with a fixed seed, 5-fold CV mean score.
xgb = XGBClassifier(random_state=1)
xgb_scores = cross_val_score(xgb, X_train_scaled, y_train, cv=5)
print(xgb_scores.mean())

In [None]:
# Soft-voting ensemble over the baseline models defined in the cells above,
# scored with 5-fold cross-validation.
baseline_estimators = [
    ('lr', lr),
    ('knn', knn),
    ('rf', rf),
    ('gnb', gnb),
    ('svc', svc),
    ('xgb', xgb),
]
vc = VotingClassifier(estimators=baseline_estimators, voting='soft')
vc_scores = cross_val_score(vc, X_train_scaled, y_train, cv=5)
print(vc_scores.mean())

## Model tuning


In [None]:
# Hyper-parameter search utilities. The module is `sklearn.model_selection`;
# the previous spelling `model_selction` raised ModuleNotFoundError.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [48]:
# Performance reporting function

def clf_performance(classifier, model_name):
    """Print the best score and best parameters of a fitted search object.

    Parameters
    ----------
    classifier : object
        Anything exposing ``best_score_`` and ``best_params_`` attributes.
    model_name : str
        Label printed on the first line of the report.

    Returns
    -------
    None
        The report is written to standard output.
    """
    print(model_name)
    # Attributes are read one at a time, after the label has been printed.
    for prefix, attribute in (('best_score :', 'best_score_'),
                              ('Best Parameters :', 'best_params_')):
        print(prefix + str(getattr(classifier, attribute)))
    

In [50]:
# Grid search for logistic regression: L1 vs. L2 penalty over 20
# log-spaced values of C, using the liblinear solver.
lr = LogisticRegression()
param_grid = dict(
    max_iter=[2000],
    penalty=['l1', 'l2'],
    C=np.logspace(-4, 4, 20),
    solver=['liblinear'],
)
clf_lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_lr = clf_lr.fit(X_train_scaled, y_train)
clf_performance(best_clf_lr, 'Logistic Regression')

In [52]:
# Grid search for k-nearest neighbors over neighbourhood size, weighting
# scheme, search algorithm and Minkowski power (p=1 or p=2).
knn = KNeighborsClassifier()
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree'],
    p=[1, 2],
)
clf_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_knn = clf_knn.fit(X_train_scaled, y_train)
clf_performance(best_clf_knn, 'K-nearest neighbors')

In [81]:
# Support vector machine
# NOTE(review): the SVC grid search below is disabled by wrapping it in a
# string literal, so nothing in it executes and `best_clf_svc` is never
# defined; the cell only evaluates to the string itself. The matching
# `best_svc` line further down the notebook is commented out as well.

"""svc = SVC(probability = True)
param_grid = [{'kernel' : ['rbf'], 'gamma':[.1, .5, 1, 2, 5, 10], 
              'C': [.1,1,10]},
             {'kernel' : ['linear'], 'gamma':[.1,.5, 1, 2, 5, 10], 'C':[.1, 1, 10]},
             {'kernel': ['poly'], 'degree': [2,3,4,5], 'C': [.1,1,10]}]
clf_svc =  GridSearchCV(svc, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_svc = clf_svc.fit(X_train_scaled, y_train)
clf_performance(best_clf_svc, 'SVM')

"""

In [62]:
# The random-forest search space is large, so a randomized search is used first
# to narrow down the promising region before running the exhaustive grid search.
from sklearn.model_selection import RandomizedSearchCV

rf =  RandomForestClassifier(random_state = 1)
param_distributions =  {
              # Was [100, 500, 100]: the duplicated 100 only repeated an existing candidate.
              'n_estimators': [100, 500, 1000],
              'bootstrap' : [True, False],
              'max_depth' : [3,5,10,20, 50, 75, 100, None],
              # 'auto' was removed in scikit-learn 1.3 (for classifiers it was an
              # alias of 'sqrt'), so two distinct, still-supported values are searched.
              'max_features': ['sqrt', 'log2'],
              'min_samples_leaf': [1,2,4,10],
              'min_samples_split' : [2, 5, 10]}
# random_state makes the sampled candidates, and therefore the reported best
# parameters, reproducible between runs.
clf_rf_rnd = RandomizedSearchCV(rf, param_distributions =  param_distributions, cv = 5, verbose = True, n_jobs = -1, random_state = 1)
best_clf_rf_rnd = clf_rf_rnd.fit(X_train_scaled, y_train)
clf_performance(best_clf_rf_rnd, 'Random Forest')

In [60]:
# Show the parameters of `rf`.
# NOTE(review): assuming the cells run top to bottom, `rf` here is the base
# estimator created in the randomized-search cell, not the tuned model (that
# one would be `best_clf_rf_rnd.best_estimator_`) - confirm this is intended.
rf.get_params()

In [63]:
# Exhaustive grid search in the neighbourhood suggested by the randomized search.

rf = RandomForestClassifier(random_state = 1)
param_grid = {'n_estimators': [400, 450, 500],
             'criterion': ['gini', 'entropy'],
             'bootstrap' : [True],
             'max_depth' : [15, 20, 25],
             # 'auto' was removed in scikit-learn 1.3 and, for classifiers, was an
             # alias of 'sqrt'; dropping it removes duplicate fits without
             # changing which models are explored.
             'max_features' : ['sqrt', 10],
             'min_samples_leaf' : [2,3],
             'min_samples_split' : [2,3]}
clf_rf = GridSearchCV(rf, param_grid =  param_grid, cv =5, verbose =  True, n_jobs = -1)
best_clf_rf = clf_rf.fit(X_train_scaled, y_train)
clf_performance(best_clf_rf, 'Random Forest')

In [66]:
# Fit the best random forest on the full training set and plot its 20 most
# important features as a horizontal bar chart.
best_rf = best_clf_rf.best_estimator_
best_rf.fit(X_train_scaled, y_train)
feature_importances = pd.Series(best_rf.feature_importances_, index=X_train_scaled.columns)
top_feature_importances = feature_importances.nlargest(20)
top_feature_importances.plot.barh()

In [None]:
# Let's check with XGBoost
# NOTE(review): this wide XGBoost grid search is disabled by wrapping it in a
# string literal, so nothing in it executes; the cell only evaluates to the
# string itself. A narrower grid is searched in the following cell.
"""xgb = XGBClassifier(random_state = 1)

param_grid= {'n_estimators' :[20, 50, 100],
            'colsample_bytree' : [0.2, .5, .7, .8, 1],
            'max_depth': [2, 5, 10, 15],
            'reg_alpha': [0, .5, 1],
            'reg_lambda' : [1, 1.5, 2],
            'subsample' : [0.5, .6,.8],
            'learning_rate': [0.01, 0.1, .8],
            'gamma' :[.01, .1, 10],
            'min_child_weight': [0, .01, .1, 1, 10],
            'sampling_method': ['uniform', 'gradient_based']}
clf_xgb = GridSearchCV(xgb, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_xgb = clf_xgb.fit(X_train_scaled, y_train)
clf_performance(best_clf_xgb, 'XGBoot')

"""

In [70]:
# Grid search for XGBoost over a narrowed parameter grid.
xgb = XGBClassifier(random_state = 1)

param_grid= {'n_estimators' :[400, 450, 500],
            'colsample_bytree' : [0.2, .5, .7, .8],
            'max_depth': [None],
            'reg_alpha': [1],
            'reg_lambda' : [2,5,10],
            'subsample' : [0.55, .6,.65],
            'learning_rate': [0.5],
            'gamma' :[.5, 1, 2],
            'min_child_weight': [.01],
            'sampling_method': ['uniform']
            }
clf_xgb = GridSearchCV(xgb, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_xgb = clf_xgb.fit(X_train_scaled, y_train)
# Report label corrected from the misspelled 'XGBoot'.
clf_performance(best_clf_xgb, 'XGBoost')

In [77]:
# Predict the test set with the tuned XGBoost model and write the submission file.
# The predictions are cast to int so the 'Survived' column is written as 0/1:
# when the training labels are floats, some estimators return 0.0/1.0 instead.
y_hat_xgb = best_clf_xgb.best_estimator_.predict(X_test_scaled).astype(int)
xbg_submission = {'PassengerId': test.PassengerId, 'Survived': y_hat_xgb}
submission_xgb = pd.DataFrame(data = xbg_submission)
submission_xgb.to_csv('submission_xgb1.csv', index = False)

# Read the file back as a sanity check of what was written.
table = pd.read_csv('submission_xgb1.csv')
table.head(10)

In [84]:
# Build voting ensembles from the tuned models and compare them with 5-fold CV.

best_lr = best_clf_lr.best_estimator_
best_knn = best_clf_knn.best_estimator_
#best_svc = best_clf_svc.best_estimator_
best_rf = best_clf_rf.best_estimator_
best_xgb = best_clf_xgb.best_estimator_

# Hard voting takes the majority class; soft voting averages predicted probabilities.
voting_clf_hard = VotingClassifier(estimators = [('knn', best_knn), ('rf', best_rf), ('lr', best_lr)],  voting = 'hard')
voting_clf_soft = VotingClassifier(estimators = [('knn', best_knn), ('rf', best_rf), ('lr', best_lr)],  voting = 'soft')
voting_clf_soft_xgb = VotingClassifier(estimators = [('knn', best_knn), ('rf', best_rf), ('lr', best_lr), ('xbg', best_xgb)], voting = 'soft')

# Each ensemble is cross-validated once and the fold scores are reused for the
# mean; previously every cross-validation was run twice. The printed labels
# are kept exactly as they were.
for ensemble, scores_label, mean_label in (
    (voting_clf_hard, 'voting_clf_hard :', 'voting_clf_hard mean :'),
    (voting_clf_soft, 'voting_clf_soft:', 'voting_clf_soft mean :'),
    (voting_clf_soft_xgb, 'voting_clf_soft_xgb: ', 'voting_clf_soft_xgb mean:'),
):
    fold_scores = cross_val_score(ensemble, X_train_scaled, y_train, cv = 5)
    print(scores_label, fold_scores)
    print(mean_label, fold_scores.mean())
    print()


In [90]:
# Using the soft voting classifier, let's see the impact of the weights given to the individual models.
# NOTE(review): the weight search below is disabled by wrapping it in a string
# literal, so nothing in it executes; `best_clf_weight` and `voting_clf_sub`
# are therefore never defined by this cell.
"""
params = {'weights' : [[1,1,1], [1,2,1], [1,1,2], [2,1,1]]}
vote_weights = GridSearchCV(voting_clf_soft, param_grid =  params,  cv =5, verbose = True, n_jobs = -1)
best_clf_weight =  vote_weights.fit(X_train_scaled, y_train)
clf_performance(best_clf_weight, 'VC weights')
voting_clf_sub = best_clf_weight.best_estimator_.predict(X_test_scaled)

"""

In [94]:
# Let's make the predictions

voting_clf_hard.fit(X_train_scaled, y_train)
voting_clf_soft.fit(X_train_scaled, y_train)
voting_clf_soft_xgb.fit(X_train_scaled, y_train)
best_rf.fit(X_train_scaled, y_train)


y_hat_vc_hard = voting_clf_hard.predict(X_test_scaled)