###Welcome

In this project we will be predicting which passengers survived the Titanic

The dataset used will be the same used in the Kaggle competition for the Titanic Survival

In [1]:
#Let's first import the libraries we will use

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
#We we read in dataset
titanic_df = pd.read_csv('titanic.csv')

In [3]:
#Let's take a look at the columns
print(titanic_df.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [4]:
#Now let's take a look at how the training set looks
titanic_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
#We want to replace the males for 0 and females for 1 in order to be able to run a logistic regression algorithm
titanic_df['Sex'] = titanic_df['Sex'].map({'male': 0, 'female': 1})

In [6]:
#Let's see if that worked
titanic_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",0,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,C


In [7]:
#We can see that there are some missing values for the variable Age. We will replace all of these values with the mean and round them
titanic_df['Age'].fillna(inplace = True, value = round(titanic_df['Age'].mean()))
titanic_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",0,30.0,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,C


Great, we will now create a new column for Pclass, meaning the class that passengers were in the Titanic. This new column will be 1 for passengers in first class and 0 for all other passengers, we do this in order to obtain binary values for the existing Pclass column

In [8]:
titanic_df['FirstClass'] = titanic_df['Pclass'].apply(lambda p: 1 if p == 1 else 0)

In [9]:
#Let's check if that worked
titanic_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0
5,6,0,3,"Moran, Mr. James",0,30.0,0,0,330877,8.4583,,Q,0
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,S,1
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,,S,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,S,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,C,0


We will now do the same for passengers in 2 class

In [10]:
titanic_df['SecondClass'] = titanic_df['Pclass'].apply(lambda p: 1 if p == 2 else 0)

In [11]:
titanic_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0
5,6,0,3,"Moran, Mr. James",0,30.0,0,0,330877,8.4583,,Q,0,0
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,S,1,0
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,,S,0,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,S,0,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,C,0,1


Great, let's now select the variables we'll use to build our model

In [12]:
features = titanic_df[['Sex', 'Age', 'FirstClass', 'SecondClass']]
survival = titanic_df['Survived']

In [13]:
#We will now split the data
train_features, test_features, train_labels, test_labels = train_test_split(features, survival)

In [14]:
#In order to run a logistic regression algorithm we must first scale the feature data. We do this by simply calling in the StandardScaler library from Scikit Learn and applying it to to the features and survival variables we previously created
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.fit_transform(test_features)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Excellent, we will now create a Logistic Regression Model which will find the coefficients of the features that minimize the log loss for our training data, performing gradient descent

In [15]:
logit_regr = LogisticRegression()
logit_regr.fit(train_features, train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
#Print the training score
print(logit_regr.score(train_features, train_labels))

0.8008982035928144


###Great we achieved an 80.00% Accuracy

In [18]:
print(logit_regr.score(test_features, test_labels))

0.757847533632287


And a 78.47% for the test score

Ok let's now see which feature is most important in predicting survival of the Titanic, we do this by printing the feature coefficients. We want to see how significant each variable was in predicting the survival

In [19]:
print(logit_regr.coef_)
#In order these are Sex, Age, FirstClass and SecondClass

[[ 1.3464429  -0.43817329  1.14705273  0.55747746]]


The most significant in determining whether someone survived the Titanic were the Sex and FirstClass features

###Time to play a little bit with our model, lets predict the survival of three different individuals
We will choose the famous Jack and Rose and someone else randomly (we will input the values for this third person)

In [85]:
#Again the values are: Male = 0 and Female = 1
#The second value is age
#The third value is whether the passenger was in first class 1 = Yes and 0 = No
#The fourth value is whether the passenger was in second class 1 = Yes and 0 = No
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
Random_person = np.array([0.0,26.0,0.0,1.0])

In [86]:
passengers = np.array([Jack, Rose, Random_person])

In [87]:
#We have to scale the data like we did in a previous step in order for our model to run
passengers = scaler.transform(passengers)

In [88]:
print(logit_regr.predict(passengers))

[0 1 0]


Alright apparently Jack did not survive and Rose did indeed survived.
The third person did not survive, what if we swap this person with a female instead of a male?

In [95]:
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
Random_person2 = np.array([1.0,26.0,0.0,1.0])
passengers2 = scaler.transform(passengers2)
print(logit_regr.predict(passengers2))

[0 1 1]


Now the third random person survives.
Like we saw in the coefficients Sex is quite significant in predicting the survival of a person

###Finally let's print the probabilities of the first group of passengers surviving

In [96]:
print(logit_regr.predict_proba(passengers))

[[0.90901778 0.09098222]
 [0.03718105 0.96281895]
 [0.75147221 0.24852779]]


Jack has a 90.9% chance of not surviving and only a 9% chance of surviving

Rose has a 3.7% chance of not surviving and a 96% percent chance of surviving

The third random person has a 75% chance of not surviving and a 24.8% chance of surviving