In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Loading the passenger data


In [23]:
passengers = pd.read_csv('titanic_data.csv')


Given the saying, “women and children first,” Sex and Age seem like good features to predict survival. 

I'm going to update sex column to numerical where 'fimale' is replaced with 1 and 'male' with 0. 

In [24]:
passengers.Sex = passengers['Sex'].replace('female', '1', regex=True)

passengers.Sex = passengers['Sex'].replace('male', '0', regex=True)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [25]:
passengers.Sex = pd.to_numeric(passengers.Sex)

passengers.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Checking if there are any empty rows in Age and Sex columns

In [26]:
passengers['Age'].isnull().values.any()

True

In [27]:
passengers['Age'] = passengers['Age'].fillna(passengers.Age.mean())


In [28]:
passengers['Sex'].isnull().values.any()

False

Given the strict class system onboard the Titanic, I'm going to utilise the Pclass column to create a first and second class columns as another features 


In [29]:
passengers['FirstClass'] = passengers.Pclass.apply(lambda x: 1 if x ==1 else 0)

In [30]:
passengers['SecondClass'] = passengers.Pclass.apply(lambda x: 1 if x ==2 else 0)
passengers.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0


In [31]:
features = passengers[['Sex', 'Age', 'FirstClass','SecondClass']]
survival = passengers['Survived']

Spliting the data into the training and test sets

In [32]:

train_features, test_features,train_survival, test_survival = train_test_split(features,survival)

Since sklearn‘s Logistic Regression implementation uses Regularization, we need to scale our feature data.


In [33]:

scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)


Creating and training the model.
Fitting the model will perform gradient descent to find the feature coefficients that minimize the log-loss for the training data.

In [34]:

model = LogisticRegression()
model.fit(train_features, train_survival)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Scoring the model on the training data will run the data through the model and make final classifications on survival for each passenger in the training set. The score returned is the percentage of correct classifications, or the accuracy.


In [35]:
model.score(train_features, train_survival)





0.7964071856287425

the score of our training model is 0.79 which means that its accuracy is good.

Similarly, scoring the model on the testing data will run the data through the model and make final classifications on survival for each passenger in the test set.

In [36]:
model.score(test_features, test_survival)

0.757847533632287

We can see that the test model's accuracy equals to 0,79 which means that our model works quite well.

Let's print the feature coefficients determined by the model. 

In [37]:
list(zip(['Sex','Age','FirstClass','SecondClass'],model.coef_[0]))

[('Sex', 1.2823869045527696),
 ('Age', -0.2713178193783162),
 ('FirstClass', 0.94159089204765),
 ('SecondClass', 0.4287888120319868)]

We can see that sex and belonging to the first class are the most important features in predicting survival on the sinking of the Titanic

I'm going to use our model to make predictions on the survival of a few fateful passengers. 

The arrays below store 4 feature values, in the following order:

Sex, represented by a 0 for male and 1 for female

Age, represented as an integer in years

FirstClass, with a 1 indicating the passenger is in first class

SecondClass, with a 1 indicating the passenger is in second class

In [38]:

Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
Me = np.array([1.0,28.0,0.0,0.0])

Combining passenger arrays

In [39]:

sample_passengers = np.array([Jack, Rose,Me])


Since our Logistic Regression model was trained on scaled feature data,the feature data we are making predictions on also need to de scaled.I'm going to use the StandardScaler object created earlier to normalise the features data.

In [40]:

scaler.transform(sample_passengers)


array([[-0.74154163, -0.74526957, -0.57043565, -0.51492865],
       [ 1.34854195, -0.98053492,  1.75304613, -0.51492865],
       [ 1.34854195, -0.1178953 , -0.57043565, -0.51492865]])

Let's make survival predictions!

In [41]:

model.predict(sample_passengers)



array([0, 0, 0])

Looks like I would have had all the chances to survive that trip. 

But I want to see the probabilities that led to these predictions.
The 1st column is the probability of a passenger perishing on the Titanic, and the 2nd column is the probability of a passenger surviving the sinking 

In [42]:
model.predict_proba(sample_passengers)

array([[9.97674075e-01, 2.32592503e-03],
       [9.53619143e-01, 4.63808566e-02],
       [9.99041770e-01, 9.58229580e-04]])