In [34]:
# Data manipulation tool
import pandas as pd
# Scientific computing 
import numpy as np
# Visualization
import matplotlib.pyplot as plt
# ------------------------------------ Machine Learning 
# Logistic Regression model
from sklearn.linear_model import LogisticRegression
# Split data
from sklearn.model_selection import train_test_split
# Coeff
from sklearn.preprocessing import StandardScaler

In [35]:
passengers = pd.read_csv('passengers.csv')
passengers

Unnamed: 0,PassengerId,Survived,Pclass,Lname,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Braund,Mr. Owen Harris,male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,Cumings,Mrs. John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Heikkinen,Miss. Laina,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,Futrelle,Mrs. Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,Allen,Mr. William Henry,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,152,1,1,Pears,Mrs. Thomas (Edith Wearne),female,22.0,1,0,113776,66.6000,C2,S
152,153,0,3,Meo,Mr. Alfonzo,male,55.5,0,0,A.5. 11206,8.0500,,S
153,154,0,3,van Billiard,Mr. Austin Blyler,male,40.5,0,2,A/5. 851,14.5000,,S
154,155,0,3,Olsen,Mr. Ole Martin,male,,0,0,Fa 265302,7.3125,,S


In [36]:
# Update sex column to numerical
passengers.Sex = passengers.Sex.map({'male':0,'female':1})
passengers

Unnamed: 0,PassengerId,Survived,Pclass,Lname,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Braund,Mr. Owen Harris,0,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,Cumings,Mrs. John Bradley (Florence Briggs Thayer),1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Heikkinen,Miss. Laina,1,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,Futrelle,Mrs. Jacques Heath (Lily May Peel),1,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,Allen,Mr. William Henry,0,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,152,1,1,Pears,Mrs. Thomas (Edith Wearne),1,22.0,1,0,113776,66.6000,C2,S
152,153,0,3,Meo,Mr. Alfonzo,0,55.5,0,0,A.5. 11206,8.0500,,S
153,154,0,3,van Billiard,Mr. Austin Blyler,0,40.5,0,2,A/5. 851,14.5000,,S
154,155,0,3,Olsen,Mr. Ole Martin,0,,0,0,Fa 265302,7.3125,,S


In [39]:
passengers['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
151    22.0
152    55.5
153    40.5
154     NaN
155    51.0
Name: Age, Length: 156, dtype: float64

In [40]:
passengers.Age.fillna(value=passengers.Age.mean(),inplace=True)
passengers['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
151    22.000000
152    55.500000
153    40.500000
154    28.141508
155    51.000000
Name: Age, Length: 156, dtype: float64

In [42]:
passengers['Age'][150]

51.0

In [43]:
passengers['FirstClass']  = passengers['Pclass'].apply(lambda x: 1 if x == 1 else 0) 
passengers[['Pclass', 'Name', 'FirstClass']]

Unnamed: 0,Pclass,Name,FirstClass
0,3,Mr. Owen Harris,0
1,1,Mrs. John Bradley (Florence Briggs Thayer),1
2,3,Miss. Laina,0
3,1,Mrs. Jacques Heath (Lily May Peel),1
4,3,Mr. William Henry,0
...,...,...,...
151,1,Mrs. Thomas (Edith Wearne),1
152,3,Mr. Alfonzo,0
153,3,Mr. Austin Blyler,0
154,3,Mr. Ole Martin,0


In [44]:
passengers['SecondClass']  = passengers['Pclass'].apply(lambda x: 1 if x == 2 else 0) 
passengers[['Pclass', 'Name', 'FirstClass', 'SecondClass']]

Unnamed: 0,Pclass,Name,FirstClass,SecondClass
0,3,Mr. Owen Harris,0,0
1,1,Mrs. John Bradley (Florence Briggs Thayer),1,0
2,3,Miss. Laina,0,0
3,1,Mrs. Jacques Heath (Lily May Peel),1,0
4,3,Mr. William Henry,0,0
...,...,...,...,...
151,1,Mrs. Thomas (Edith Wearne),1,0
152,3,Mr. Alfonzo,0,0
153,3,Mr. Austin Blyler,0,0
154,3,Mr. Ole Martin,0,0


In [45]:
features = passengers[['Sex', 'Age', 'FirstClass', 'SecondClass']]
survival = passengers.Survived

In [46]:
# Perform train, test, split
features_train, features_test, labels_train,  labels_test = train_test_split(features,survival,test_size=0.25, random_state=42)

In [47]:
# Scale the feature data so it has mean = 0 and standard deviation = 1
scaler = StandardScaler()
norm_train_features = scaler.fit_transform(features_train)
norm_test_features = scaler.fit_transform(features_test)

In [48]:
# Create and train the model
model = LogisticRegression()
model.fit(norm_train_features , labels_train)

In [49]:
# Score the model on the train dataset
model.score(norm_train_features , labels_train)

0.8376068376068376

In [50]:
# Score the model on the test dataset
model.score(norm_test_features , labels_test)

0.717948717948718

In [51]:
model.coef_

array([[1.4076292 , 0.01020077, 0.19868511, 0.13076109]])

In [52]:
list(zip(['Sex','Age','FirstClass','SecondClass'],model.coef_[0]))

[('Sex', 1.4076291966481407),
 ('Age', 0.01020076830492815),
 ('FirstClass', 0.19868510997050318),
 ('SecondClass', 0.1307610897905488)]

In [53]:
# Sample passenger features
# Male, 20 years old, No-first class, No-Second class
Jack = np.array([0.0,20.0,0.0,0.0])
# Female, 17 years old, Yes-first class, No-Second class
Rose = np.array([1.0,17.0,1.0,0.0])
# Female, 49 years old, no-first class, No-Second class
John_Doe = np.array([1.0,49.0,0.0,0.0])

In [54]:
# Combine passenger arrays
sample_passengers = np.array([Jack , Rose, John_Doe])
# Scale the sample passenger features
sample_passengers = scaler.transform(sample_passengers)
# Make survival predictions!
print(model.predict(sample_passengers))

[0 1 1]




In [55]:
# Probability 
prob = model.predict_proba(sample_passengers)
print(prob)

[[0.91490653 0.08509347]
 [0.26655386 0.73344614]
 [0.3813008  0.6186992 ]]


In [57]:
prob_df = pd.DataFrame({'Passenger':['Rose', 'Jack', 'John Doe'], '% Likely To Survive':[ val[0]*100 for val in prob], '% Likely To Not-Survive':[ val[1]*100 for val in prob]})
prob_df

Unnamed: 0,Passenger,% Likely To Survive,% Likely To Not-Survive
0,Rose,91.490653,8.509347
1,Jack,26.655386,73.344614
2,Jyoti,38.13008,61.86992
