## Machine Learning in Python with SciKit-Learn

In [1]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import minmax_scale

### Loading & Cleaning Data

In [2]:
df = pd.read_csv('Titanic_Processed.csv')

df = df.drop(columns=['Unnamed: 0', 'sex'])
df.head()

Unnamed: 0,pclass,survived,age,Siblings and Spouses Aboard,Parents and Children Aboard,fare,embarked,cabin_letter,is_male,Total Family Aboard
0,1.0,1.0,29.0,0.0,0.0,211.3375,Southampton,B,0,0.0
1,1.0,1.0,0.9167,1.0,2.0,151.55,Southampton,C,1,3.0
2,1.0,0.0,2.0,1.0,2.0,151.55,Southampton,C,0,3.0
3,1.0,0.0,30.0,1.0,2.0,151.55,Southampton,C,1,3.0
4,1.0,0.0,25.0,1.0,2.0,151.55,Southampton,C,0,3.0


In [3]:
df = pd.get_dummies(df, columns=['cabin_letter','embarked'],drop_first=True)
df.head(2)

Unnamed: 0,pclass,survived,age,Siblings and Spouses Aboard,Parents and Children Aboard,fare,is_male,Total Family Aboard,cabin_letter_B,cabin_letter_C,cabin_letter_D,cabin_letter_E,cabin_letter_F,cabin_letter_G,cabin_letter_T,embarked_Queenstown,embarked_Southampton
0,1.0,1.0,29.0,0.0,0.0,211.3375,0,0.0,1,0,0,0,0,0,0,0,1
1,1.0,1.0,0.9167,1.0,2.0,151.55,1,3.0,0,1,0,0,0,0,0,0,1


In [4]:
# Drop rows with NaN values
df = df.dropna()
df.head()

Unnamed: 0,pclass,survived,age,Siblings and Spouses Aboard,Parents and Children Aboard,fare,is_male,Total Family Aboard,cabin_letter_B,cabin_letter_C,cabin_letter_D,cabin_letter_E,cabin_letter_F,cabin_letter_G,cabin_letter_T,embarked_Queenstown,embarked_Southampton
0,1.0,1.0,29.0,0.0,0.0,211.3375,0,0.0,1,0,0,0,0,0,0,0,1
1,1.0,1.0,0.9167,1.0,2.0,151.55,1,3.0,0,1,0,0,0,0,0,0,1
2,1.0,0.0,2.0,1.0,2.0,151.55,0,3.0,0,1,0,0,0,0,0,0,1
3,1.0,0.0,30.0,1.0,2.0,151.55,1,3.0,0,1,0,0,0,0,0,0,1
4,1.0,0.0,25.0,1.0,2.0,151.55,0,3.0,0,1,0,0,0,0,0,0,1


In [5]:
df[:] = minmax_scale(df)
df.head()

Unnamed: 0,pclass,survived,age,Siblings and Spouses Aboard,Parents and Children Aboard,fare,is_male,Total Family Aboard,cabin_letter_B,cabin_letter_C,cabin_letter_D,cabin_letter_E,cabin_letter_F,cabin_letter_G,cabin_letter_T,embarked_Queenstown,embarked_Southampton
0,0.0,1.0,0.361169,0.0,0.0,0.412503,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.009395,0.125,0.333333,0.295806,1.0,0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.022964,0.125,0.333333,0.295806,0.0,0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.373695,0.125,0.333333,0.295806,1.0,0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.311064,0.125,0.333333,0.295806,0.0,0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Split Data into a testing dataset and a training dataset

In [6]:
# Use a distinct numerical seed so that results are repeatable
seed = 42

In [7]:
# Create two DataFrames out of the source DataFrame 
# with the first getting 90% of the rows and the second getting 10%
train, test = train_test_split(df, test_size=0.1, random_state=seed)

In [8]:
# Drop the label column from the training dataset
X_train = train.drop("survived", axis=1)
Y_train = train["survived"]
X_test  = test

### Fit a model to our training dataset

In [9]:
model = LogisticRegression(random_state=seed, max_iter=50000)
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=50000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Evaluate our model's performance

In [10]:
# Generate predictions
Y_pred = model.predict(X_test.drop('survived', axis=1))
Y_pred

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1.,
       0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 1., 0.])

In [11]:
accuracy = round(model.score(X_train, Y_train) * 100, 2)
accuracy

79.64

### Save the Trained Model
Models can be "pickled" and saved in a binary file for later use, so you don't need to re-train the model every time.

In [12]:
model_file = open('Titanic.pkl', 'wb')
pickle.dump(model, model_file)

model_file.close()