In [195]:
import numpy as np

import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.feature_extraction import DictVectorizer

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Titanic Surviving Predict

This project is from Kaggle: https://www.kaggle.com/c/titanic/overview


#### All following code is 100% from Richard Xue.

## 1. Question Framing

The sinking of the Titanic is one of the most infamous shipwrecks in history.

On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.

Is it possible to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (ie name, age, gender, socio-economic class, etc)?

Objective: The models I decide to use to predict are: 
- Feature Engineering

## 2. Loading data

We have 2 datasets: train and test. 
- train.csv contains passenger data, including death or alive
- test.csv contrains passenger data, but we don't know if they are alive or not

#### Data Explanation

In [196]:
# Survived      Survival        0 = No, 1 = Yes
# Pclass        Ticket class    1 = 1st, 2 = 2nd, 3 = 3rd
# Sex           Sex
# Age           Age             in years
# SibSp         # of siblings / spouses aboard the Titanic
# Parch         # of parents / children aboard the Titanic
# Ticket        Ticket number
# Fare          Passenger fare
# Cabin         Cabin number
# Embarked      Port of Embarkation     C = Cherbourg, Q = Queenstown, S = Southampton

In [197]:
original_train = pd.read_csv('train.csv')
print(original_train.shape)
original_train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [198]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## 3. Data Cleaning 

In [199]:
original_train = pd.read_csv('train.csv')

In [200]:
# We first deal with null values in our dataframe

def null_columns(df):
    """
    Prints the column name if this column contains NaN.
    
    Input: df - target dataframe
    Output: column names
    
    """
    for col in df.columns:
        if df.isna()[col].any():
            print(col, type(col))

print('Train contains NaN in the following columns: ')
null_columns(original_train)

Train contains NaN in the following columns: 
Age <class 'str'>
Cabin <class 'str'>
Embarked <class 'str'>


In [201]:
print('Test contains NaN in the following columns: ')
null_columns(test)

Test contains NaN in the following columns: 
Age <class 'str'>
Fare <class 'str'>
Cabin <class 'str'>


In [202]:
# We see multiple columns contain NaN. Let's fix them one by one.

# 1. Age
original_train['Age'].unique() # Check what the data looks like

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [203]:
# 2. Cabin
original_train['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [204]:
# 3. Embarked
original_train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [205]:
# 4. Fare
test['Fare'].unique()

array([  7.8292,   7.    ,   9.6875,   8.6625,  12.2875,   9.225 ,
         7.6292,  29.    ,   7.2292,  24.15  ,   7.8958,  26.    ,
        82.2667,  61.175 ,  27.7208,  12.35  ,   7.225 ,   7.925 ,
        59.4   ,   3.1708,  31.6833,  61.3792, 262.375 ,  14.5   ,
        61.9792,  30.5   ,  21.6792,  31.5   ,  20.575 ,  23.45  ,
        57.75  ,   8.05  ,   9.5   ,  56.4958,  13.4167,  26.55  ,
         7.85  ,  13.    ,  52.5542,  29.7   ,   7.75  ,  76.2917,
        15.9   ,  60.    ,  15.0333,  23.    , 263.    ,  15.5792,
        29.125 ,   7.65  ,  16.1   ,  13.5   ,   7.725 ,  21.    ,
         7.8792,  42.4   ,  28.5375, 211.5   ,  25.7   ,  15.2458,
       221.7792,  10.7083,  14.4542,  13.9   ,   7.775 ,  52.    ,
         7.7958,  78.85  ,   7.8542,  55.4417,   8.5167,  22.525 ,
         7.8208,   8.7125,  15.0458,   7.7792,  31.6792,   7.2833,
         6.4375,  16.7   ,  75.2417,  15.75  ,   7.25  ,  23.25  ,
        28.5   ,  25.4667,  46.9   , 151.55  ,  18.    ,  51.8

In [250]:
def fix_na(data):
    # We fill NaN in age with -1
    data['Age'] = data['Age'].fillna(-1)
    
    # We fill NaN in cabin with Z99
    data['Cabin'] = data['Cabin'].fillna('Z99')
    
    # We fill NaN in Embarked with U for unknown
    data['Embarked'] = data['Embarked'].fillna('U')
    
    # We fill NaN in Fare with -1
    data['Fare'] = data['Fare'].fillna(-1)

fix_na(original_train)
fix_na(test)

## 4. Model Selection

1. Logistic Regression with Feature Engineering

    We want to train our logistic model with numeric values, such as age --> survived.

In [207]:
# Split our data to train and test

from sklearn.model_selection import train_test_split
train, val = train_test_split(original_train, test_size=0.1, random_state=42)
print(train.shape, val.shape)
train.head()

(801, 12) (90, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
165,166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9.0,0,2,363291,20.525,Z99,S
541,542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9.0,4,2,347082,31.275,Z99,S
625,626,0,1,"Sutton, Mr. Frederick",male,61.0,0,0,36963,32.3208,D50,S
388,389,0,3,"Sadlier, Mr. Matthew",male,-1.0,0,0,367655,7.7292,Z99,Q
76,77,0,3,"Staneff, Mr. Ivan",male,-1.0,0,0,349208,7.8958,Z99,S


In [208]:
# To convert categorial data to numeric data, I choose to use One Hot Encoding.

def ohe(data, col):
    
    """
    One-hot-encodes given column.
    
    Input: data - target dataframe
            col - target column
            
    Output: dataframe with target column one-hot-encoded
    """
    
    # Initialize DictVec
    vec_enc = DictVectorizer()
    vec_enc.fit(data[[col]].to_dict(orient='records'))

    # Convert categorial data to numerical data array
    col_data = vec_enc.transform(data[[col]].to_dict(orient='records')).toarray()
    
    # Get feature names (unique values in that column)
    col_cats = vec_enc.get_feature_names()
    
    # Create a dataframe with ohe columns
    col = pd.DataFrame(col_data, columns=col_cats)

    # Reset the index to avoid creating empty rows
    data = data.reset_index()
    
    # Combine by INDEX start from 0
    data = pd.concat([data, col], axis=1)
    
    return data

It's time to fit our updated logistic regression model which consists of numeric variables and our one-hot-encoded variables.

In [242]:
# Model Training

def select_columns(data, *columns):
    """Select only columns passed as arguments."""
    return data.loc[:, columns]

def process_data_gm(data):
    """Process the data for a guided model."""
    # One-hot-encode features
    data = ohe(data, 'Sex')
    data = ohe(data, 'Embarked')
    
    # Transform Data, Select Features
    X = select_columns(data, 
                          'Pclass', 
                          'Age', 
                          'SibSp',
                          'Parch',
                          'Fare',
                          'Sex=female',
                          'Sex=male',
                          'Embarked=C',
                          'Embarked=Q',
                          'Embarked=S'
                         )
    
    # Return predictors and response variables separately
    try:
        y = data['Survived']
    except:
        return X
    
    return X, y

In [245]:
X_train, Y_train = process_data_gm(train)
X_val, Y_val = process_data_gm(val)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, Y_train)
print('Training accuracy: ', model.score(X_train,Y_train))
print('Testing accuracy: ', model.score(X_val,Y_val))

Training accuracy:  0.7952559300873908
Testing accuracy:  0.8222222222222222


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [251]:
# Predict the real data

X_test = process_data_gm(test)
X_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex=female,Sex=male,Embarked=C,Embarked=Q,Embarked=S
0,3,34.5,0,0,7.8292,0.0,1.0,0.0,1.0,0.0
1,3,47.0,1,0,7.0,1.0,0.0,0.0,0.0,1.0
2,2,62.0,0,0,9.6875,0.0,1.0,0.0,1.0,0.0
3,3,27.0,0,0,8.6625,0.0,1.0,0.0,0.0,1.0
4,3,22.0,1,1,12.2875,1.0,0.0,0.0,0.0,1.0


In [253]:
test_predictions = model.predict(X_test)

In [254]:
from datetime import datetime

# Construct and save the submission:
submission_df = pd.DataFrame({
    "PassengerId": test['PassengerId'], 
    "Survived": test_predictions,
}, columns=['PassengerId', 'Survived'])
timestamp = datetime.isoformat(datetime.now()).split(".")[0]
submission_df.to_csv("submission_{}.csv".format(timestamp), index=False)

print('Created a CSV file: {}.'.format("submission_{}.csv".format(timestamp)))
print('You may now upload this CSV file to Kaggle for scoring.')

Created a CSV file: submission_2020-06-11T16:15:59.csv.
You may now upload this CSV file to Kaggle for scoring.
