# Titanic Modeling: Basic Version -- 4 MODELS
**by Linh Toan**<br/>
**Data Analytics @ Newman University**

**Data:** A previously cleaned version of [the Titanic data set from Kaggle](https://www.kaggle.com/c/titanic/overview).

**This Notebook:** This is crafted as a demonstration of a standard machine learning training and testing process.

**Contents:**
1. Read and Review Data
2. Prepare Data Splits
3. Train Models
4. Test Models

In [0]:
# Essential Libraries
import numpy as np
import pandas as pd

# Libraries for Machine Learning Process
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# 1. Read and Review Data

This data has been cleaned in a previous EDA and preparation process.

In [0]:
# Read cleaned version of the data
df = pd.read_csv('data/titanic_refined.csv')
df.head(10)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,family_count,cabin_ind,Sex_female,Sex_male
0,0,3,22.0,1,0,7.25,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,1,1,0
2,1,3,26.0,0,0,7.925,0,0,1,0
3,1,1,35.0,1,0,53.1,1,1,1,0
4,0,3,35.0,0,0,8.05,0,0,0,1
5,0,3,29.215059,0,0,8.4583,0,0,0,1
6,0,1,54.0,0,0,51.8625,0,1,0,1
7,0,3,2.0,3,1,21.075,4,0,0,1
8,1,3,27.0,0,2,11.1333,2,0,1,0
9,1,2,14.0,1,0,30.0708,1,0,1,0


In [0]:
# Dataframe fundamental info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 857 entries, 0 to 856
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      857 non-null    int64  
 1   Pclass        857 non-null    int64  
 2   Age           857 non-null    float64
 3   SibSp         857 non-null    int64  
 4   Parch         857 non-null    int64  
 5   Fare          857 non-null    float64
 6   family_count  857 non-null    int64  
 7   cabin_ind     857 non-null    int64  
 8   Sex_female    857 non-null    int64  
 9   Sex_male      857 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 67.1 KB


# 2. Use PCA for feature engineering on Fare, Pclass, SibSp, and Parch

In [0]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [0]:
df1 = df[['Fare', 'Pclass']]
df2 = df[['SibSp', 'Parch']]

In [0]:
scaling1 = StandardScaler().fit(df1)
Scaled_data = scaling1.transform(df1)
pca1 = PCA(n_components=1)
pca1.fit(Scaled_data)
x1 = pca1.transform(Scaled_data)
print(x1.shape)

(857, 1)


In [0]:
scaling2 = StandardScaler().fit(df2)
Scaled_data = scaling2.transform(df2)
pca2 = PCA(n_components=1)
pca2.fit(Scaled_data)
x2 = pca2.transform(Scaled_data)
print(x2.shape)

(857, 1)


# 3. Add these columns to the dataframe and remove Fare, Pclass, SibSp, Parch

In [0]:
df['pca_fare_pclass'] = x1
df['pca_sibsp_parch'] = x2
df.drop(columns=['Fare', 'Pclass', 'SibSp', 'Parch'], axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Age,family_count,cabin_ind,Sex_female,Sex_male,pca_fare_pclass,pca_sibsp_parch
0,0,22.0,1,0,0,1,-1.033592,-0.042061
1,1,38.0,1,1,1,0,2.223232,-0.042061
2,1,26.0,0,0,1,0,-1.017431,-0.680297
3,1,35.0,1,1,1,0,1.787875,-0.042061
4,0,35.0,0,0,0,1,-1.014438,-0.680297


# 4. Prepare Data Splits

In [0]:
# features — all columns except target variable
features = df.drop('Survived', axis=1)

# labels — only the target variable column
labels = df['Survived']

In [0]:
# Create Train and Test Splits
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Report Number and Proportion of Train and Test Features and Labels
print(f'Train Split: {X_train.shape[0]} Records, {len(y_train)} Labels = {round(len(y_train)/len(labels), 4) * 100}%')
print(f'Test Split: {X_test.shape[0]} Records, {len(y_test)} Labels = {round(len(y_test)/len(labels), 4) * 100}%')

Train Split: 685 Records, 685 Labels = 79.93%
Test Split: 172 Records, 172 Labels = 20.07%


# 5. Train Models

In [0]:
# Define the model
models = [LogisticRegression(), 
          DecisionTreeClassifier(), 
          RandomForestClassifier(), 
          GradientBoostingClassifier()
         ]

# Train the model using the training features and labels
for model in models:
    model.fit(X_train, y_train)
    # Report trained model
    print(f'Trained and ready: {model}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Trained and ready: LogisticRegression()
Trained and ready: DecisionTreeClassifier()
Trained and ready: RandomForestClassifier()
Trained and ready: GradientBoostingClassifier()


# 6. Test Models

### 4 model results with PCA on Fare, Pclas, SibSp, Parch

In [0]:
# Test all models on the test split
for model in models:
    # Use the model to generate predictions for the Test split, based on its features only
    y_pred = model.predict(X_test)

    # Compare model's predictive performance to the provided test labels
    score = accuracy_score(y_test, y_pred) * 100

    # Report the model and its score
    print(model)
    print(f'  {score}\n')

LogisticRegression()
  78.48837209302324

DecisionTreeClassifier()
  73.83720930232558

RandomForestClassifier()
  77.90697674418605

GradientBoostingClassifier()
  81.3953488372093



### 4 model results without PCA

In [0]:
# Test all models on the test split
for model in models:
    # Use the model to generate predictions for the Test split, based on its features only
    y_pred = model.predict(X_test)

    # Compare model's predictive performance to the provided test labels
    score = accuracy_score(y_test, y_pred) * 100

    # Report the model and its score
    print(model)
    print(f'  {score}\n')

LogisticRegression()
  81.00558659217877

DecisionTreeClassifier()
  75.97765363128491

RandomForestClassifier()
  78.77094972067039

GradientBoostingClassifier()
  80.44692737430168



### 4 model results with PCA on cleaned dataset

In [0]:
df_pca = pd.read_csv('data/titanic_cleaned.csv')
# df_pca = pd.read_csv('data/titanic_refined.csv')
df_pca.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,cabin_ind
0,0,3,0,22.0,7.25,0
1,1,1,1,38.0,71.2833,1
2,1,3,1,26.0,7.925,0
3,1,1,1,35.0,53.1,1
4,0,3,0,35.0,8.05,0


In [0]:
from scipy.linalg import svd

In [0]:
U, sig, VT = svd(df_pca)
sig

array([1857.22048656,  782.87230904,   42.16435788,   18.16553532,
         10.20676565,    8.8794932 ])

In [0]:
df_svd = df_pca.copy()
# df_svd.drop(columns=['cabin_ind', 'Sex_female', 'Sex_male'], axis=1, inplace=True)
df_svd.drop(columns=['cabin_ind', 'Fare'], axis=1, inplace=True)

In [0]:
# features — all columns except target variable
features = df_svd.drop('Survived', axis=1)

# labels — only the target variable column
labels = df_svd['Survived']

In [0]:
# Create Train and Test Splits
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Report Number and Proportion of Train and Test Features and Labels
print(f'Train Split: {X_train.shape[0]} Records, {len(y_train)} Labels = {round(len(y_train)/len(labels), 4) * 100}%')
print(f'Test Split: {X_test.shape[0]} Records, {len(y_test)} Labels = {round(len(y_test)/len(labels), 4) * 100}%')

Train Split: 712 Records, 712 Labels = 79.91%
Test Split: 179 Records, 179 Labels = 20.09%


In [0]:
# Define the model
models = [LogisticRegression(), 
          DecisionTreeClassifier(), 
          RandomForestClassifier(), 
          GradientBoostingClassifier()
         ]

# Train the model using the training features and labels
for model in models:
    model.fit(X_train, y_train)
    # Report trained model
    print(f'Trained and ready: {model}')

Trained and ready: LogisticRegression()
Trained and ready: DecisionTreeClassifier()
Trained and ready: RandomForestClassifier()
Trained and ready: GradientBoostingClassifier()


In [0]:
# Test all models on the test split
for model in models:
    # Use the model to generate predictions for the Test split, based on its features only
    y_pred = model.predict(X_test)

    # Compare model's predictive performance to the provided test labels
    score = accuracy_score(y_test, y_pred) * 100

    # Report the model and its score
    print(model)
    print(f'  {score}\n')

LogisticRegression()
  81.00558659217877

DecisionTreeClassifier()
  77.6536312849162

RandomForestClassifier()
  78.77094972067039

GradientBoostingClassifier()
  79.88826815642457



#### After some testing, it seems the best results across the 4 models is to use the "titanic_cleaned" dataset with PCA via SVD to see the singular values and manually choosing which columns to remove. Oddly, the "titanic_refined" dataset scored lower across the board, and removing the "Age" feature or keeping the "Fare" feature also lowered model accuracy across the board.