# ⛴ Titanic Machine Learning Survival Predictions 🚢

## Goals 🥅

- Our goal is to build a model that can predict whether a passenger survives, based on the features given.

## Project Planning 🌱
When starting a project, I like to outline the steps that I plan to take. Below is the rough outline that I created for this project.

### Plan 📝
1. Understand the shape of the data (Histograms, Box plots, etc.)
  - Histograms and Boxplots
2. Data Cleaning
  - Value Counts
  - Missing Data
3. Data Exploration
  - Correlation between metrics
  - Explore Interesting Themes
    - Wealthy survive? 
    - By location
    - Age Scatterplot with ticket price
    - Young and wealthy variables?
    - Total spent? 
4. Feature Engineering
  - Preprocess Data together or use a transformer?
5. Data Preprocessing for Model
  - Label Test and Train set. 
6. Basic Model Building 
  - Model Baseline
7. Model Tuning
8. Ensemble Model Building
9. Results

## Import some libraries 📚📚


In [2]:
# For the Data Cleaning, Exploration and Manipulation
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# Read the training split, the test split and Kaggle's sample submission
# from their CSV files, in that order.
train_df, test_df, gender_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../kaggle_submissions/gender_submission.csv',
    )
)

In [6]:
# Preview the first rows of the test set.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [50]:
# Check the shape of the training set before changing anything.
# Reminder: any feature built on the training data also has to be built on
# the test data, so keep an eye on both frames.
train_df.shape

(891, 12)

In [51]:
# Shape of the test set, for comparison with the training set.
test_df.shape

(418, 11)

## clean the data

In [52]:
# Remove fully duplicated rows from the training set.
train_df = train_df.drop_duplicates()


In [53]:
# Re-check the shape to see whether any duplicate rows were dropped.
train_df.shape

(891, 12)

In [54]:
# Fraction of missing values per column: null count divided by row count.
# Sorted in descending order so the columns needing the most attention
# appear first.
null_counts = train_df.isnull().sum()
null_ratio = null_counts / len(train_df)
null_ratio.sort_values(ascending=False)

Cabin          0.771044
Age            0.198653
Embarked       0.002245
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
dtype: float64

In [55]:
# Most of the missing data sits in the Cabin column, so drop it entirely.
# The column is removed from both frames to keep their features aligned.
train_df = train_df.drop("Cabin", axis="columns")
test_df = test_df.drop("Cabin", axis="columns")

In [56]:
# Confirm the Cabin column is gone from the training set.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [57]:
# Confirm the Cabin column is gone from the test set as well.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [58]:
# Because the remaining share of nulls is low, the gaps can be imputed
# instead of dropping more data.
remaining_null_ratio = train_df.isnull().sum() / len(train_df)
remaining_null_ratio.sort_values(ascending=False)



Age            0.198653
Embarked       0.002245
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
dtype: float64

### Imputing Embarked

In [59]:
from sklearn.impute import SimpleImputer

# Embarked has the fewest missing values, so filling the gaps with the most
# frequent value is the simplest reasonable choice.
impute_embarked = SimpleImputer(strategy='most_frequent')
embarked_filled = impute_embarked.fit_transform(train_df[["Embarked"]])
train_df[['Embarked']] = embarked_filled

### Imputing Age

In [60]:
from sklearn.impute import KNNImputer

# Age has far more gaps than Embarked, so estimate it from similar passengers
# rather than from a single global statistic.
#
# KNNImputer measures similarity on the columns that are *present* in a row.
# Fed only the Age column, a row with a missing Age has no defined distance to
# any other row, and scikit-learn then falls back to the plain column mean.
# Passing the other numeric passenger columns gives it something to compare.
# NOTE(review): these columns are not scaled here, so Fare will dominate the
# distance — consider scaling before imputing.
knn_age_features = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']
impute_age = KNNImputer(n_neighbors=8)
imputed_numeric = impute_age.fit_transform(train_df[knn_age_features])

# Age is the first column of knn_age_features; only that column is written
# back. To reuse impute_age on another frame, pass the same columns.
train_df['Age'] = np.asarray(imputed_numeric)[:, 0]

#### quick check

In [61]:
# Make sure that there are no nulls left after imputing.
final_null_ratio = train_df.isnull().sum() / len(train_df)
final_null_ratio.sort_values(ascending=False)

PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Embarked       0.0
dtype: float64

In [62]:
# Shape of the training set after cleaning (Cabin removed, nulls imputed).
train_df.shape

(891, 11)

### Target and features

In [63]:
# The target is the Survived column; every other column is a feature.
target_column = 'Survived'

y = train_df[target_column]
X = train_df.drop(columns=[target_column])

### Holdout Method

In [64]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for evaluation (default test size).
# A fixed random_state makes the split, and everything fitted on it
# afterwards, reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [65]:
# Inspect column dtypes to separate numeric features from text features.
X_train.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [66]:
# Only integer and float columns can be used for numeric calculations such as
# scaling, so pull those out of both splits.
numeric_dtypes = ['int64', 'float64']

X_train_num = X_train.select_dtypes(include=numeric_dtypes)
X_test_num = X_test.select_dtypes(include=numeric_dtypes)

In [67]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [68]:
from sklearn.compose import ColumnTransformer

In [69]:
# Column groups, one per scaler used in the ColumnTransformer below.
standard_features = ["Age"]  # scaled with StandardScaler
robust_features = ["Fare"]  # scaled with RobustScaler
minmax_features = ["Pclass", "SibSp", "Parch"]  # scaled with MinMaxScaler

In [70]:
# It is easier to apply every scaler in one pass with a ColumnTransformer.
# set_output(transform='pandas') keeps the result a pandas DataFrame; columns
# not listed in any step are dropped (no remainder is configured).
scaling_steps = [
    ("standard_scaler", StandardScaler(), standard_features),
    ("robust_scaler", RobustScaler(), robust_features),
    ("minmax_scaler", MinMaxScaler(), minmax_features),
]
scalers = ColumnTransformer(scaling_steps)
scalers.set_output(transform='pandas')

scalers

In [71]:
# Learn the scaling parameters from the training split only, then reuse them
# on the held-out split. Re-fitting on X_test_num would scale the two splits
# differently and leak test-set statistics into the evaluation.
X_train_num_scaled = scalers.fit_transform(X_train_num)
X_test_num_scaled = scalers.transform(X_test_num)

### Encoding Categorical Variables

In [72]:
# Everything that is not an int64/float64 column is treated as categorical.
numeric_dtypes = ["int64", "float64"]

X_train_cat = X_train.select_dtypes(exclude=numeric_dtypes)
X_test_cat = X_test.select_dtypes(exclude=numeric_dtypes)

In [73]:
# Preview the non-numeric columns of the training split.
X_train_cat.head()

Unnamed: 0,Name,Sex,Ticket,Embarked
538,"Risien, Mr. Samuel Beard",male,364498,S
48,"Samaan, Mr. Youssef",male,2662,C
101,"Petroff, Mr. Pastcho (""Pentcho"")",male,349215,S
450,"West, Mr. Edwy Arthur",male,C.A. 34651,S
833,"Augustsson, Mr. Albert",male,347468,S


In [74]:
# Confirm that only non-numeric columns remain in the categorical frame.
X_train_cat.dtypes

Name        object
Sex         object
Ticket      object
Embarked    object
dtype: object

In [75]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoder: binary columns collapse to a single indicator, and
# categories unseen during fit are ignored at transform time.
# NOTE(review): X_train_cat still contains Name and Ticket, which are close to
# unique per passenger — encoding them yields a very wide frame; confirm this
# is intended.
ohe = OneHotEncoder(
    sparse_output=False,
    drop="if_binary",
    handle_unknown='ignore',
)
ohe.set_output(transform="pandas")
ohe.fit(X_train_cat)


In [76]:
# Encode both splits with the encoder fitted on the training split.
X_train_cat_encoded, X_test_cat_encoded = (
    ohe.transform(cat_frame) for cat_frame in (X_train_cat, X_test_cat)
)



## One Shot

In [77]:
# Rebuild the numeric scaling step, unfitted, for the all-in-one preprocessor.
scalers = ColumnTransformer(
    transformers=[
        ("standard_scaler", StandardScaler(), standard_features),
        ("robust_scaler", RobustScaler(), robust_features),
        ("minmax_scaler", MinMaxScaler(), minmax_features),
    ],
)
scalers.set_output(transform='pandas')
scalers

In [78]:
# Fresh, unfitted encoder for the all-in-one preprocessor.
# `sparse_output` is the current name of this option: the older `sparse`
# argument is deprecated since scikit-learn 1.2 and removed in 1.4. This also
# matches the encoder created earlier in the notebook.
ohe = OneHotEncoder(sparse_output=False,
                    drop="if_binary",
                    handle_unknown='ignore').set_output(transform='pandas')

In [79]:
from sklearn.compose import make_column_selector

In [81]:
# Combine scaling and encoding into one transformer:
#   - int64/float64 columns go through the scalers,
#   - Sex and Embarked go through the one-hot encoder.
numeric_selector = make_column_selector(dtype_include=["int64", "float64"])
categorical_columns = ['Sex', 'Embarked']

preprocessor = ColumnTransformer(
    transformers=[
        ("scalers", scalers, numeric_selector),
        ("encoder", ohe, categorical_columns),
    ]
)
preprocessor.set_output(transform='pandas')

preprocessor

In [82]:
# Fit the combined preprocessor on the training split only.
preprocessor.fit(X_train)



In [83]:
# Apply the fitted preprocessor to the training split, then the held-out split.
X_train_preprocessed, X_test_preprocessed = map(
    preprocessor.transform, (X_train, X_test)
)

In [84]:
# Rows and columns of the preprocessed training split.
X_train_preprocessed.shape

(668, 9)

In [85]:
# Shape of the full cleaned training frame, for comparison.
train_df.shape

(891, 11)

In [None]:
# Record the scikit-learn version this notebook was run with.
import sklearn
print(sklearn.__version__)

1.2.2
