# Modeling

Compare several classification models to determine which one performs best at predicting passenger survival.

## Imports

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Ignore every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation notices from pandas/numpy — consider
# narrowing the filter to specific warning categories.
warnings.simplefilter("ignore")

## Load Data

In [10]:
# Location of the raw Titanic CSV, relative to this notebook's working directory.
path = "../data/raw/Titanic-Dataset.csv"

# Read the whole file into the working frame used by the rest of the notebook.
df = pd.read_csv(path)

In [11]:
# Show the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Print each column's dtype and non-null count for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Data Processing

In [13]:
# Move PassengerId from a regular column to the row index.
# NOTE: re-running this cell without reloading df raises KeyError, because the
# column no longer exists after the first run.
df = df.set_index("PassengerId")

In [14]:
# Count the missing values in each column.
df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [15]:
# Replace missing ages with the mean age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df["Age"]: that form modifies an intermediate Series
# and leaves df unchanged when pandas Copy-on-Write is active.
df["Age"] = df["Age"].fillna(df["Age"].mean())

In [16]:
# Replace missing embarkation ports with the most frequent port.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df["Embarked"]: that form modifies an intermediate
# Series and leaves df unchanged when pandas Copy-on-Write is active.
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

In [17]:
# For each passenger class (1, 2, 3), collect the distinct first letters of the
# cabins that are actually recorded. Missing cabins are NaN (not str) and are skipped.
cabin_p1, cabin_p2, cabin_p3 = (
    {
        cabin[0]
        for cabin in df.loc[df["Pclass"] == pclass, "Cabin"]
        if isinstance(cabin, str)
    }
    for pclass in (1, 2, 3)
)

def fill_cabin(row):
    """Impute a missing cabin with a random one taken from the passenger's class.

    The replacement is a cabin letter drawn uniformly from the letters observed
    for the row's ``Pclass`` (``cabin_p1`` / ``cabin_p2`` / ``cabin_p3``),
    followed by a random number. Draws use NumPy's global RNG, so seed it with
    ``np.random.seed`` beforehand for repeatable results.

    Parameters
    ----------
    row : pandas.Series
        One passenger record with at least the ``"Cabin"`` and ``"Pclass"`` entries.

    Returns
    -------
    pandas.Series
        The input row unchanged when the cabin is present or ``Pclass`` is not
        1, 2 or 3; otherwise a copy of the row with ``"Cabin"`` filled in.

    Raises
    ------
    ValueError
        If no cabin letters are known for the row's passenger class.
    """
    # Drawn for every row, as before, so the sequence of random numbers consumed
    # per row does not change. The upper bound is exclusive: values are 0..98.
    num = np.random.randint(0, 99)

    cabin_is_missing = pd.isnull(row["Cabin"]) or isinstance(row["Cabin"], float)
    if not cabin_is_missing:
        return row

    letters_by_class = {1: cabin_p1, 2: cabin_p2, 3: cabin_p3}
    letters = letters_by_class.get(row["Pclass"])
    if letters is None:
        # Unknown passenger class: leave the record as it is.
        return row

    # Sort the candidates: set iteration order is not stable between interpreter
    # sessions, so sampling from list(set) is unrepeatable even with a fixed seed.
    candidates = sorted(letters)

    # Work on a copy so the caller's Series is never modified.
    filled = row.copy()
    filled["Cabin"] = np.random.choice(candidates) + str(num)
    return filled

In [18]:
# fill_cabin draws from NumPy's global RNG; seed it so the random draws are the
# same on every run of the notebook.
np.random.seed(42)

# Impute missing cabins row by row.
df = df.apply(fill_cabin, axis=1)

In [19]:
# Keep only the first character of each cabin string.
# The vectorized .str accessor replaces the per-element lambda.
df["Cabin"] = df["Cabin"].str[0]

In [21]:
# Re-count missing values per column after imputation.
df.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

## Feature Engineering

In [22]:
def _title_from_name(full_name):
    """Return the text between the first comma and the following period, stripped."""
    after_comma = full_name.split(",")[1]
    return after_comma.split(".")[0].strip()


# Replace each full name with its title, then rename the column to match.
# NOTE: re-running this cell without reloading df raises KeyError ("Name" is gone).
df = df.assign(Name=df["Name"].map(_title_from_name)).rename(columns={"Name": "Title"})

In [24]:
# Siblings/spouses plus parents/children aboard, plus one for the passenger.
df["Family_Size"] = df["SibSp"].add(df["Parch"]).add(1)

In [25]:
# 1 when the passenger has no family aboard (family size of exactly 1), else 0.
df["Is_Alone"] = (df["Family_Size"] == 1).astype(int)

In [26]:
# Fare divided by the size of the passenger's family group.
df["FarePerPerson"] = df["Fare"].div(df["Family_Size"])

In [27]:
def age_group(age):
    """Bucket an age in years into a coarse life-stage label.

    Parameters
    ----------
    age : float or int or None
        Age in years. ``None`` and NaN are treated as missing.

    Returns
    -------
    str
        ``"Child"`` for age < 12, ``"Teen"`` for 12 <= age < 18, ``"Adult"``
        for 18 <= age < 60, ``"Senior"`` for age >= 60, and ``"Unknown"`` when
        the age is missing.
    """
    # A NaN fails every comparison below and would otherwise fall through to
    # "Senior"; None would raise TypeError. Report both as unknown instead.
    if pd.isna(age):
        return "Unknown"
    if age < 12:
        return "Child"
    if age < 18:
        return "Teen"
    if age < 60:
        return "Adult"
    return "Senior"

In [28]:
# Label every passenger with the life-stage bucket for their age.
df["Age_Group"] = df["Age"].map(age_group)

In [30]:
# Number of passengers in the data that share each passenger's ticket value.
ticket_counts = df["Ticket"].value_counts()
df["Ticket_Frequency"] = df["Ticket"].map(ticket_counts)

In [31]:
# Show the first five rows with the engineered columns included.
df.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size,Is_Alone,FarePerPerson,Age_Group,Ticket_Frequency
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,G,S,2,0,3.625,Adult,1
2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C,C,2,0,35.64165,Adult,1
3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,G,S,1,1,7.925,Adult,1
4,1,1,Mrs,female,35.0,1,0,113803,53.1,C,S,2,0,26.55,Adult,2
5,0,3,Mr,male,35.0,0,0,373450,8.05,E,S,1,1,8.05,Adult,1


### One Hot Encoding

In [36]:
# The raw Ticket string is not encoded; drop it before building the model frame.
temp = df.drop(columns="Ticket")

# Expand each categorical column into 0/1 indicator columns. drop_first omits
# the first level of every column, so that level is implied by all-zero indicators.
categorical_columns = ["Sex", "Embarked", "Pclass", "Title", "Cabin", "Age_Group"]
model_df = pd.get_dummies(
    temp,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)

In [37]:
# Show the first five rows of the encoded model frame.
model_df.head(n=5)

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Family_Size,Is_Alone,FarePerPerson,Ticket_Frequency,Sex_male,...,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Age_Group_Child,Age_Group_Senior,Age_Group_Teen
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,22.0,1,0,7.25,2,0,3.625,1,1,...,0,0,0,0,0,1,0,0,0,0
2,1,38.0,1,0,71.2833,2,0,35.64165,1,0,...,0,1,0,0,0,0,0,0,0,0
3,1,26.0,0,0,7.925,1,1,7.925,1,0,...,0,0,0,0,0,1,0,0,0,0
4,1,35.0,1,0,53.1,2,0,26.55,2,0,...,0,1,0,0,0,0,0,0,0,0
5,0,35.0,0,0,8.05,1,1,8.05,1,1,...,0,0,0,1,0,0,0,0,0,0


## Correlation

In [39]:
# Pairwise correlations between all columns of the model frame, shown as a
# colour-graded table with a single colour scale across the whole matrix.
corr_matrix = model_df.corr()
corr_matrix.style.background_gradient(cmap="coolwarm", axis=None).format(precision=2)

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Family_Size,Is_Alone,FarePerPerson,Ticket_Frequency,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Title_Col,Title_Don,Title_Dr,Title_Jonkheer,Title_Lady,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Title_the Countess,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Age_Group_Child,Age_Group_Senior,Age_Group_Teen
Survived,1.0,-0.07,-0.04,0.08,0.26,0.02,-0.2,0.22,0.04,-0.54,0.0,-0.15,0.09,-0.32,0.01,-0.03,0.01,-0.03,0.04,0.01,0.09,0.33,0.06,0.04,-0.55,0.34,0.04,-0.06,0.04,0.04,0.15,0.1,0.17,-0.05,-0.12,-0.13,-0.04,0.11,-0.04,0.05
Age,-0.07,1.0,-0.23,-0.18,0.09,-0.25,0.18,0.14,-0.22,0.08,-0.01,-0.02,0.01,-0.28,0.1,0.03,0.07,0.02,0.05,0.07,-0.38,-0.25,-0.02,-0.01,0.19,0.17,-0.0,0.09,0.05,0.01,0.11,0.13,0.11,-0.06,-0.11,-0.12,0.08,-0.55,0.47,-0.25
SibSp,-0.04,-0.23,1.0,0.41,0.16,0.89,-0.58,-0.09,0.66,-0.11,-0.03,0.07,-0.06,0.09,-0.02,-0.02,0.0,-0.02,0.01,-0.02,0.35,0.09,-0.02,-0.02,-0.25,0.06,-0.02,-0.03,0.01,-0.02,-0.04,0.03,-0.05,0.02,0.01,0.05,-0.04,0.35,-0.05,0.06
Parch,0.08,-0.18,0.41,1.0,0.22,0.78,-0.58,-0.07,0.59,-0.25,-0.08,0.06,-0.0,0.02,-0.02,-0.02,-0.04,-0.02,-0.02,-0.02,0.27,0.11,-0.02,-0.02,-0.33,0.23,-0.02,-0.02,-0.02,-0.02,0.05,0.01,-0.03,0.01,-0.0,0.0,-0.02,0.37,-0.02,0.03
Fare,0.26,0.09,0.16,0.22,1.0,0.22,-0.27,0.84,0.35,-0.18,-0.12,-0.16,-0.12,-0.41,-0.0,-0.0,0.03,-0.02,0.0,-0.0,0.01,0.12,0.03,0.03,-0.18,0.11,-0.01,-0.02,0.02,0.04,0.37,0.36,0.04,-0.14,-0.2,-0.18,-0.0,-0.0,0.04,-0.01
Family_Size,0.02,-0.25,0.89,0.78,0.22,1.0,-0.69,-0.1,0.75,-0.2,-0.06,0.08,-0.04,0.07,-0.03,-0.02,-0.02,-0.02,0.0,-0.03,0.37,0.11,-0.03,-0.02,-0.34,0.16,-0.02,-0.03,0.0,-0.02,-0.01,0.02,-0.05,0.02,0.0,0.03,-0.04,0.43,-0.04,0.06
Is_Alone,-0.2,0.18,-0.58,-0.58,-0.27,-0.69,1.0,0.05,-0.46,0.3,0.09,0.03,-0.04,0.13,0.04,0.03,0.02,0.03,-0.04,0.04,-0.27,-0.06,0.04,0.03,0.4,-0.37,0.03,0.01,-0.04,0.03,-0.05,-0.11,-0.06,-0.01,0.06,0.06,0.04,-0.34,0.05,-0.06
FarePerPerson,0.22,0.14,-0.09,-0.07,0.84,-0.1,0.05,1.0,0.17,-0.12,-0.1,-0.18,-0.1,-0.37,0.01,0.01,0.02,-0.02,-0.0,0.01,-0.07,0.11,0.05,0.05,-0.08,0.01,-0.01,-0.01,0.01,0.06,0.32,0.24,0.05,-0.12,-0.17,-0.16,0.01,-0.09,0.03,-0.03
Ticket_Frequency,0.04,-0.22,0.66,0.59,0.35,0.75,-0.46,0.17,1.0,-0.15,-0.07,0.07,-0.06,0.02,-0.03,-0.02,-0.03,-0.02,-0.02,-0.03,0.31,0.11,-0.01,0.01,-0.27,0.09,-0.02,-0.02,0.01,0.03,0.03,0.06,-0.02,-0.01,0.0,-0.01,-0.05,0.39,-0.06,0.05
Sex_male,-0.54,0.08,-0.11,-0.25,-0.18,-0.2,0.3,-0.12,-0.15,1.0,-0.07,0.12,-0.06,0.14,0.03,0.02,0.04,0.02,-0.05,0.03,0.16,-0.69,-0.06,-0.05,0.87,-0.55,-0.05,0.06,0.02,-0.05,-0.09,-0.04,-0.1,0.0,0.08,0.04,0.06,-0.07,0.07,-0.08
