# <center> **Titanic**

# **Libraries**

In [120]:
import importlib
import warnings
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import functions

# Re-import the local helper module so that edits made to functions.py are
# picked up without restarting the kernel.
importlib.reload(functions)

<module 'functions' from 'c:\\Users\\Dell\\Documents\\AI\\Titanic\\Notebooks\\functions.py'>

# **Data Overview and Preprocessing**

In [121]:
# Location of the dataset, relative to the notebook's working directory
# (the Notebooks/ folder, a sibling of Data/), so the notebook is not tied to
# one user's absolute Windows path.
DATA_PATH = Path("..") / "Data" / "data.csv"

# index_col=False tells pandas not to use the first CSV column as the index.
data = pd.read_csv(DATA_PATH, index_col=False)

# Notebook-wide settings.
# NOTE(review): presumably the seed for later stochastic steps and the name of
# the label column — neither is used in the cells visible here; confirm.
random_state = 101
target = 'Transported'

### **Missing Values**

In [122]:
# Baseline report before any imputation. functions.MissingValues is a project
# helper; judging by the displayed table it lists, per column with gaps, the
# number and percentage of missing entries.
missing_values = functions.MissingValues(data)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,288,2.22
CabinDeck,299,2.31
CabinNumber,299,2.31
CabinSide,299,2.31
CryoSleep,310,2.39
Destination,274,2.11
Age,270,2.08
AgeGroup,270,2.08
VIP,296,2.28


## **Home Planet**

### **Impute Home Planet from Group**

In [123]:
# Sanity check before imputing from the travel group: count the distinct
# (non-null) home planets inside every group and keep only groups with more
# than one. An empty result means group members always share a home planet.
unique_homeplanets_per_group = data.groupby('Group')['HomePlanet'].nunique()
has_mixed_planets = unique_homeplanets_per_group > 1
groups_with_multiple_planets = unique_homeplanets_per_group[has_mixed_planets]

print(groups_with_multiple_planets)

Series([], Name: HomePlanet, dtype: int64)


In [124]:
# Lookup table: Group -> most frequent HomePlanet among its members
# (None when no member of the group has a known home planet).
group_homeplanet_map = data.groupby('Group')['HomePlanet'].apply(
    lambda planets: next(iter(planets.mode()), None)
)


def fill_missing_homeplanet(row):
    """Return the row's HomePlanet, falling back to its group's home planet.

    A known HomePlanet is returned unchanged. A missing one is replaced by the
    entry for the row's Group in ``group_homeplanet_map``; if the group is not
    in the map, the original (missing) value is returned.
    """
    current_planet = row['HomePlanet']
    if not pd.isna(current_planet):
        return current_planet
    return group_homeplanet_map.get(row['Group'], current_planet)


# Row-wise pass over the frame; only rows with a missing HomePlanet change.
data['HomePlanet'] = data.apply(fill_missing_homeplanet, axis=1)

In [125]:
# Re-run the missing-value report to see the effect of the group-based
# HomePlanet imputation.
missing_values = functions.MissingValues(data)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,157,1.21
CabinDeck,299,2.31
CabinNumber,299,2.31
CabinSide,299,2.31
CryoSleep,310,2.39
Destination,274,2.11
Age,270,2.08
AgeGroup,270,2.08
VIP,296,2.28


### **Impute Home Planet from Last Name**

In [126]:
# Lookup table: LastName -> most frequent HomePlanet among passengers sharing
# that surname (None when none of them has a known home planet).
lastname_homeplanet_map = data.groupby('LastName')['HomePlanet'].apply(
    lambda planets: next(iter(planets.mode()), None)
)


def fill_missing_homeplanet_by_lastname(row):
    """Return the row's HomePlanet, falling back to the surname's home planet.

    A known HomePlanet is returned unchanged. A missing one is replaced by the
    entry for the row's LastName in ``lastname_homeplanet_map``; if the surname
    is not in the map, the original (missing) value is returned.
    """
    current_planet = row['HomePlanet']
    if not pd.isna(current_planet):
        return current_planet
    return lastname_homeplanet_map.get(row['LastName'], current_planet)


# Row-wise pass over the frame; only rows with a missing HomePlanet change.
data['HomePlanet'] = data.apply(fill_missing_homeplanet_by_lastname, axis=1)

In [127]:
# Re-run the missing-value report to see the effect of the surname-based
# HomePlanet imputation.
missing_values = functions.MissingValues(data)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,7,0.05
CabinDeck,299,2.31
CabinNumber,299,2.31
CabinSide,299,2.31
CryoSleep,310,2.39
Destination,274,2.11
Age,270,2.08
AgeGroup,270,2.08
VIP,296,2.28


### **Impute Home Planet from Mode**

In [128]:
# Preview the overall most frequent HomePlanet, i.e. the value a mode-based
# fill will use. .mode() returns a Series; [0] takes its first entry.
homeplanet_mode = data['HomePlanet'].mode()[0]
print(homeplanet_mode)

Earth


In [129]:
# Last-resort imputation: any HomePlanet still missing gets the overall most
# frequent planet. The column is left untouched when no mode exists
# (i.e. the column has no non-null values).
mode_value = data['HomePlanet'].mode()

if len(mode_value) > 0:
    most_common_planet = mode_value[0]
    data['HomePlanet'] = data['HomePlanet'].fillna(most_common_planet)

In [130]:
# Re-run the missing-value report after the mode-based HomePlanet fill.
missing_values = functions.MissingValues(data)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CabinDeck,299,2.31
CabinNumber,299,2.31
CabinSide,299,2.31
CryoSleep,310,2.39
Destination,274,2.11
Age,270,2.08
AgeGroup,270,2.08
VIP,296,2.28


## **Destination**

### **Impute Destination from Group**

In [131]:
# Count the distinct (non-null) destinations inside every travel group and keep
# only groups with more than one. A non-empty result means a group-based fill
# is a majority vote rather than an exact lookup.
unique_destinationplanets_per_group = data.groupby('Group')['Destination'].nunique()
has_mixed_destinations = unique_destinationplanets_per_group > 1
groups_with_multiple_destinations = unique_destinationplanets_per_group[has_mixed_destinations]

groups_with_multiple_destinations.head()

Group
8     2
17    2
20    3
32    2
44    3
Name: Destination, dtype: int64

In [132]:
# Lookup table: Group -> most frequent Destination among its members
# (None when no member of the group has a known destination).
group_destinationplanet_map = data.groupby('Group')['Destination'].apply(
    lambda destinations: next(iter(destinations.mode()), None)
)


def fill_missing_destinationplanet(row):
    """Return the row's Destination, falling back to its group's destination.

    A known Destination is returned unchanged. A missing one is replaced by the
    entry for the row's Group in ``group_destinationplanet_map``; if the group
    is not in the map, the original (missing) value is returned.
    """
    current_destination = row['Destination']
    if not pd.isna(current_destination):
        return current_destination
    return group_destinationplanet_map.get(row['Group'], current_destination)


# Row-wise pass over the frame; only rows with a missing Destination change.
data['Destination'] = data.apply(fill_missing_destinationplanet, axis=1)

In [133]:
# Re-run the missing-value report to see the effect of the group-based
# Destination imputation.
missing_values = functions.MissingValues(data)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CabinDeck,299,2.31
CabinNumber,299,2.31
CabinSide,299,2.31
CryoSleep,310,2.39
Destination,154,1.19
Age,270,2.08
AgeGroup,270,2.08
VIP,296,2.28


### **Impute Destination from Last Name**

In [134]:
# Lookup table: LastName -> most frequent Destination among passengers sharing
# that surname (None when none of them has a known destination).
lastname_destinationplanet_map = data.groupby('LastName')['Destination'].apply(
    lambda destinations: next(iter(destinations.mode()), None)
)


def fill_missing_destinationplanet_by_lastname(row):
    """Return the row's Destination, falling back to the surname's destination.

    A known Destination is returned unchanged. A missing one is replaced by the
    entry for the row's LastName in ``lastname_destinationplanet_map``; if the
    surname is not in the map, the original (missing) value is returned.
    """
    current_destination = row['Destination']
    if not pd.isna(current_destination):
        return current_destination
    return lastname_destinationplanet_map.get(row['LastName'], current_destination)


# Row-wise pass over the frame; only rows with a missing Destination change.
data['Destination'] = data.apply(fill_missing_destinationplanet_by_lastname, axis=1)

In [135]:
# Re-run the missing-value report to see the effect of the surname-based
# Destination imputation.
missing_values = functions.MissingValues(data)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CabinDeck,299,2.31
CabinNumber,299,2.31
CabinSide,299,2.31
CryoSleep,310,2.39
Destination,4,0.03
Age,270,2.08
AgeGroup,270,2.08
VIP,296,2.28


### **Impute Destination from Mode**

In [136]:
# Preview the overall most frequent Destination, i.e. the value a mode-based
# fill will use. .mode() returns a Series; [0] takes its first entry.
# NOTE(review): the variable name is misspelled ("desination"); it is kept
# as-is because cells outside this view may reference it.
desinationplanet_mode = data['Destination'].mode()[0]
print(desinationplanet_mode)

TRAPPIST-1e


In [137]:
# Last-resort imputation: any Destination still missing gets the overall most
# frequent destination.
most_common_destination = data['Destination'].mode()[0]
data['Destination'] = data['Destination'].fillna(most_common_destination)

In [138]:
# Re-run the missing-value report after the mode-based Destination fill.
missing_values = functions.MissingValues(data)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CabinDeck,299,2.31
CabinNumber,299,2.31
CabinSide,299,2.31
CryoSleep,310,2.39
Age,270,2.08
AgeGroup,270,2.08
VIP,296,2.28


## **Cabin Deck**

### **Cabin Deck and Home Planet**

In [139]:
# Passenger counts by CabinDeck (rows) and HomePlanet (columns):
# aggfunc='size' counts rows per combination, fill_value=0 shows
# combinations that never occur as 0 instead of NaN.
data.pivot_table(index='CabinDeck', columns='HomePlanet', aggfunc='size', fill_value=0)

HomePlanet,Earth,Europa,Mars
CabinDeck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,353,0
B,0,1141,0
C,1,1101,0
D,3,306,411
E,597,203,523
F,2480,0,1759
G,3781,0,0
T,0,11,0


Passengers on decks A (except 1 passenger), B, C (except 1 passenger) and T came from Europa.<br>
Passengers on deck G came from Earth.<br>
Passengers on decks D, E and F came from multiple planets.<br>

### **Impute CabinDeck G from Earth**

After this step, the number of missing CabinDeck values drops from 199 to 100 in the train set and from 100 to 38 in the test set (299 to 138 overall).

In [140]:
# Passengers from Earth whose deck is unknown are assigned deck 'G', the deck
# on which only Earth passengers appear in the CabinDeck/HomePlanet table.
is_from_earth = data['HomePlanet'].eq('Earth')
deck_is_missing = data['CabinDeck'].isna()
data.loc[is_from_earth & deck_is_missing, 'CabinDeck'] = 'G'

In [141]:
# Display the missing-value report after assigning deck 'G' to Earth passengers.
functions.MissingValues(data)

Unnamed: 0,NumberMissing,PercentageMissing
CabinDeck,138,1.06
CabinNumber,299,2.31
CabinSide,299,2.31
CryoSleep,310,2.39
Age,270,2.08
AgeGroup,270,2.08
VIP,296,2.28


### **Impute CabinDeck from Last Name**

In [142]:
# Lookup table: LastName -> most frequent CabinDeck among passengers sharing
# that surname (None when none of them has a known deck).
lastname_cabindeck_map = data.groupby('LastName')['CabinDeck'].apply(
    lambda decks: next(iter(decks.mode()), None)
)


def fill_missing_cabindeck_by_lastname(row):
    """Return the row's CabinDeck, falling back to the surname's usual deck.

    A known CabinDeck is returned unchanged. A missing one is replaced by the
    entry for the row's LastName in ``lastname_cabindeck_map``; if the surname
    is not in the map, the original (missing) value is returned.

    The lookup must use the CabinDeck map: the surname -> HomePlanet map would
    write planet names ('Earth', 'Europa', 'Mars') into the deck column.
    """
    if pd.isna(row['CabinDeck']):
        return lastname_cabindeck_map.get(row['LastName'], row['CabinDeck'])
    return row['CabinDeck']


# Row-wise pass over the frame; only rows with a missing CabinDeck change.
# Surnames for which no deck is known at all remain missing after this step.
data['CabinDeck'] = data.apply(fill_missing_cabindeck_by_lastname, axis=1)

In [143]:
# Re-run the missing-value report to see the effect of the surname-based
# CabinDeck imputation.
missing_values = functions.MissingValues(data)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CabinNumber,299,2.31
CabinSide,299,2.31
CryoSleep,310,2.39
Age,270,2.08
AgeGroup,270,2.08
VIP,296,2.28
