# <center> **Titanic**

# **Libraries**

In [168]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt


import functions
import importlib

import warnings

importlib.reload(functions)

<module 'functions' from 'c:\\Users\\Dell\\Documents\\AI\\Titanic\\Notebooks\\functions.py'>

# **Data Overview and Preprocessing**

In [169]:
train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\train.csv",
    index_col=False
)

test = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\test.csv",
    index_col=False
)

random_state = 101
target = 'Transported'

## **Missing Data**

### **Train Set Missing Values**

In [170]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,201,2.31
CryoSleep,217,2.5
Cabin,199,2.29
Destination,182,2.09
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39
Spa,183,2.11


### **Test Set Missing Values**

In [171]:
missing_values = functions.MissingValues(test)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,87,2.03
CryoSleep,93,2.17
Cabin,100,2.34
Destination,92,2.15
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36


# **Information Extraction**

## **PassengerId**

In [172]:
train['Group'] = train['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)
test['Group'] = test['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)

In [173]:
column = train.pop('Group')
train.insert(1, 'Group', column)

column = test.pop('Group')
test.insert(1, 'Group', column)

## **Name**

In [174]:
train[['FirstName', 'LastName']] = train['Name'].str.split(' ', expand=True)
test[['FirstName', 'LastName']] = test['Name'].str.split(' ', expand=True)

In [175]:
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

## **Cabin**

In [None]:
train[['CabinDeck', 'CabinNumber', 'CabinSide']] = train['Cabin'].str.split('/', expand=True)
test[['CabinDeck', 'CabinNumber', 'CabinSide']] = test['Cabin'].str.split('/', expand=True)

In [None]:
column = train.pop('CabinDeck')
train.insert(5, 'CabinDeck', column)
column = test.pop('CabinDeck')
test.insert(5, 'CabinDeck', column)

column = train.pop('CabinNumber') 
train.insert(6, 'CabinNumber', column)  
column = test.pop('CabinNumber') 
test.insert(6, 'CabinNumber', column)    

column = test.pop('CabinSide')  
test.insert(7, 'CabinSide', column)
column = train.pop('CabinSide') 
train.insert(7, 'CabinSide', column)    

train.drop('Cabin', axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)

## **Fill Home Planet Missing Values with Group Number Information**

In [176]:
num_unique_groups = train['Group'].nunique()
print(num_unique_groups)

6217


In [177]:
unique_homeplanets_per_group = train.groupby('Group')['HomePlanet'].nunique()
groups_with_multiple_planets = unique_homeplanets_per_group[unique_homeplanets_per_group > 1]

print(groups_with_multiple_planets)

Series([], Name: HomePlanet, dtype: int64)


In [178]:
# Step 1: Group by 'Group' and find the most common (or only) HomePlanet
group_homeplanet_map = train.groupby('Group')['HomePlanet'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

# Step 2: Define a function to fill missing HomePlanet based on the group's identified HomePlanet
def fill_missing_homeplanet(row):
    if pd.isna(row['HomePlanet']):
        return group_homeplanet_map.get(row['Group'], row['HomePlanet'])
    else:
        return row['HomePlanet']

# Step 3: Apply the function to fill missing values in the HomePlanet column
train['HomePlanet'] = train.apply(fill_missing_homeplanet, axis=1)

In [179]:
# Step 1: Group by 'Group' and find the most common (or only) HomePlanet
group_homeplanet_map = test.groupby('Group')['HomePlanet'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

# Step 2: Define a function to fill missing HomePlanet based on the group's identified HomePlanet
def fill_missing_homeplanet(row):
    if pd.isna(row['HomePlanet']):
        return group_homeplanet_map.get(row['Group'], row['HomePlanet'])
    else:
        return row['HomePlanet']

# Step 3: Apply the function to fill missing values in the HomePlanet column
test['HomePlanet'] = test.apply(fill_missing_homeplanet, axis=1)

In [180]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,111,1.28
CryoSleep,217,2.5
Cabin,199,2.29
Destination,182,2.09
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39
Spa,183,2.11


In [181]:
missing_values = functions.MissingValues(test)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,46,1.08
CryoSleep,93,2.17
Cabin,100,2.34
Destination,92,2.15
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36


## **Fill Home Planet Missing Values with LastName Information**

In [182]:
# Step 1: Group by 'LastName' and find the most common (or only) HomePlanet for each LastName
lastname_homeplanet_map = train.groupby('LastName')['HomePlanet'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

# Step 2: Define a function to fill missing HomePlanet based on the LastName's identified HomePlanet
def fill_missing_homeplanet_by_lastname(row):
    if pd.isna(row['HomePlanet']):
        return lastname_homeplanet_map.get(row['LastName'], row['HomePlanet'])
    else:
        return row['HomePlanet']

# Step 3: Apply the function to fill missing values in the HomePlanet column
train['HomePlanet'] = train.apply(fill_missing_homeplanet_by_lastname, axis=1)

In [183]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,12,0.14
CryoSleep,217,2.5
Cabin,199,2.29
Destination,182,2.09
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39
Spa,183,2.11


In [184]:
# Step 1: Group by 'LastName' and find the most common (or only) HomePlanet for each LastName
lastname_homeplanet_map = test.groupby('LastName')['HomePlanet'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

# Step 2: Define a function to fill missing HomePlanet based on the LastName's identified HomePlanet
def fill_missing_homeplanet_by_lastname(row):
    if pd.isna(row['HomePlanet']):
        return lastname_homeplanet_map.get(row['LastName'], row['HomePlanet'])
    else:
        return row['HomePlanet']

# Step 3: Apply the function to fill missing values in the HomePlanet column
test['HomePlanet'] = test.apply(fill_missing_homeplanet_by_lastname, axis=1)

In [185]:
missing_values = functions.MissingValues(test)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,16,0.37
CryoSleep,93,2.17
Cabin,100,2.34
Destination,92,2.15
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36


## **Fill Home Planet Missing Values with Mode**

In [186]:
# Get the mode (most frequent value) of 'HomePlanet'
homeplanet_mode = train['HomePlanet'].mode()[0]
print(homeplanet_mode)

Earth


In [187]:
train['HomePlanet'] = train['HomePlanet'].fillna(homeplanet_mode)
test['HomePlanet'] = test['HomePlanet'].fillna(homeplanet_mode)

In [188]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CryoSleep,217,2.5
Cabin,199,2.29
Destination,182,2.09
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39
Spa,183,2.11
VRDeck,188,2.16


In [189]:
missing_values = functions.MissingValues(test)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CryoSleep,93,2.17
Cabin,100,2.34
Destination,92,2.15
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36
VRDeck,80,1.87


# **Cabin Deck Imputation**

In [192]:
train.pivot_table(index='CabinDeck', columns='HomePlanet', aggfunc='size', fill_value=0)

HomePlanet,Earth,Europa,Mars
CabinDeck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,255,0
B,0,779,0
C,1,746,0
D,1,192,285
E,404,133,339
F,1652,0,1142
G,2559,0,0
T,0,5,0


In [193]:
test.pivot_table(index='CabinDeck', columns='HomePlanet', aggfunc='size', fill_value=0)

HomePlanet,Earth,Europa,Mars
CabinDeck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,98,0
B,3,359,0
C,1,354,0
D,2,114,126
E,194,69,184
F,832,0,613
G,1222,0,0
T,0,6,0


Passengers on decks A, B, C, T came from Europa </BR>
Passengers on deck G came from Earth.</BR>
Passengers on decks D, E or F came from multiple planets.</BR>

### **Passengers from Earth on CabinDeck G**

For train set, we went down from 199 to 100. For test set, we went from 100 to 38.

In [194]:
# Fill missing CabinDeck values with 'G' where HomePlanet is 'Earth'
train.loc[(train['HomePlanet'] == 'Earth') & (train['CabinDeck'].isnull()), 'CabinDeck'] = 'G'
test.loc[(test['HomePlanet'] == 'Earth') & (test['CabinDeck'].isnull()), 'CabinDeck'] = 'G'

In [195]:
functions.MissingValues(train)

Unnamed: 0,NumberMissing,PercentageMissing
CryoSleep,217,2.5
CabinDeck,100,1.15
CabinNumber,199,2.29
CabinSide,199,2.29
Destination,182,2.09
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39


In [196]:
functions.MissingValues(test)

Unnamed: 0,NumberMissing,PercentageMissing
CryoSleep,93,2.17
CabinDeck,38,0.89
CabinNumber,100,2.34
CabinSide,100,2.34
Destination,92,2.15
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29


### **Fill CabinDeck Missing Values with LastName Information**

For train, we went from 100 to 52. For test, we went from 38 to 0.

In [197]:
lastname_cabindeck_map = train.groupby('LastName')['CabinDeck'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

def fill_missing_cabindeck_by_lastname(row):
    if pd.isna(row['CabinDeck']):
        return lastname_homeplanet_map.get(row['LastName'], row['CabinDeck'])
    else:
        return row['CabinDeck']

train['CabinDeck'] = train.apply(fill_missing_cabindeck_by_lastname, axis=1)

In [198]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CryoSleep,217,2.5
CabinDeck,52,0.6
CabinNumber,199,2.29
CabinSide,199,2.29
Destination,182,2.09
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39


In [199]:
lastname_cabindeck_map = test.groupby('LastName')['CabinDeck'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

def fill_missing_cabindeck_by_lastname(row):
    if pd.isna(row['CabinDeck']):
        return lastname_homeplanet_map.get(row['LastName'], row['CabinDeck'])
    else:
        return row['CabinDeck']

test['CabinDeck'] = test.apply(fill_missing_cabindeck_by_lastname, axis=1)

In [200]:
functions.MissingValues(test)

Unnamed: 0,NumberMissing,PercentageMissing
CryoSleep,93,2.17
CabinNumber,100,2.34
CabinSide,100,2.34
Destination,92,2.15
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36


## **Fill CabinDeck Missing Values with Mode**

In [204]:
# Get the mode (most frequent value) of 'HomePlanet'
cabindeck_mode = train['CabinDeck'].mode()[0]
print(cabindeck_mode)

F


In [202]:
train['CabinDeck'] = train['CabinDeck'].fillna(homeplanet_mode)
test['CabinDeck'] = test['CabinDeck'].fillna(homeplanet_mode)

In [203]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CryoSleep,217,2.5
CabinNumber,199,2.29
CabinSide,199,2.29
Destination,182,2.09
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39
Spa,183,2.11


In [167]:
missing_values = functions.MissingValues(test)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
CryoSleep,93,2.17
CabinNumber,100,2.34
CabinSide,100,2.34
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36
VRDeck,80,1.87
