# <center> **Titanic**

# **Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt


import functions
import importlib

import warnings

importlib.reload(functions)

<module 'functions' from 'c:\\Users\\Dell\\Documents\\AI\\Titanic\\Notebooks\\functions.py'>

# **Data Overview and Preprocessing**

In [2]:
train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\train.csv",
    index_col=False
)

test = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\test.csv",
    index_col=False
)

random_state = 101
target = 'Transported'

## **Missing Data**

### **Train Set Missing Values**

In [3]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,201,2.31
CryoSleep,217,2.5
Cabin,199,2.29
Destination,182,2.09
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39
Spa,183,2.11


### **Test Set Missing Values**

In [4]:
missing_values = functions.MissingValues(test)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,87,2.03
CryoSleep,93,2.17
Cabin,100,2.34
Destination,92,2.15
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36


# **Information Extraction**

## **PassengerId**

In [5]:
train['Group'] = train['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)
test['Group'] = test['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)

In [6]:
column = train.pop('Group')
train.insert(1, 'Group', column)

column = test.pop('Group')
test.insert(1, 'Group', column)

## **Name**

In [7]:
train[['FirstName', 'LastName']] = train['Name'].str.split(' ', expand=True)
test[['FirstName', 'LastName']] = test['Name'].str.split(' ', expand=True)

In [8]:
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

## **Fill Destination Missing Values with Group Number Information**

In [9]:
num_unique_groups = train['Group'].nunique()
print(num_unique_groups)

6217


In [10]:
unique_destinationplanets_per_group = train.groupby('Group')['HomePlanet'].nunique()
groups_with_multiple_planets = unique_destinationplanets_per_group[unique_destinationplanets_per_group > 1]

print(groups_with_multiple_planets)

Series([], Name: HomePlanet, dtype: int64)


In [12]:
# Step 1: Group by 'Group' and find the most common (or only) HomePlanet
group_destinationplanet_map = train.groupby('Group')['Destination'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

# Step 2: Define a function to fill missing HomePlanet based on the group's identified HomePlanet
def fill_missing_destinationplanet(row):
    if pd.isna(row['Destination']):
        return group_destinationplanet_map.get(row['Group'], row['Destination'])
    else:
        return row['Destination']

# Step 3: Apply the function to fill missing values in the HomePlanet column
train['Destination'] = train.apply(fill_missing_destinationplanet, axis=1)

In [13]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,201,2.31
CryoSleep,217,2.5
Cabin,199,2.29
Destination,103,1.18
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39
Spa,183,2.11


In [14]:
group_destinationplanet_map = test.groupby('Group')['Destination'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

def fill_missing_destinationplanet(row):
    if pd.isna(row['Destination']):
        return group_destinationplanet_map.get(row['Group'], row['Destination'])
    else:
        return row['Destination']

test['Destination'] = test.apply(fill_missing_destinationplanet, axis=1)

In [15]:
missing_values = functions.MissingValues(test)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,87,2.03
CryoSleep,93,2.17
Cabin,100,2.34
Destination,51,1.19
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36


## **Fill Destination Missing Values with LastName Information**

In [16]:
lastname_destinationplanet_map = train.groupby('LastName')['Destination'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

def fill_missing_destinationplanet_by_lastname(row):
    if pd.isna(row['Destination']):
        return lastname_destinationplanet_map.get(row['LastName'], row['Destination'])
    else:
        return row['Destination']

train['Destination'] = train.apply(fill_missing_destinationplanet_by_lastname, axis=1)

In [17]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,201,2.31
CryoSleep,217,2.5
Cabin,199,2.29
Destination,9,0.1
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39
Spa,183,2.11


In [18]:
lastname_destinationplanet_map = test.groupby('LastName')['Destination'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)

def fill_missing_destinationplanet_by_lastname(row):
    if pd.isna(row['Destination']):
        return lastname_destinationplanet_map.get(row['LastName'], row['Destination'])
    else:
        return row['Destination']

test['Destination'] = test.apply(fill_missing_destinationplanet_by_lastname, axis=1)

In [19]:
missing_values = functions.MissingValues(test)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,87,2.03
CryoSleep,93,2.17
Cabin,100,2.34
Destination,14,0.33
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36


## **Fill Home Planet Missing Values with Mode**

In [22]:
desinationplanet_mode = train['Destination'].mode()[0]
print(desinationplanet_mode)

TRAPPIST-1e


In [24]:
train['Destination'] = train['Destination'].fillna(train['Destination'].mode()[0])
test['Destination'] = test['Destination'].fillna(train['Destination'].mode()[0])

In [25]:
missing_values = functions.MissingValues(train)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,201,2.31
CryoSleep,217,2.5
Cabin,199,2.29
Age,179,2.06
VIP,203,2.34
RoomService,181,2.08
FoodCourt,183,2.11
ShoppingMall,208,2.39
Spa,183,2.11
VRDeck,188,2.16


In [26]:
missing_values = functions.MissingValues(test)
missing_values

Unnamed: 0,NumberMissing,PercentageMissing
HomePlanet,87,2.03
CryoSleep,93,2.17
Cabin,100,2.34
Age,91,2.13
VIP,93,2.17
RoomService,82,1.92
FoodCourt,106,2.48
ShoppingMall,98,2.29
Spa,101,2.36
VRDeck,80,1.87
