# <center> **Kaggle’s Spaceship Titanic Competition**
# <center> **Feature Engineering**

# **Libraries**

In [1]:
import importlib
import warnings

import numpy as np
import pandas as pd

import functions

# Reload the local `functions` module so that edits made to functions.py
# take effect without restarting the kernel.
importlib.reload(functions)

<module 'functions' from 'c:\\Users\\Dell\\Documents\\AI\\Titanic\\Notebooks\\functions.py'>

# **Load Data**

In [730]:
# Locations of the competition's train and test splits on local disk.
train_csv = r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\train.csv"
test_csv = r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\test.csv"

# index_col=False tells pandas not to promote the first column to the index.
train, test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (train_csv, test_csv)
)

# Notebook-level constants.
random_state, target = 101, 'Transported'

In [731]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## **Display Features**

In [732]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

# **Feature Engineering**

## **Combine Train and Test Data**

In [733]:
# Separate the label (as 0/1 integers) from the training features.
y = train['Transported'].astype(int)
X = train.drop(columns='Transported')

# Stack the training features on top of the test set; ignore_index discards
# the original row labels and numbers the combined frame from 0.
data = pd.concat([X, test], axis=0, ignore_index=True)



In [734]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [735]:
# Report (rows, columns) of the combined train + test frame.
combined_shape = data.shape
print('Combined Dataset Shape:', combined_shape)

Combined Dataset Shape: (12970, 13)


## **PassengerId**

Extract Group and Group Size from Passenger ID

In [736]:
# The group number is the part of PassengerId before the underscore.
data['Group'] = data['PassengerId'].str.split('_').str[0].astype(int)

# Count how many passengers carry each group number and attach that count to every row.
group_sizes = data['Group'].value_counts()
data['GroupSize'] = data['Group'].map(group_sizes)

In [737]:
# Move the two new columns to positions 1 and 2, right after PassengerId.
for position, name in enumerate(['Group', 'GroupSize'], start=1):
    data.insert(position, name, data.pop(name))

## **Single Traveler**

In [738]:
# 1 when the passenger is the only member of their group, otherwise 0.
data['SingleTraveler'] = data['GroupSize'].eq(1).astype('int64')

In [739]:
# Place the flag at position 3, next to the other group-derived columns.
data.insert(3, 'SingleTraveler', data.pop('SingleTraveler'))

In [740]:
# Subset of passengers flagged as single travelers, and its (rows, columns).
is_single = data['SingleTraveler'] == 1
singles = data.loc[is_single]
singles.shape

(7145, 16)

## **Cabin**

In [741]:
# Cabin is formatted as deck/number/side; break it into one column per part.
cabin_fields = ['CabinDeck', 'CabinNumber', 'CabinSide']
cabin_parts = data['Cabin'].str.split('/', expand=True)
data[cabin_fields] = cabin_parts

In [742]:
# Move the three cabin parts to positions 5, 6 and 7.
for position, name in enumerate(['CabinDeck', 'CabinNumber', 'CabinSide'], start=5):
    data.insert(position, name, data.pop(name))

# The raw Cabin string is no longer needed once its parts are separate columns.
data.drop(columns='Cabin', inplace=True)

## **Name**

In [743]:
# Split Name into first and last name.
# n=1 splits on the first space only, so the result never has more than two
# columns. An unbounded split would produce a third column for any name with
# more than two words, and the two-column assignment would then raise ValueError.
data[['FirstName', 'LastName']] = data['Name'].str.split(' ', n=1, expand=True)

In [744]:
# Only LastName is kept; the full name and the first name are discarded.
data.drop(columns=['Name', 'FirstName'], inplace=True)

## **Total Spent**

In [745]:
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Treat a missing amount as zero spending, then add the five amenities up per passenger.
data[spend_cols] = data[spend_cols].fillna(0)
data['TotalSpent'] = data[spend_cols].sum(axis=1)

In [746]:
# Place TotalSpent at position 17, directly after the individual spending columns.
data.insert(17, 'TotalSpent', data.pop('TotalSpent'))

## **Spent (True/False)**

In [747]:
# 1 when the passenger spent anything at all, otherwise 0.
data['Spent'] = data['TotalSpent'].gt(0).astype('int64')

In [748]:
# Place the flag at position 18, directly after TotalSpent.
data.insert(18, 'Spent', data.pop('Spent'))

## **Age Groups**

In [749]:
# Bucket Age into decade-wide bands.
# pd.cut uses right-closed intervals by default, so the edges below give
# (-inf, 10] -> '0-10', (10, 20] -> '11-20', ..., (70, 80] -> '71-80'.
# Missing ages and ages above 80 are left as NaN.
# This replaces seeding the column with float NaN and then writing strings
# into it, which is an incompatible-dtype assignment that recent pandas
# versions deprecate.
age_edges = [-np.inf, 10, 20, 30, 40, 50, 60, 70, 80]
age_labels = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80']

# astype(object) turns the categorical result into plain string labels.
data['AgeGroup'] = pd.cut(data['Age'], bins=age_edges, labels=age_labels).astype(object)

In [750]:
# Place AgeGroup at position 11, directly after Age.
data.insert(11, 'AgeGroup', data.pop('AgeGroup'))

## **Family Size**

In [751]:
# Replace missing last names with a placeholder.
# The result is assigned back rather than calling fillna(inplace=True) on
# data['LastName']: that in-place call acts on an intermediate object and
# leaves `data` unchanged when pandas Copy-on-Write is active.
data['LastName'] = data['LastName'].fillna('Unknown')

# FamilySize = number of passengers sharing the same last name.
# NOTE(review): all passengers with a missing name share the 'Unknown'
# placeholder and are therefore counted as one large "family" — confirm
# this is intended.
family_sizes = data['LastName'].value_counts()
data['FamilySize'] = data['LastName'].map(family_sizes)

## **Lone Traveler**

In [752]:
# 1 when nobody else shares the passenger's last name, otherwise 0.
data['LoneTraveler'] = data['FamilySize'].eq(1).astype('int64')

# Subset of passengers flagged as lone travelers, and its (rows, columns).
is_lone = data['LoneTraveler'] == 1
lone_traveler = data.loc[is_lone]
lone_traveler.shape

(208, 23)

## **Single Traveler and Lone Traveler**

Single Traveler (group of one) but not Lone Traveler (last name shared with at least one other passenger)

In [753]:
# Passengers who travel in a group of one yet share a last name with someone else.
single_not_lone = data['SingleTraveler'].eq(1) & data['LoneTraveler'].eq(0)
filtered_rows = data.loc[single_not_lone]
filtered_rows.shape

(6962, 23)

Lone Traveler (unique last name) but not Single Traveler (group of two or more)

In [754]:
# Passengers with a unique last name who nevertheless travel in a multi-person group.
lone_not_single = data['SingleTraveler'].eq(0) & data['LoneTraveler'].eq(1)
filtered_rows = data.loc[lone_not_single]
filtered_rows.shape


(25, 23)

## **Feature Engineered Dataset**

In [755]:
# Preview the first five rows of the engineered frame.
data.head(n=5)

Unnamed: 0,PassengerId,Group,GroupSize,SingleTraveler,HomePlanet,CabinDeck,CabinNumber,CabinSide,CryoSleep,Destination,Age,AgeGroup,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpent,Spent,LastName,FamilySize,LoneTraveler
0,0001_01,1,1,1,Europa,B,0,P,False,TRAPPIST-1e,39.0,31-40,False,0.0,0.0,0.0,0.0,0.0,0.0,0,Ofracculy,3,0
1,0002_01,2,1,1,Earth,F,0,S,False,TRAPPIST-1e,24.0,21-30,False,109.0,9.0,25.0,549.0,44.0,736.0,1,Vines,4,0
2,0003_01,3,2,0,Europa,A,0,S,False,TRAPPIST-1e,58.0,51-60,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,1,Susent,7,0
3,0003_02,3,2,0,Europa,A,0,S,False,TRAPPIST-1e,33.0,31-40,False,0.0,1283.0,371.0,3329.0,193.0,5176.0,1,Susent,7,0
4,0004_01,4,1,1,Earth,F,1,S,False,TRAPPIST-1e,16.0,11-20,False,303.0,70.0,151.0,565.0,2.0,1091.0,1,Santantines,9,0


## **Save Dataset**

In [756]:
# Write the engineered frame to disk; index=False leaves the row index out of the file.
output_csv = r"C:\Users\Dell\Documents\AI\Titanic\Data\data.csv"
data.to_csv(output_csv, index=False)