# <center> **Kaggle’s Spaceship Titanic Competition**
# <center> **Feature Engineering**

# **Libraries**

In [122]:
import pandas as pd
import numpy as np
import warnings

# **Load Data**

In [123]:
# Locations of the competition CSV files on the local machine.
train_path = r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\train.csv"
test_path = r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\test.csv"

# index_col=False tells pandas not to use the first column as the index.
train = pd.read_csv(train_path, index_col=False)
test = pd.read_csv(test_path, index_col=False)

# Shared constants: the random seed and the name of the label column.
random_state = 101
target = 'Transported'

## **Display Features**

In [124]:
%matplotlib inline

# Raise pandas' display limits so long/wide frames and long cell values are
# printed with far less truncation in the notebook output.
display_settings = {
    "display.max_rows": 300000,
    "display.max_columns": 999,
    "display.max_colwidth": 500,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# Silence warnings for the rest of the session. The first call already
# ignores every category; the FutureWarning filter is kept for explicitness.
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

# **Combine Train and Test Data**

In [125]:
# Stack train on top of test and renumber the rows 0..n-1 so that every
# feature below is engineered on both sets at once.
data = pd.concat([train, test], axis=0, ignore_index=True)

# **Extract Information from PassengerId**

## **Group and Group Size**

In [126]:
# The text before the first underscore of PassengerId is the group number.
data['Group'] = data['PassengerId'].str.split('_').str[0].astype(int)

# GroupSize = how many passengers (train + test) share the same Group.
group_sizes = data['Group'].value_counts()
data['GroupSize'] = data['Group'].map(group_sizes)

In [127]:
# Move the new columns right after PassengerId: Group -> 1, GroupSize -> 2.
for position, name in enumerate(['Group', 'GroupSize'], start=1):
    data.insert(position, name, data.pop(name))

## **Solo Traveler**

In [128]:
# 1 when the passenger is the only member of their group, otherwise 0.
data['SoloTraveler'] = (data['GroupSize'] == 1).astype('int64')

In [129]:
# Place SoloTraveler at column position 3, next to the other group features.
data.insert(3, 'SoloTraveler', data.pop('SoloTraveler'))

### **Insights**

> * **Group:** Extracted the group number (the part of PassengerId before the underscore).
> * **Group Size:** Counted the number of passengers sharing each Group.
> * **Solo Traveler:** Flagged passengers whose GroupSize == 1 as solo travelers.

# **Extract Information from Cabin**

In [130]:
# Cabin is split on '/' into three parts: deck, number and side.
# All three pieces stay as strings; rows with a missing Cabin stay missing.
cabin_parts = data['Cabin'].str.split('/', expand=True)
data[['CabinDeck', 'CabinNumber', 'CabinSide']] = cabin_parts

In [131]:
# Move the three cabin features to column positions 5, 6 and 7 (in that order).
cabin_features = ['CabinDeck', 'CabinNumber', 'CabinSide']
for position, name in enumerate(cabin_features, start=5):
    data.insert(position, name, data.pop(name))

# The raw Cabin string is fully represented by its parts now.
data.drop(columns='Cabin', inplace=True)

### **Insights**

> * **Cabin Deck:** Extracted CabinDeck from Cabin.
> * **Cabin Number:** Extracted CabinNumber from Cabin.
> * **Cabin Side:** Extracted CabinSide from Cabin.

# **Extract Information from Name**

## **Last Name**

In [132]:
# Split Name on spaces into first and last name.
# NOTE(review): this assumes every name has exactly two space-separated
# parts; a name with more parts would make the assignment fail - confirm.
name_parts = data['Name'].str.split(' ', expand=True)
data[['FirstName', 'LastName']] = name_parts

In [133]:
# Only LastName is used from here on; drop the raw name and the first name.
data.drop(columns=['Name', 'FirstName'], inplace=True)

## **Family Size**

In [134]:
# Passengers without a recorded name get the placeholder surname 'Unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['LastName'] = data['LastName'].fillna('Unknown')

# FamilySize = number of passengers (train + test) sharing the same LastName.
# NOTE: every 'Unknown' row shares one placeholder surname, so those rows all
# receive the total count of unnamed passengers as their FamilySize.
family_sizes = data['LastName'].value_counts()
data['FamilySize'] = data['LastName'].map(family_sizes)

## **Lone Traveler**

In [135]:
# 1 when no other passenger shares this passenger's last name, otherwise 0.
data['LoneTraveler'] = (data['FamilySize'] == 1).astype('int64')

# Show how many passengers were flagged as lone travelers.
lone_traveler = data[data['LoneTraveler'] == 1]
lone_traveler.shape[0]

208

### **Insights**

> * **Last Name:** Extracted LastName from Name.
> * **Family Size:** Counted the number of passengers sharing each LastName.
> * **Lone Traveler:** Flagged passengers whose FamilySize == 1 (no other passenger shares their last name).

# **Extract Information from Luxury Item Expenditure**

## **Total Expenditure**

In [136]:
# The five luxury-amenity spending columns.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# A missing amount is treated as no spending, then the five amounts are
# added up per passenger.
data[spend_columns] = data[spend_columns].fillna(0)
data['TotalExpenditure'] = data[spend_columns].sum(axis=1)

In [137]:
# Place TotalExpenditure at column position 17.
data.insert(17, 'TotalExpenditure', data.pop('TotalExpenditure'))

## **Expenditure (True/False)**

In [138]:
# 1 when the passenger spent anything at all, otherwise 0.
data['Expenditure'] = (data['TotalExpenditure'] > 0).astype('int64')

In [139]:
# Place the Expenditure flag at column position 18.
data.insert(18, 'Expenditure', data.pop('Expenditure'))

### **Insights**

> * **Total Expenditure:** Extracted TotalExpenditure from luxury item spending amounts.
> * **Expenditure (True/False):** Derived a binary Expenditure flag from TotalExpenditure (1 if the passenger spent anything, 0 otherwise).

# **Age Groups**

In [140]:
# Replace missing ages with the median age of the combined train + test data.
median_age = data['Age'].median()
data['Age'] = data['Age'].fillna(median_age).astype('float64')

In [141]:
# Bucket Age into eight labelled ranges with a single pd.cut call.
# Each bin is right-inclusive, i.e. (-inf, 10], (10, 20], ..., (70, 80],
# which matches the "> lower and <= upper" rule for each label.
# Ages above 80 fall outside every bin and are left as NaN.
age_bin_edges = [-np.inf, 10, 20, 30, 40, 50, 60, 70, 80]
age_bin_labels = ['0-10', '11-20', '21-30', '31-40',
                  '41-50', '51-60', '61-70', '71-80']

# Building the column in one step avoids creating a float NaN column and then
# writing strings into it, which newer pandas flags as an incompatible-dtype
# assignment. astype(object) keeps plain string values instead of a
# categorical dtype.
data['AgeGroup'] = pd.cut(
    data['Age'], bins=age_bin_edges, labels=age_bin_labels
).astype(object)

In [142]:
# Place AgeGroup at column position 11.
data.insert(11, 'AgeGroup', data.pop('AgeGroup'))

### **Insights**

> * **Age Groups:** Divided the Age data into eight 10-year ranges (0-10 through 71-80).

# **Feature Engineered Dataset**

In [143]:
# Re-insert the label column at position 23.
# NOTE(review): presumably this is the last position of the engineered
# frame - confirm against the final column count.
data.insert(23, 'Transported', data.pop('Transported'))

In [145]:
# Write the feature-engineered frame to disk without the row index.
output_path = r"C:\Users\Dell\Documents\AI\Titanic\Data\data.csv"
data.to_csv(output_path, index=False)