<a href="https://colab.research.google.com/github/MStamirski/Spaceship-Titanic/blob/main/FeaturesEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install category_encoders

In [None]:
import pandas as pd
import numpy as np
import category_encoders as ce

In [None]:
def read_data(filename):
  """Load a CSV file into a DataFrame.

  `filename` is given without its extension; ".csv" is appended here
  before the file is read with `pd.read_csv`.
  """
  csv_path = filename + ".csv"
  return pd.read_csv(csv_path)

In [None]:
def data_cleaning(df):
  """Fill missing values and normalise the flag columns of the passenger table.

  The columns of `df` are overwritten one by one and the same frame is
  returned, so existing callers that rely on either the return value or the
  in-place effect keep working.

  Rules applied:
    - HomePlanet, Destination: missing -> 'Unknown'.
    - CryoSleep, VIP: missing -> 'Unknown', truthy -> 'True', falsy -> 'False'.
    - Cabin: missing -> '0/000/0', a placeholder in the same three-part
      layout as real cabin values.
    - Age: missing or 0 -> 24 (the mode, per the original author's note).
    - RoomService, FoodCourt, ShoppingMall, Spa, VRDeck: missing -> 0.

  Columns are assigned back explicitly instead of calling
  `df[col].fillna(..., inplace=True)`: that chained in-place form works on a
  temporary object under pandas Copy-on-Write and would leave `df` unchanged.
  """
  default_age = 24   # mode = 24

  def flag_to_label(value):
    # Test for missing first: NaN is truthy, None is falsy and pd.NA cannot
    # be coerced to bool, so plain truthiness would misclassify or raise.
    if pd.isna(value):
      return 'Unknown'
    return 'True' if value else 'False'

  for col in ('HomePlanet', 'Destination'):
    df[col] = df[col].fillna('Unknown')

  for col in ('CryoSleep', 'VIP'):
    df[col] = df[col].apply(flag_to_label)

  df['Cabin'] = df['Cabin'].fillna('0/000/0')

  # An age of 0 is treated the same way as a missing age.
  age = df['Age'].fillna(default_age)
  df['Age'] = age.mask(age == 0, default_age)

  for col in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    df[col] = df[col].fillna(0)

  return df

In [None]:
def calculate_deciles(col):
  """Return the ten decile boundaries of `col` as a list.

  The list holds the 10th, 20th, ..., 100th percentile in that order, each
  computed with `np.quantile`.
  """
  boundaries = []
  for percent in range(10, 101, 10):
    boundaries.append(np.quantile(col, percent/100))
  return boundaries

def get_decile(x, deciles):
  """Return the 1-based position of the first boundary in `deciles` that is >= `x`.

  `deciles` is scanned in the order given. If `x` is greater than every
  boundary, or cannot be compared as `<=` to any of them (e.g. NaN), the
  position of the last boundary is returned.

  Raises:
    ValueError: if `deciles` is empty, since there is no bucket to assign.
  """
  rank = 0
  for rank, upper_bound in enumerate(deciles, start=1):
    if x <= upper_bound:
      return rank
  if rank == 0:
    # Without this guard an empty sequence would fail with an obscure
    # unbound-variable error instead of a meaningful message.
    raise ValueError("deciles must contain at least one boundary")
  return rank

In [None]:
def features_extraction(df):
  """Derive cabin and decile features, then drop the raw columns they came from.

  `df` is modified in place and also returned.

  Added columns:
    - Cabin_deck: first character of Cabin.
    - Cabin_side: last character of Cabin.
    - Cabin_num/100: the middle part of Cabin as an integer, divided by 100
      and rounded with Python's `round`.
    - Cabin_persons: number of rows sharing the same Cabin value.
    - Age_deciles, RService_deciles, FCourt_deciles, ShMall_deciles,
      Spa_deciles, VRD_deciles: decile bucket (1-based) of the matching
      numeric column, using boundaries computed from that same column.

  Dropped columns: Name, Cabin, Age, RoomService, FoodCourt, ShoppingMall,
  Spa, VRDeck.

  Cabin must have no missing values and is sliced positionally ([:1], [2:-2],
  [-1:]), which assumes a one-character deck and side around the number, as
  in the '0/000/0' placeholder.
  """
  cabin = df['Cabin']

  df['Cabin_deck'] = cabin.str[:1]
  df['Cabin_side'] = cabin.str[-1:]
  # Kept as a Python-level round() so the rounding of exact halves is unchanged.
  df['Cabin_num/100'] = cabin.apply(lambda x: int(round(int(x[2:-2])/100, 0)))

  # Count occupants once and look each row up, rather than recomputing
  # value_counts() for every row.
  df['Cabin_persons'] = cabin.map(cabin.value_counts())

  # source column -> name of the decile feature derived from it
  decile_features = {
    'Age': 'Age_deciles',
    'RoomService': 'RService_deciles',
    'FoodCourt': 'FCourt_deciles',
    'ShoppingMall': 'ShMall_deciles',
    'Spa': 'Spa_deciles',
    'VRDeck': 'VRD_deciles',
  }
  for source, feature in decile_features.items():
    deciles = calculate_deciles(df[source])
    df[feature] = df[source].apply(lambda x: get_decile(x, deciles))

  df.drop(columns=['Name', 'Cabin', 'Age', 'RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], inplace=True)

  return df

In [None]:
def get_features(filename):
  """Run the full pipeline for one file: read, clean, extract features.

  `filename` is passed to `read_data` unchanged; the frame it returns goes
  through `data_cleaning` and then `features_extraction`, and the result of
  the last step is returned.
  """
  raw = read_data(filename)
  cleaned = data_cleaning(raw)
  return features_extraction(cleaned)

In [None]:
def categories_one_hot_encoding(df):
  """Return a copy of `df` with the categorical feature columns one-hot encoded.

  The listed columns are expanded with `pd.get_dummies`; all other columns
  are carried over as they are. The input frame is not modified.
  """
  categorical = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Cabin_deck', 'Cabin_side', 'Cabin_num/100', 'Cabin_persons',
    'Age_deciles', 'RService_deciles', 'FCourt_deciles',
    'ShMall_deciles', 'Spa_deciles', 'VRD_deciles',
  ]
  encoded = pd.get_dummies(df.copy(), columns=categorical)
  return encoded

In [None]:
def calculate_targets(dataset, col):
  """Map each distinct value of `dataset[col]` to its share of transported rows.

  For every value, the result is the number of rows with that value and
  `Transported == True`, divided by the number of rows with that value.
  Keys appear in the order the values are first seen in the column.
  """
  is_transported = dataset['Transported']==True
  rates = {}
  for label in dataset[col].unique().tolist():
    in_class = dataset[col]==label
    transported_count = len(dataset[in_class & is_transported])
    class_count = len(dataset[in_class])
    rates[label] = transported_count / class_count
  return rates

def categories_target_encoding(df):
  """Return a copy of `df` with each categorical column replaced by its target rate.

  For every listed column a `<column>_transformed` column is added, holding
  the value produced by `calculate_targets` for that row's category; the
  original categorical columns are then removed. The input frame is not
  modified.
  """
  categorical = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Cabin_deck', 'Cabin_side', 'Cabin_num/100', 'Cabin_persons',
    'Age_deciles', 'RService_deciles', 'FCourt_deciles',
    'ShMall_deciles', 'Spa_deciles', 'VRD_deciles',
  ]
  encoded = df.copy()
  for name in categorical:
    rate_by_class = calculate_targets(encoded, name)
    encoded[name+'_transformed'] = encoded[name].apply(lambda value: rate_by_class[value])
  encoded.drop(columns=categorical, inplace=True)
  return encoded

In [None]:
def categories_leave_one_out_encoding(df, sigma=0.05, random_state=None):
  """Leave-one-out encode the categorical feature columns against `Transported`.

  The listed columns are cast to `str`, fitted and transformed with
  `ce.LeaveOneOutEncoder`, and renamed to `<column>_transformed`. The result
  contains `PassengerId`, `Transported` and the encoded columns, joined on
  the index. The input frame is not modified.

  Args:
    df: frame holding the categorical columns plus `PassengerId` and
      `Transported`.
    sigma: forwarded to `LeaveOneOutEncoder(sigma=...)`. The default of 0.05
      is the value this function always used.
    random_state: forwarded to `LeaveOneOutEncoder(random_state=...)`. Pass
      an integer to make the encoding repeatable between runs; the default
      `None` leaves it unseeded, as before.

  Returns:
    A new DataFrame with `PassengerId`, `Transported` and one
    `<column>_transformed` column per categorical feature.
  """
  dataset = df.copy()
  cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_deck', 'Cabin_side', 'Cabin_num/100', 'Cabin_persons', 'Age_deciles', 'RService_deciles', 'FCourt_deciles', 'ShMall_deciles', 'Spa_deciles', 'VRD_deciles']
  loo_encoder = ce.LeaveOneOutEncoder(sigma=sigma, random_state=random_state)
  transformed = loo_encoder.fit_transform(dataset[cols].astype(str), dataset['Transported'])

  # Suffix the encoded columns so they cannot be confused with the raw categories.
  newcols = {col: col+'_transformed' for col in cols}
  transformed = transformed.rename(columns=newcols)

  dataset = dataset[['PassengerId', 'Transported']].join(transformed)
  return dataset