In [1]:
import pandas as pd 
import numpy as np 
from sklearn.base import BaseEstimator, TransformerMixin

# Load Data set 

In [2]:
df_tr = pd.read_csv("../res/train.csv")
df_test = pd.read_csv("../res/test.csv")

In [42]:
df_tr.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Fill Na 

In [44]:
# list of columns with missing values in train and test
df_tr.isna().any()

PassengerId     False
HomePlanet       True
CryoSleep        True
Cabin            True
Destination      True
Age              True
VIP              True
RoomService      True
FoodCourt        True
ShoppingMall     True
Spa              True
VRDeck           True
Name             True
Transported     False
dtype: bool

In [9]:
df_tr

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [62]:
# Fit
df_tr2 = df_tr.copy() 
df_tr2['GroupId'] = df_tr2['PassengerId'].str[0:4].astype(int)
df_tr2 = df_tr2[['GroupId', 'HomePlanet']]
df_tr2 = df_tr2.dropna()
mapping = df_tr2.groupby("GroupId").agg("first")['HomePlanet']

In [94]:
type(mapping)

pandas.core.series.Series

In [92]:
# Transform
df_tr3 = df_tr.copy()
df_tr3['GroupId'] = df_tr3['PassengerId'].str[0:4].astype(int)
df_tr3 = df_tr3[['PassengerId', 'GroupId', 'HomePlanet']]
df_tr3 = df_tr3[df_tr3['HomePlanet'].isna()]
df_tr3['HomePlanet'] = df_tr3['GroupId'].map(mapping)
df_tr2.loc[df_tr3.index,'HomePlanet'] = df_tr3['HomePlanet'].values

## Missing Value Strategy 
- Median :  Age, RoomService, FoodCourt, ShoppingMall , Spa , VRDeck
- Unknown Values : 

In [None]:
class ImputerHomePlanet(BaseEstimator,TransformerMixin): 
    def __init__(self) -> None:
        self.src_col = "HomePlanet"
        self.mapping = pd.Series()
        super().__init__()
    
    
    def fit(self, X, Y=None): 
        X = X.copy() 
        X['GroupId'] = X['PassengerId'].str[0:4].astype(int)
        X = X[['GroupId', self.src_col]]
        X = X.dropna()
        self.mapping = X.groupby("GroupId").agg("first")[self.src_col]
        return self
    
    def transform(self, X, y=None): 
        X = X.copy()
        X_na = X.copy()
        X_na['GroupId'] = X_na['PassengerId'].str[0:4].astype(int)
        X_na = X_na[['GroupId', self.src_col]]
        X_na = X_na[X_na[self.src_col].isna()]
        X_na[self.src_col] = X_na['GroupId'].map(self.mapping)
        X.loc[X_na.index,self.src_col] = X_na[self.src_col].values
        return X

In [None]:
class ImputerDestination(BaseEstimator,TransformerMixin): 
    def __init__(self) -> None:
        self.src_col = "Destination"
        self.mapping = pd.Series()
        super().__init__()
    
    
    def fit(self, X, Y=None): 
        X = X.copy() 
        X['GroupId'] = X['PassengerId'].str[0:4].astype(int)
        X = X[['GroupId', self.src_col]]
        X = X.dropna()
        self.mapping = X.groupby("GroupId").agg("first")[self.src_col]
        return self
    
    def transform(self, X, y=None): 
        X = X.copy()
        X_na = X.copy()
        X_na['GroupId'] = X_na['PassengerId'].str[0:4].astype(int)
        X_na = X_na[['GroupId', self.src_col]]
        X_na = X_na[X_na[self.src_col].isna()]
        X_na[self.src_col] = X_na['GroupId'].map(self.mapping)
        X.loc[X_na.index,self.src_col] = X_na[self.src_col].values
        return X

In [None]:
na_mean_features = ['Age','RoomService', 'FoodCourt','ShoppingMall','Spa','VRDeck'] 
na_unknown_features = ['HomePlanet','Destination','Deck','Num','Side']
na_default_features = ['CryoSleep', 'VIP']
na_constant_features = ['Num']

mean_missing_tr = SimpleImputer(missing_values = np.nan, strategy = 'mean')
unknown_missing_tr = SimpleImputer(missing_values = np.nan, strategy = 'constant', fill_value = 'Unknown')
default_tr = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

ct_na = ColumnTransformer(transformers = [('mean', mean_missing_tr , na_mean_features),
                                          ('unknown', unknown_missing_tr, na_unknown_features), 
                                          ('default', default_tr, na_default_features)], 
                       remainder = 'passthrough', verbose_feature_names_out = False) 
ct_na.set_output(transform='pandas')

In [130]:
fit_column = ['PassengerId', 'HomePlanet','Destination', 'Cabin','Age','Name']
df_fit = pd.concat([df_tr[fit_column], df_test[fit_column]])
df_fit

Unnamed: 0,PassengerId,HomePlanet,Destination,Cabin,Age,Name
0,0001_01,Europa,TRAPPIST-1e,B/0/P,39.0,Maham Ofracculy
1,0002_01,Earth,TRAPPIST-1e,F/0/S,24.0,Juanna Vines
2,0003_01,Europa,TRAPPIST-1e,A/0/S,58.0,Altark Susent
3,0003_02,Europa,TRAPPIST-1e,A/0/S,33.0,Solam Susent
4,0004_01,Earth,TRAPPIST-1e,F/1/S,16.0,Willy Santantines
...,...,...,...,...,...,...
4272,9266_02,Earth,TRAPPIST-1e,G/1496/S,34.0,Jeron Peter
4273,9269_01,Earth,TRAPPIST-1e,,42.0,Matty Scheron
4274,9271_01,Mars,55 Cancri e,D/296/P,,Jayrin Pore
4275,9273_01,Europa,,D/297/P,,Kitakan Conale


In [107]:
home_input = ImputerHomePlanet()
home_input.fit(df_fit)
df_trX = home_input.transform(df_tr)

  self.mapping = pd.Series()


In [102]:
np.sum(df_tr['HomePlanet'].isna())

201

In [108]:
np.sum(df_trX['HomePlanet'].isna())

111

In [131]:
df_fit['GroupId'] = df_fit['PassengerId'].str[0:4].astype(int)

In [116]:
df_cabin = df_fit.groupby("GroupId")['Cabin'].agg(['unique','nunique'])

In [123]:
df_cabin[df_cabin['nunique'] > 2]

Unnamed: 0_level_0,unique,nunique
GroupId,Unnamed: 1_level_1,Unnamed: 2_level_1
220,"[G/37/P, E/10/P, F/48/P]",3
461,"[E/27/S, F/85/S, G/66/S]",3
632,"[F/134/P, E/37/P, G/97/P]",3
720,"[F/129/S, E/52/S, D/26/S]",3
938,"[F/172/S, E/68/S, G/143/S]",3
1354,"[F/263/S, E/99/S, D/47/S]",3
1384,"[E/105/S, G/210/S, F/269/S]",3
1709,"[F/326/S, D/61/S, nan, E/127/S]",3
2092,"[D/70/S, E/153/S, nan, F/410/S]",3
2234,"[F/448/P, E/134/P, D/66/P]",3


In [134]:
df_fit[df_fit['GroupId'] == 461]

Unnamed: 0,PassengerId,HomePlanet,Destination,Cabin,Age,Name,GroupId
427,0461_01,Earth,PSO J318.5-22,E/27/S,31.0,Shany Yorkland,461
428,0461_02,Earth,TRAPPIST-1e,F/85/S,22.0,Warry Yorkland,461
429,0461_03,Earth,55 Cancri e,G/66/S,1.0,Len Yorkland,461


In [139]:
df_tr2 = df_tr.copy()
df_tr2['GroupId'] = df_tr2['PassengerId'].str[0:4].astype(int)

In [142]:
df_transported = df_tr2.groupby("GroupId")['Transported'].agg(['unique','nunique'])
df_transported[df_transported['nunique'] > 1] 

Unnamed: 0_level_0,unique,nunique
GroupId,Unnamed: 1_level_1,Unnamed: 2_level_1
17,"[True, False]",2
20,"[False, True]",2
44,"[False, True]",2
45,"[False, True]",2
67,"[True, False]",2
...,...,...
9227,"[False, True]",2
9231,"[True, False]",2
9272,"[True, False]",2
9275,"[True, False]",2


In [152]:
df_tr2[df_tr2['GroupId'] == 45]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId
46,0045_01,Mars,False,F/10/P,TRAPPIST-1e,21.0,False,970.0,0.0,180.0,0.0,64.0,Zelowl Chmad,False,45
47,0045_02,Mars,True,F/10/P,,19.0,False,0.0,0.0,0.0,0.0,0.0,Mass Chmad,True,45


# Extract Group ID information 
We know from the description that PassengerID take the form <code>gggg_pp</code> where : 
- <code>gggg</code> group number with who the passenger is travelling with
- <code>pp</code> Id of this person in the group

We will not be interested by the Person Id within the group, but the group number could be an useful information as it will allow us to cluster Passenger together. It might not be surprising to have passenger in the same group having a higher chance to be in the same final category (meaning Transported or not transported). 

In [10]:
class ExtractGroupId(BaseEstimator, TransformerMixin): 
    def __init__(self) -> None:
        self.src_col = 'PassengerId'
        super().__init__()
    
    def fit(self, X, y=None): 
        return self 
    
    def transform(self, X, y=None):
        X = X.copy()
        index_source_col = X.columns.get_indexer([self.src_col])[0]
        parse_data = X[self.src_col].str.split("_", n=1 , expand = True)
        X.insert(int(index_source_col) + 1, "GroupId", parse_data[0].apply(int))
        X = X.drop(columns = self.src_col)
        return X

# Extract Cabin Information

In [40]:
class ExtractCabinInfo(BaseEstimator,TransformerMixin):
    def __init__(self) -> None:
        self.src_col = 'Cabin'
        super().__init__()
    
    def fit(self, X, y=None): 
        return self
    
    def transform(self, X , y=None): 
        X = X.copy()
        index_source_col = X.columns.get_indexer([self.src_col])[0]
        parse_data = X[self.src_col].str.split("/", n=3 , expand = True)
        X.insert(int(index_source_col) + 1, "Deck", parse_data[0])
        X.insert(int(index_source_col) + 2, "Num", parse_data[1])
        X.insert(int(index_source_col) + 3, "Side", parse_data[2])
        X = X.drop(columns = self.src_col)
        return X