In [1]:
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_categorical_dtype
import janitor

In [2]:
# Read the train and test splits from the shared data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_new.csv", "../data/test_new.csv")
)

In [3]:
# Display the training frame exactly as it was read from CSV.
train

Unnamed: 0,passengerid,homeplanet,cryosleep,cabin,destination,age,vip,roomservice,foodcourt,shoppingmall,spa,vrdeck,name,transported,side
0,2774_02,Earth,False,F/575/P,TRAPPIST-1e,17.0,False,0.0,1195.0,31.0,0.0,0.0,Crisey Mcbriddley,0,P
1,8862_04,Europa,True,C/329/S,55 Cancri e,28.0,False,0.0,0.0,0.0,0.0,0.0,Alramix Myling,1,S
2,8736_02,Mars,False,F/1800/P,TRAPPIST-1e,20.0,False,0.0,2.0,289.0,976.0,0.0,Tros Pota,1,P
3,0539_02,Europa,True,C/18/P,55 Cancri e,36.0,False,0.0,0.0,0.0,0.0,0.0,Achyon Nalanet,1,P
4,7696_02,Mars,False,F/1601/P,TRAPPIST-1e,37.0,False,1000.0,0.0,80.0,40.0,0.0,Ars Ches,0,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6042,0504_01,Europa,True,B/19/S,55 Cancri e,18.0,False,0.0,0.0,0.0,0.0,0.0,Thabius Unpasine,1,S
6043,6633_01,Europa,False,B/255/S,TRAPPIST-1e,26.0,False,0.0,0.0,0.0,0.0,0.0,Nunkib Motive,1,S
6044,5756_06,Earth,False,F/1194/P,PSO J318.5-22,22.0,False,158.0,0.0,476.0,0.0,26.0,Karena Briggston,0,P
6045,0925_01,Mars,False,F/191/P,TRAPPIST-1e,34.0,False,379.0,0.0,1626.0,0.0,0.0,Skix Kraie,0,P


In [4]:
# The same three columns are removed from both frames before any encoding.
unused_columns = ["cabin", "name", "passengerid"]
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [5]:
# Flag the discrete columns as pandas categoricals so the dtype-based split
# below can find them. "cabin" is not listed because it was dropped earlier.
categorical_features = ["homeplanet", "cryosleep", "destination", "vip", "side"]
train = train.astype(dict.fromkeys(categorical_features, "category"))

In [6]:
# Partition the columns of `train` by dtype: categoricals go to `cat_col`,
# remaining numeric columns go to `num_col`; anything else is left out.
# `is_categorical_dtype` is deprecated in recent pandas, so the categorical
# check uses the documented replacement, an isinstance test on the dtype.
cat_col = [
    col for col in train.columns
    if isinstance(train[col].dtype, pd.CategoricalDtype)
]
num_col = [
    col for col in train.columns
    if not isinstance(train[col].dtype, pd.CategoricalDtype)
    and is_numeric_dtype(train[col])
]

In [7]:
# Inspect the numeric column names and how many were collected.
num_col, len(num_col)

(['age',
  'roomservice',
  'foodcourt',
  'shoppingmall',
  'spa',
  'vrdeck',
  'transported'],
 7)

In [8]:
# Keep the target out of the columns that get scaled. Filtering (instead of
# list.remove) makes this cell safe to re-run: a second execution is a no-op
# rather than a ValueError.
num_col = [col for col in num_col if col != "transported"]

In [9]:
# Inspect the categorical column names collected by the dtype split.
cat_col

['homeplanet', 'cryosleep', 'destination', 'vip', 'side']

In [10]:
# Display the training frame after the column drop and category casts.
train

Unnamed: 0,homeplanet,cryosleep,destination,age,vip,roomservice,foodcourt,shoppingmall,spa,vrdeck,transported,side
0,Earth,False,TRAPPIST-1e,17.0,False,0.0,1195.0,31.0,0.0,0.0,0,P
1,Europa,True,55 Cancri e,28.0,False,0.0,0.0,0.0,0.0,0.0,1,S
2,Mars,False,TRAPPIST-1e,20.0,False,0.0,2.0,289.0,976.0,0.0,1,P
3,Europa,True,55 Cancri e,36.0,False,0.0,0.0,0.0,0.0,0.0,1,P
4,Mars,False,TRAPPIST-1e,37.0,False,1000.0,0.0,80.0,40.0,0.0,0,P
...,...,...,...,...,...,...,...,...,...,...,...,...
6042,Europa,True,55 Cancri e,18.0,False,0.0,0.0,0.0,0.0,0.0,1,S
6043,Europa,False,TRAPPIST-1e,26.0,False,0.0,0.0,0.0,0.0,0.0,1,S
6044,Earth,False,PSO J318.5-22,22.0,False,158.0,0.0,476.0,0.0,26.0,0,P
6045,Mars,False,TRAPPIST-1e,34.0,False,379.0,0.0,1626.0,0.0,0.0,0,P


In [11]:
# Explicit level order for the two-valued columns, so the first level maps to
# 0 and the second to 1. One entry is supplied per column given to the ordinal
# encoder: two boolean columns followed by the "P"/"S" column.
boolean_levels = [False, True]
side_levels = ["P", "S"]
ordi = OrdinalEncoder(categories=[boolean_levels, boolean_levels, side_levels])

# Default one-hot encoder for the multi-valued categoricals and a
# default min-max scaler for the numeric columns.
onehot = OneHotEncoder()
minmax = MinMaxScaler()

In [12]:
# Each step is (name, transformer, columns). Columns not named in any step are
# passed through unchanged because of remainder="passthrough".
transformer_steps = [
    ("ord", ordi, ["cryosleep", "vip", "side"]),
    ("onehot", onehot, ["destination", "homeplanet"]),
    ("minmax", minmax, num_col),
]
ct = ColumnTransformer(transformers=transformer_steps, remainder="passthrough")

In [13]:
# Fit the column transformer on the training frame only; the test frame is
# later transformed with this same fitted object.
ct.fit(train)

In [14]:
# Encode/scale the training frame with the fitted transformer.
train_use = ct.transform(train)

# The names returned by the transformer are split on "__" to drop the step
# prefix. Splitting only once keeps the full remainder of the name, so a
# column or category value that itself contains "__" is not truncated.
new_features = [
    feature_name.split("__", 1)[1]
    for feature_name in ct.get_feature_names_out()
]

# Rebuild a labelled frame and normalise the column names with clean_names().
train_df = pd.DataFrame(train_use, columns=new_features).clean_names()

In [15]:
# Apply the already-fitted transformer to the test frame and label the result
# with the feature names derived from the training transform.
test_use = ct.transform(test)
test_df = pd.DataFrame(data=test_use, columns=new_features)

In [16]:
# Apply the same clean_names() normalisation that was applied to train_df so
# both frames end up with matching column names.
test_df = test_df.clean_names()

In [17]:
# Write both engineered frames to disk without the row index.
for frame, out_path in (
    (train_df, "../data/train_fe.csv"),
    (test_df, "../data/test_fe.csv"),
):
    frame.to_csv(out_path, index=False)

In [18]:
# Display the final encoded and scaled training frame.
train_df

Unnamed: 0,cryosleep,vip,side,destination_55_cancri_e,destination_pso_j318_5_22,destination_trappist_1e,homeplanet_earth,homeplanet_europa,homeplanet_mars,age,roomservice,foodcourt,shoppingmall,spa,vrdeck,transported
0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.215190,0.000000,0.043105,0.001320,0.000000,0.000000,0.0
1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.354430,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.253165,0.000000,0.000072,0.012302,0.052552,0.000000,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.455696,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.468354,0.100806,0.000000,0.003405,0.002154,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6042,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.227848,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
6043,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.329114,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
6044,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.278481,0.015927,0.000000,0.020262,0.000000,0.001077,0.0
6045,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.430380,0.038206,0.000000,0.069215,0.000000,0.000000,0.0
