In [73]:
import pandas as pd

# Load Data

In [74]:
df = pd.read_csv('datasets/train.csv', index_col='PassengerId')
df.drop("Name", axis=1, inplace=True)
df.drop("Ticket", axis=1, inplace=True)
df.pop("Cabin")
df.head(10)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S
6,0,3,male,,0,0,8.4583,Q
7,0,1,male,54.0,0,0,51.8625,S
8,0,3,male,2.0,3,1,21.075,S
9,1,3,female,27.0,0,2,11.1333,S
10,1,2,female,14.0,1,0,30.0708,C


In [96]:
classCol = df.pop("Survived")
classCol

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

# Obtain Categorical and Numerical Columns

In [75]:
categoricalCols = df.columns[df.dtypes == 'object']
categoricalCols

Index(['Sex', 'Embarked'], dtype='object')

In [76]:
categoricalCols = df.select_dtypes("object").columns
categoricalCols

Index(['Sex', 'Embarked'], dtype='object')

In [77]:
dfNum = df.drop(categoricalCols, axis = 1)
dfCat = df[categoricalCols]

# Process nulls in Categorical Variables

In [78]:
from sklearn.impute import SimpleImputer

impCat = SimpleImputer(strategy = 'most_frequent')
columns = dfCat.columns
index = dfCat.index
dfCat = pd.DataFrame(impCat.fit_transform(dfCat), columns = columns, index = index)

In [79]:
dfCat.isnull().any()

Sex         False
Embarked    False
dtype: bool

In [80]:
dfCat

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,S
2,female,C
3,female,S
4,female,S
5,male,S
...,...,...
887,male,S
888,female,S
889,female,S
890,male,C


# Convert Categorical into Numerical

In [81]:
from sklearn import preprocessing
from collections import defaultdict

d = defaultdict(preprocessing.LabelEncoder)

dfCatLe = dfCat.apply(lambda col: d[col.name].fit_transform(col))
dfCatLe.head(10)

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,0,0
3,0,2
4,0,2
5,1,2
6,1,1
7,1,2
8,1,2
9,0,2
10,0,0


In [82]:
inverse = dfCatLe.apply(lambda col: d[col.name].inverse_transform(col))
inverse

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,S
2,female,C
3,female,S
4,female,S
5,male,S
...,...,...
887,male,S
888,female,S
889,female,S
890,male,C


In [83]:
transformed = inverse.apply(lambda col: d[col.name].transform(col))
transformed

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,0,0
3,0,2
4,0,2
5,1,2
...,...,...
887,1,2
888,0,2
889,0,2
890,1,0


Disadvantage: LabelEncoder introduces a "false order" in categories. We can use OneHotEncoder instead, that fixes this problem.

In [84]:
dfCat

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,S
2,female,C
3,female,S
4,female,S
5,male,S
...,...,...
887,male,S
888,female,S
889,female,S
890,male,C


In [85]:
ohe = preprocessing.OneHotEncoder(sparse = False)
dfCatOhe = pd.DataFrame(ohe.fit_transform(dfCat),
                        columns = ohe.get_feature_names_out(dfCat.columns.tolist()),
                        index = dfCat.index)

In [86]:
dfCatOhe

Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0
5,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...
887,0.0,1.0,0.0,0.0,1.0
888,1.0,0.0,0.0,0.0,1.0
889,1.0,0.0,0.0,0.0,1.0
890,0.0,1.0,1.0,0.0,0.0


# Process nulls in Numerical Variables

In [91]:
impNum = SimpleImputer(strategy = "mean")
columns = dfNum.columns
index = dfNum.index
dfNum = pd.DataFrame(impNum.fit_transform(dfNum), columns = columns, index = index)
dfNum.isnull().any()

Survived    False
Pclass      False
Age         False
SibSp       False
Parch       False
Fare        False
dtype: bool

In [92]:
dfNum

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,3.0,22.000000,1.0,0.0,7.2500
2,1.0,1.0,38.000000,1.0,0.0,71.2833
3,1.0,3.0,26.000000,0.0,0.0,7.9250
4,1.0,1.0,35.000000,1.0,0.0,53.1000
5,0.0,3.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...,...
887,0.0,2.0,27.000000,0.0,0.0,13.0000
888,1.0,1.0,19.000000,0.0,0.0,30.0000
889,0.0,3.0,29.699118,1.0,2.0,23.4500
890,1.0,1.0,26.000000,0.0,0.0,30.0000


Now, we merge both processed dataframes (categorical and numerical)

In [94]:
dfPreprocessed = pd.merge(left = dfCatOhe,
                          right = dfNum,
                          on = "PassengerId")
dfPreprocessed

Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,1.0,0.0,0.0,1.0,0.0,3.0,22.000000,1.0,0.0,7.2500
2,1.0,0.0,1.0,0.0,0.0,1.0,1.0,38.000000,1.0,0.0,71.2833
3,1.0,0.0,0.0,0.0,1.0,1.0,3.0,26.000000,0.0,0.0,7.9250
4,1.0,0.0,0.0,0.0,1.0,1.0,1.0,35.000000,1.0,0.0,53.1000
5,0.0,1.0,0.0,0.0,1.0,0.0,3.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...,...,...,...,...,...,...
887,0.0,1.0,0.0,0.0,1.0,0.0,2.0,27.000000,0.0,0.0,13.0000
888,1.0,0.0,0.0,0.0,1.0,1.0,1.0,19.000000,0.0,0.0,30.0000
889,1.0,0.0,0.0,0.0,1.0,0.0,3.0,29.699118,1.0,2.0,23.4500
890,0.0,1.0,1.0,0.0,0.0,1.0,1.0,26.000000,0.0,0.0,30.0000


# Feature Selection

## K-Best with Chi-Square

In [99]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

fsKBestChi2 = SelectKBest(chi2, k = 4)
fsKBestChi2.fit(dfPreprocessed, classCol)
colFilter = fsKBestChi2.get_support()
dfKBestChi2 = dfPreprocessed.iloc[:, colFilter]

dfKBestChi2

Unnamed: 0_level_0,Sex_female,Sex_male,Survived,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,1.0,0.0,7.2500
2,1.0,0.0,1.0,71.2833
3,1.0,0.0,1.0,7.9250
4,1.0,0.0,1.0,53.1000
5,0.0,1.0,0.0,8.0500
...,...,...,...,...
887,0.0,1.0,0.0,13.0000
888,1.0,0.0,1.0,30.0000
889,1.0,0.0,0.0,23.4500
890,0.0,1.0,1.0,30.0000


## 40% Original Variables, with Mutual Information Criterion

In [100]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import mutual_info_classif

fsPercMi = SelectPercentile(mutual_info_classif, percentile = 40)
fsPercMi.fit(dfPreprocessed, classCol)
colFilter = fsPercMi.get_support()
dfPercMi = dfPreprocessed.iloc[:, colFilter]
dfPercMi

Unnamed: 0_level_0,Sex_female,Sex_male,Survived,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,1.0,0.0,7.2500
2,1.0,0.0,1.0,71.2833
3,1.0,0.0,1.0,7.9250
4,1.0,0.0,1.0,53.1000
5,0.0,1.0,0.0,8.0500
...,...,...,...,...
887,0.0,1.0,0.0,13.0000
888,1.0,0.0,1.0,30.0000
889,1.0,0.0,0.0,23.4500
890,0.0,1.0,1.0,30.0000


# Discretize Variables

In [102]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
dfNumScale = mms.fit_transform(dfNum.drop(["Pclass","SibSp","Parch"], axis = 1))
dfNumScale

array([[0.        , 0.27117366, 0.01415106],
       [1.        , 0.4722292 , 0.13913574],
       [1.        , 0.32143755, 0.01546857],
       ...,
       [0.        , 0.36792055, 0.04577135],
       [1.        , 0.32143755, 0.0585561 ],
       [0.        , 0.39683338, 0.01512699]])

# Standarize Variables

In [103]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
dfNumSScale = ss.fit_transform(dfNum.drop(["Pclass","SibSp","Parch"], axis = 1))
dfNumSScale

array([[-0.78927234, -0.5924806 , -0.50244517],
       [ 1.2669898 ,  0.63878901,  0.78684529],
       [ 1.2669898 , -0.2846632 , -0.48885426],
       ...,
       [-0.78927234,  0.        , -0.17626324],
       [ 1.2669898 , -0.2846632 , -0.04438104],
       [-0.78927234,  0.17706291, -0.49237783]])