# Notebook de préparation des données

Pour rappel : 
* Drop : Ticket, Cabin, Name (TODO : reprendre et améliorer)
* One-hot encoder : Sex, Embarked
  * ne pas oublier de supprimer une des colonnes encodées de chaque variable pour éviter la colinéarité
* transformation log(x+1) : SibSp, Parch, Fare
* Knn-imputer : Age
* Keep same : Pclass, Survived

In [74]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [75]:
# Load the raw training set from a comma-separated CSV file.
df = pd.read_csv(filepath_or_buffer="./data/train.csv", sep=",")

In [76]:
# Step 1: drop the Ticket, Cabin and Name columns.
# drop() returns a new DataFrame, so `df` itself is left untouched.
simplified_df = df.drop(["Ticket", "Cabin", "Name"], axis="columns")

In [77]:
# One-hot encoding of the categorical columns "Sex" and "Embarked".
# Each column gets its own encoder; fit() returns the fitted encoder itself.
sex_scaler = OneHotEncoder(handle_unknown='ignore').fit(simplified_df[["Sex"]])
embarked_scaler = OneHotEncoder(handle_unknown='ignore').fit(simplified_df[["Embarked"]])

# Output column names are "<feature>_<category>", in the encoder's category order.
sex_columns = ["Sex_" + category for category in sex_scaler.categories_[0]]
embarked_columns = ["Embarked_" + category for category in embarked_scaler.categories_[0]]

# The last category of each feature is left out (both its name and its
# indicator column) so the remaining indicator columns are not collinear.
simplified_df[sex_columns[:-1]] = (
    sex_scaler.transform(simplified_df[["Sex"]]).toarray()[:, :-1]
)
simplified_df[embarked_columns[:-1]] = (
    embarked_scaler.transform(simplified_df[["Embarked"]]).toarray()[:, :-1]
)

# The original string columns are replaced by their encoded versions.
simplified_df.drop(["Sex", "Embarked"], axis=1, inplace=True)

In [78]:
# Log transformation log(x + 1) of SibSp, Parch and Fare.
# np.log1p is applied to the whole column at once (no per-element Python
# lambda) and is the numerically accurate way to compute log(1 + x).
simplified_df["SibSp"] = np.log1p(simplified_df["SibSp"])
simplified_df["Parch"] = np.log1p(simplified_df["Parch"])
simplified_df["Fare"] = np.log1p(simplified_df["Fare"])

In [80]:
# Number of missing values in each column.
simplified_df.isna().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Age            177
SibSp            0
Parch            0
Fare             0
Sex_female       0
Embarked_C       0
Embarked_Q       0
dtype: int64

In [89]:
# kNN imputation of the missing values (only "Age" has any at this point).
#
# PassengerId (row identifier) and Survived (prediction target) are kept out
# of the neighbour search: using them would leak the label into the imputed
# values and let an arbitrary id influence the distances. It would also make
# this preprocessing impossible to replay on data that has no Survived column.
# NOTE: imputed values therefore differ from runs that used every column.
non_feature_columns = ["PassengerId", "Survived"]
feature_columns = [
    col for col in simplified_df.columns if col not in non_feature_columns
]

# Features are min-max scaled first so that each one weighs equally in the
# nan-aware Euclidean distance. MinMaxScaler ignores NaN when fitting and
# keeps them in the transformed output.
min_max_scaler = MinMaxScaler()
df_min_max_scaled = min_max_scaler.fit_transform(simplified_df[feature_columns])

knn_imputer = KNNImputer(n_neighbors=3, weights='uniform', metric='nan_euclidean')
data_imputed = knn_imputer.fit_transform(df_min_max_scaled)

# Back to the original units.
descaled_data_imputed = min_max_scaler.inverse_transform(data_imputed)

# Same columns, column order and index as simplified_df, all stored as floats;
# the excluded columns are carried over unchanged.
descaled_imputed_df = simplified_df.astype(float)
descaled_imputed_df[feature_columns] = descaled_data_imputed

In [91]:
# Number of missing values left in each column after imputation.
descaled_imputed_df.isna().sum(axis=0)

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_female     0
Embarked_C     0
Embarked_Q     0
dtype: int64

In [92]:
# Save the prepared training set. With to_csv's defaults the DataFrame index
# is written as an extra, unnamed first column.
descaled_imputed_df.to_csv(path_or_buf="./data/train_prepared.csv")