# Notebook de préparation des données

Pour rappel : 
* Drop : Ticket, Name (TODO : reprendre et améliorer)
* One-hot encoder : Sex, Embarked, board of Cabin, Title
  * ne pas oublier de supprimer une des colonnes encodées (drop d'une modalité) pour éviter la colinéarité
* Flag encoder : deck of Cabin
* transformation log(x+1) : SibSp, Parch, Fare
* kNN-imputer : Age
* Keep same : Pclass, Survived

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from data_preparation import apply_one_hot_encoding

In [2]:
# Load ./data/train.csv; the following cells derive `simplified_df` from this frame.
df = pd.read_csv("./data/train.csv", sep=",")

In [3]:
# 1. Drops: work on a copy of the raw frame without the Ticket column.
simplified_df = df.drop("Ticket", axis="columns")

In [4]:
# One-hot encoding of Sex and Embarked.
# One encoder per feature, each fitted on its own single-column frame.
sex_scaler = OneHotEncoder(handle_unknown='ignore')
embarked_scaler = OneHotEncoder(handle_unknown='ignore')

sex_scaler.fit(simplified_df[["Sex"]])
embarked_scaler.fit(simplified_df[["Embarked"]])

# Build the indicator column names with f-strings rather than "prefix" + col:
# concatenation raises a TypeError as soon as a learned category is not a str
# (e.g. a NaN category learned from missing values).
sex_columns = [f"Sex_{col}" for col in sex_scaler.categories_[0]]
embarked_columns = [f"Embarked_{col}" for col in embarked_scaler.categories_[0]]

# The last category of each feature is left out so the indicator columns are
# not perfectly collinear.
# NOTE(review): if a NaN category is learned it presumably sorts last, in which
# case it is the one being dropped here — confirm this is the intended baseline.
simplified_df[sex_columns[:-1]] = sex_scaler.transform(simplified_df[["Sex"]]).toarray()[:, :-1]
simplified_df[embarked_columns[:-1]] = embarked_scaler.transform(simplified_df[["Embarked"]]).toarray()[:, :-1]

# The raw categorical columns are now redundant with their indicators.
simplified_df.drop(columns=["Sex", "Embarked"], inplace=True)

In [5]:
# Log transformation log(1 + x): SibSp, Parch, Fare.
# np.log1p is applied to each whole column at once instead of calling a Python
# lambda per element; it is also the numerically accurate form of log(x + 1)
# for values close to 0.
for column in ("SibSp", "Parch", "Fare"):
    simplified_df[column] = np.log1p(simplified_df[column])

In [6]:
# kNN imputation of Age.
# Every column except Cabin and Name takes part in the neighbour search.
imputation_input = simplified_df.drop(columns=["Cabin", "Name"])

# Rescale to [0, 1] before imputing so that no column dominates the distance
# through its scale alone.
min_max_scaler = MinMaxScaler()
df_min_max_scaled = min_max_scaler.fit_transform(imputation_input)

knn_imputer = KNNImputer(
    n_neighbors=3,
    weights='uniform',
    metric='nan_euclidean',
)
data_imputed = knn_imputer.fit_transform(df_min_max_scaled)

# Go back to the original units and keep only the Age column of the result.
descaled_data_imputed = min_max_scaler.inverse_transform(data_imputed)
age_position = list(imputation_input.columns).index("Age")
simplified_df["Age"] = descaled_data_imputed[:, age_position]

In [7]:
# Cabin transformation
# A deck letter is encoded by its position in this string
# (T -> 0, A -> 1, ..., I -> 9).
decks_encoding = "TABCDEFGHI"

def transform_cabin_deck(x):
    """Return the mean deck code of the cabins listed in ``x``.

    ``x`` is a whitespace-separated list of cabin labels whose first character
    is the deck letter (e.g. ``"C23 C25"``). Each letter is converted to its
    position in ``decks_encoding`` and the codes are averaged.

    Returns NaN when ``x`` is missing or contains no cabin label.
    Raises ValueError when a deck letter is not in ``decks_encoding``.
    """
    if pd.isna(x):
        return np.nan
    # split() without argument ignores repeated, leading and trailing
    # whitespace; split(" ") would produce empty tokens and fail on c[0].
    cabins = x.split()
    if not cabins:
        return np.nan
    return np.mean([decks_encoding.index(c[0]) for c in cabins])

def extract_cabin_bord(x):
    """Return the median parity flag of the cabin numbers listed in ``x``.

    ``x`` is a whitespace-separated list of cabin labels (deck letter followed
    by a number, e.g. ``"C22 C26"``). Each numbered cabin contributes 1.0 when
    its number is even and 0.0 when it is odd; the median of these flags is
    returned (so 0.5 means an even/odd tie).

    Labels without a number (a lone deck letter such as ``"F"`` in
    ``"F G73"``) carry no parity information and are skipped, so that they do
    not turn the result for the remaining cabins into NaN.

    Returns NaN when ``x`` is missing or when no label has a number.
    Raises ValueError when the part after the deck letter is not an integer.
    """
    if pd.isna(x):
        return np.nan
    parity_flags = [
        float(int(c[1:]) % 2 == 0)
        for c in x.split()
        if len(c) > 1  # keep only labels that have something after the letter
    ]
    if not parity_flags:
        return np.nan
    return np.median(parity_flags)

# Take the raw Cabin column out of the frame and replace it by three derived
# features: number of cabins, parity-based "board" flag and mean deck code.
cabin = simplified_df.pop("Cabin")
simplified_df["nb_cabins"] = cabin.str.count(" ") + 1  # n spaces -> n + 1 labels
simplified_df["board"] = cabin.map(extract_cabin_bord)
simplified_df["deck"] = cabin.map(transform_cabin_deck)

In [24]:
# Name transformation
# Lookup table read from ./data/names.csv; its "Name" and "Zone" columns are
# used further down in this cell to attach a zone to each family name.
names_df = pd.read_csv("./data/names.csv")

def split_last(L, s):
    """Split the last element of ``L`` and splice the pieces back in its place.

    ``s`` is either a single separator (str) or a list of candidate
    separators. With a list, only the first candidate that occurs in the last
    element is used. ``L`` itself is returned unchanged when no candidate
    occurs, or when ``s`` is neither a str nor a list.
    """
    if isinstance(s, str):
        separator = s
    elif isinstance(s, list):
        # First candidate actually present in the last element, if any.
        separator = next((candidate for candidate in s if candidate in L[-1]), None)
    else:
        separator = None

    if separator is None:
        return L
    return L[:-1] + L[-1].split(separator)

def split_name(x):
    """Break a raw name string into a list of pieces.

    The string is cut on commas; the last piece is then cut on periods, and
    the last piece of that result is cut on the first of ``(`` or ``"`` that
    it contains (left whole if it contains neither).
    """
    comma_and_period_parts = split_last(x.split(","), ".")
    return split_last(comma_and_period_parts, ["(", "\""])

# Aggregated name information: one row per passenger, one column per piece
# returned by split_name. For names shaped like "Last, Title. Rest", column 0
# is the family name and column 1 the title.
name_df = simplified_df["Name"].apply(split_name).apply(pd.Series)

# Family names to region.
# The zone is looked up with Series.map, which keeps simplified_df's own index
# and yields exactly one value per passenger. A left merge returns a frame with
# a fresh 0..n-1 index and one extra row per duplicated family name in
# names_df, so assigning its "Zone" column back can silently put zones on the
# wrong passengers.
zone_by_last_name = (
    names_df
    .drop_duplicates(subset="Name")  # first entry wins if a name is listed twice
    .set_index("Name")["Zone"]
)
simplified_df["Zones"] = name_df[0].map(zone_by_last_name)

# Title
# NOTE(review): for "Last, Title. Rest" names the title keeps the space that
# follows the comma (e.g. " Mr") — confirm this is intended before encoding.
simplified_df["Title"] = name_df[1]

In [None]:
# One-hot encode the two name-derived features with the project helper.
# Only the first returned value (the updated frame) is kept.
for column in ("Zones", "Title"):
    simplified_df, _ = apply_one_hot_encoding(simplified_df, column)

In [29]:
# The raw Name column is no longer needed once Title and Zones are extracted.
del simplified_df["Name"]

In [34]:
# Write the prepared frame to disk. to_csv is called without index=False, so
# the frame's index is written as the first (unnamed) column of the file.
simplified_df.to_csv("./data/train_prepared.csv")

## Test functions

In [1]:
import pandas as pd
import data_preparation

In [2]:
# Load both data files.
train_df = pd.read_csv("./data/train.csv", sep=",")
test_df = pd.read_csv("./data/test.csv", sep=",")

# Separate the Survived column from the remaining training columns.
y_train = train_df["Survived"]
X_train = train_df.drop(columns="Survived")

# prepare_df returns the prepared frame together with a second object, which
# is then handed to transform_df to process the test frame.
# NOTE(review): presumably that object holds the fitted preparation steps —
# confirm against data_preparation.
X_train, preparation_model = data_preparation.prepare_df(X_train)
X_test = data_preparation.transform_df(test_df, preparation_model)