# Titanic machine learning from disaster

## Imports

In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.feature_selection import RFE
import numpy as np

import warnings
from collections import Counter

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import BayesianRidge,LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import GradientBoostingClassifier

warnings.filterwarnings("ignore")

## Un helper pour plot la correlation matrix d'un dataframe

In [14]:
def plot_correlation(df,annot=True):
    plt.figure(figsize=(10,8))
    corr = df.corr()
    sns.heatmap(corr, annot=annot,cmap=sns.color_palette("vlag", as_cmap=True),cbar_kws={'shrink': .8},annot_kws={"size": 6} )
    plt.title("Correlation Matrix Heatmap")
    plt.show()

## Deux classes et un helper:
- CustomFunctionTransformer(FunctionTransformer) est une classe enfant de FunctionTransformer qui hérite de sa classe parent mais à laquelle on greffe des attributs cols_in,cols_out
- Func_To_Transformer est une classe qui a des attributs :
    - func : la fonction à appliquer aux éléments des colonnes
    - cols_in : les colonnes sur lesquelles appliquer func
    - cols_out : le nom des colonnes en sortie de func appliquée à cols_in
    - get_transformer (@property) : une instance de FunctionTransformer qui applique proprement func
- le helper get_feature_names_from_column_transformer : renvoie les noms de colonnes après application du cColumnTransformer ct, avec des noms de colonnes en entrée "input_features"
- le helper to_pandas qui remet au format DataFrame la sortie du columntransformer

In [15]:
class CustomFunctionTransformer(FunctionTransformer):
    def __init__(self,func,cols_in,cols_out):
        FunctionTransformer.__init__(self,func)
        self.cols_in,self.cols_out=cols_in,cols_out
        
#écrire une class avec methode func,get_cols_in,get_cols_out
class Func_To_Transformer():
    def __init__(self,func,cols_in,cols_out):
        self.func=func
        self.size_in,self.size_out=len(cols_in),len(cols_out)
        self.cols_in,self.cols_out=cols_in,cols_out
    @property
    def get_transformer(self):
        def out_func(data):
        # Handle pandas to numpy conversion
            if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
                data = data.values
            # Ensure data has the correct shape
            if data.ndim != 2 or data.shape[1] != self.size_out:
                data = data.reshape(data.shape[0], self.size_in)  # Reshape based on k_in
            # Apply transformation function row-by-row and reshape output
            transformed_data = np.array([self.func(row[0]) for row in data])
            return transformed_data.reshape(data.shape[0], self.size_out)
        return CustomFunctionTransformer(out_func,self.cols_in,self.cols_out)

# Helper function to get feature names from a fitted ColumnTransformer including transformers that do not have the set_ouput(transform="pandas") method
def get_feature_names_from_column_transformer(ct, input_features):
    feature_names = []
    passthrough_columns = [col for col in input_features if not any(col in columns for _,_,columns in ct.transformers_)]
    print(f"passthrough columns={passthrough_columns}")

    for name, transformer, columns in ct.transformers_:
        if transformer == 'passthrough':
            # If passthrough, add the original column names directly without any prefix
            print(f"passthrough columns={columns}")
            feature_names.extend(columns)
        elif transformer == 'drop':
            continue
        else:
            try:
                if isinstance(transformer, Pipeline) and isinstance(transformer[0],TfidfVectorizer):
                    names=transformer[0].get_feature_names_out()
                elif isinstance(transformer, Pipeline):
                    last_step = transformer[-1]
                    if hasattr(last_step, 'get_feature_names_out'):
                        names = last_step.get_feature_names_out()
                    else:
                        names = columns
                elif isinstance(transformer,CustomFunctionTransformer):
                    names=transformer.cols_out
                else:
                    if hasattr(transformer, 'get_feature_names_out'):
                        names = transformer.get_feature_names_out(columns)
                    else:
                        names = columns
                feature_names.extend(names)
            except ValueError:
                continue

    # Add passthrough columns by their original names
    feature_names.extend(passthrough_columns)
    return feature_names

def to_pandas(transformed,new_feature_names):
    return pd.DataFrame(data=np.array(transformed), columns=new_feature_names)

## import des données d'entrainement, train_test_split

In [16]:
data = pd.read_csv('../data/train.csv')
#data.info()

In [17]:
dict(data.isnull().mean())

{'PassengerId': 0.0,
 'Survived': 0.0,
 'Pclass': 0.0,
 'Name': 0.0,
 'Sex': 0.0,
 'Age': 0.19865319865319866,
 'SibSp': 0.0,
 'Parch': 0.0,
 'Ticket': 0.0,
 'Fare': 0.0,
 'Cabin': 0.7710437710437711,
 'Embarked': 0.002244668911335578}

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("Survived", axis=1),
    data["Survived"],
    test_size=0.3,
    random_state=0,
)

In [19]:
df = X_train.copy()
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
857,858,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.55,E17,S
52,53,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
386,387,3,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9,,S
124,125,1,"White, Mr. Percival Wayland",male,54.0,0,1,35281,77.2875,D26,S
578,579,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C


## Construction du pipeline : 
- d'abord les fonctions à appliquer à des colonnes spécifiques
- puis les trsnformeurs de colonnes dans un ColumnTransformer, utilisant la classe Func_To_Transformer pour les fonctions spécifiques
- puis le pipeline : ColumnTransformer+IterativeImputer+Modèle
- ensuite on fait un ".fit().score()" qui nous donne une idée de la performance de notre modèle

In [20]:
#helpers pour les colonnes

def isNaN(sum):
    return sum!=sum

def sex(s):
    return 1*(s=="male")

def ticket(t):
    try:
        return int(t.split()[-1])
    except:
        return np.nan
cab_nums=np.unique([c[0] for c in df.Cabin if not isNaN(c)])
print(f"cab_nums={cab_nums}")
def cabin(c):
    try:
        return np.array([[1*(v==c[0]) for v in cab_nums]+[int(c[1:])]])
    except:
        return np.array([[np.nan for _ in range (len(cab_nums)+1)]])
ports=np.unique([c[0] for c in df.Embarked if not isNaN(c)])
print(f"ports={ports}")
def embarked(p):
    try:
        return np.array([[1*(v==p[0]) for v in ports]])
    except:
        return np.array([[np.nan for _ in range (len(ports))]])
    
def name(name : np.ndarray[str]) -> np.ndarray[str]:
    name = np.array(name)
    name = name.flatten()
    name = [re.search(r' ([A-Za-z]+)\.', n).group(1) for n in name]
    rare_titles = ['Dr', 'Rev', 'Mlle', 'Major', 'Col', 'Countess', 'Capt', 'Ms', 'Sir', 'Lady', 'Mme', 'Don', 'Jonkheer']
    nouveaux_noms = [x if x in ['Mr', 'Mrs', 'Miss', 'Master'] else 'Rare' for x in name]
    return (nouveaux_noms)

cab_nums=['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T']
ports=['C' 'Q' 'S']


In [21]:
cabin_transformer = Func_To_Transformer(cabin,["Cabin"],["Pont_"+v for v in cab_nums]+["NumCab"])
ticket_transformer = Func_To_Transformer(ticket,["Ticket"],["Ticket"])
sex_transformer = Func_To_Transformer(sex,["Sex"],["Sex"])
embarked_transformer = Func_To_Transformer(embarked,["Embarked"],["Port_"+p for p in ports]) #OneHotEncoder(handle_unknown='ignore', sparse_output=False)
name_transformer=Pipeline([('tfidf', TfidfVectorizer(max_features=100)),('to_dense', FunctionTransformer(lambda x: x.todense()))])

In [22]:
class NameExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, rare_titles=None):
        self.rare_titles = rare_titles if rare_titles else ['Dr', 'Rev', 'Mlle', 'Major', 'Col', 'Countess', 'Capt', 'Ms', 'Sir', 'Lady', 'Mme', 'Don', 'Jonkheer']
    
    def fit(self, X, y=None):
        return self  # Aucun apprentissage nécessaire pour ce transformateur
    
    def transform(self, X):
        # Si X est un DataFrame, accéder à la colonne sous forme de tableau
        if isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0].values  # On prend la première colonne (Name)
        
        # Appliquer la transformation
        titles = []
        for name in X:
            match = re.search(r' ([A-Za-z]+)\.', name)
            title = match.group(1) if match else 'Unknown'
            title = title if title in ['Mr', 'Mrs', 'Miss', 'Master'] else 'Rare'
            titles.append(title)
        
        # Retourner un tableau 2D compatible avec OneHotEncoder
        return np.array(titles).reshape(-1, 1)



name_pipeline = Pipeline([
    ('name_extractor', NameExtractor()),  # Extraction des titres
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Encodage one-hot
])

In [23]:
GBC_pipeline = Pipeline([
                        ('column_tranformer',ColumnTransformer(transformers=[
                            ('cabin', cabin_transformer.get_transformer, cabin_transformer.cols_in),
                            ('ticket', ticket_transformer.get_transformer, ticket_transformer.cols_in),
                            ('sex', sex_transformer.get_transformer, sex_transformer.cols_in), 
                            ('embarked',embarked_transformer.get_transformer,embarked_transformer.cols_in),
                            ('tdidf', name_transformer, 'Name'),
                            ],
                            remainder='passthrough')),
                        ('to_array',FunctionTransformer(lambda x: x.A)),
                        ('scaler',StandardScaler()),
                        ('imputer',IterativeImputer(estimator=Ridge(),max_iter=1000,random_state=0,tol=1e-3)),
                        ('model',GradientBoostingClassifier())
                        ],verbose=True)  
GBC_score=GBC_pipeline.fit(X_train,y_train).score(X_test,y_test)
GBC_score

[Pipeline] . (step 1 of 5) Processing column_tranformer, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing to_array, total=   0.0s
[Pipeline] ............ (step 3 of 5) Processing scaler, total=   0.0s
[Pipeline] ........... (step 4 of 5) Processing imputer, total=  18.5s
[Pipeline] ............. (step 5 of 5) Processing model, total=   0.4s


0.832089552238806

In [24]:
GBC_pipeline

### Un autre pipeline fondé sur un autre modèle et un autre estimateur dans l'imputer, pour le fun

In [25]:
RF_KNN_pipeline = Pipeline([
                        ('column_tranformer',ColumnTransformer(transformers=[
                            ('cabin', cabin_transformer.get_transformer, cabin_transformer.cols_in),
                            ('ticket', ticket_transformer.get_transformer, ticket_transformer.cols_in),
                            ('sex', sex_transformer.get_transformer, sex_transformer.cols_in), 
                            ('embarked',embarked_transformer.get_transformer,embarked_transformer.cols_in),
                            ('name', name_pipeline, ['Name']),
                            ],
                            remainder='passthrough')),
                        # ('to_array',FunctionTransformer(lambda x: x.A)),
                        ('scaler',StandardScaler()),
                        ('imputer',IterativeImputer(estimator=KNeighborsRegressor(n_neighbors=20),max_iter=10,random_state=0,tol=1e-3)),
                        ('model',RandomForestClassifier())
                        ])  
RF_KNN_score=RF_KNN_pipeline.fit(X_train,y_train).score(X_test,y_test)
RF_KNN_score

0.8171641791044776

## On fait notre fichier de prédiction pour kaggle

In [26]:
test=pd.read_csv('../data/test.csv')
ypred=RF_KNN_pipeline.predict(test)
submit=pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':ypred})
submit.to_csv('/Users/lucascarpentier/submit.csv', index=False)

OSError: Cannot save file into a non-existent directory: '\Users\lucascarpentier'

## On repart juste sur le preprocess (sans le dernier élément du pipeline qui est le modèle), pour explorer l'importance des features

In [None]:
avant_model=GBC_pipeline[:-1].fit(X_train,y_train)
new_columns=get_feature_names_from_column_transformer(avant_model[0], X_train.columns.to_list())
train=to_pandas(avant_model.transform(X_train),new_columns)
test=to_pandas(avant_model.transform(X_test),new_columns)
train.head()

In [None]:
plot_correlation(pd.concat([y_train,train],axis=1),False)

In [None]:
#on choisit un modèle qui a l'attribut "feature_importances_"
mon_modele=GradientBoostingClassifier()
mon_modele.fit(train,y_train)
importances=mon_modele.feature_importances_
importances=dict(sorted(zip(train.columns,importances),key=lambda x:x[1],reverse=True))
plt.figure(figsize=(16,3))
plt.bar(importances.keys(),importances.values(),alpha=0.5)
plt.xticks(rotation=60)
plt.title(f"Importances, score du modèle sur le train : {mon_modele.score(test,y_test)}")
plt.show()



## Evolution de l'importance des features et du score en fonction de l'ensemble des colonnes retenues

In [None]:
#on peut décider de laisser tomber les features qui ont une importance en-deçà d'un certain seuil
new_cols=[col for col in train.columns if importances[col]>0.01]
n_importances={k:v for k,v in importances.items() if k in new_cols}
new_score=mon_modele.fit(train[new_cols],y_train).score(test[new_cols],y_test)
nn_importances=mon_modele.feature_importances_
nn_importances=dict(sorted(zip(new_cols,nn_importances),key=lambda x:x[1],reverse=True))
print(f"score en enlevant les features moins importantes ={new_score}")
plt.figure(figsize=(16,3))
plt.bar(n_importances.keys(),n_importances.values(),alpha=0.5,color='g',label='original')
plt.bar(nn_importances.keys(),nn_importances.values(),alpha=0.5,color='y',label="restricted")
plt.legend()
plt.xticks(rotation=60)
plt.title(f"Importances, score du modèle sur le test : {new_score}")
plt.show()

## Enregistrement de nouveaux csv pour test et train pour réutiliser ailleurs

In [None]:
ndata=data.drop(columns=["Survived"])
to_pandas(GBC_pipeline[:-1].transform(pd.read_csv('../data/test.csv')),new_columns).to_csv('ntest.csv',index=False)
ntrain=to_pandas(GBC_pipeline[:-1].transform(ndata),new_columns)
ntrain["Survived"]=data["Survived"]
ntrain.to_csv('ntrain.csv',index=False)
print(f"ntrain.shape={ntrain.shape}")
ntrain.head()

