In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

#preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve, cross_val_score

#modelisation
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier

#metrics
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve 
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import learning_curve

import os

In [None]:
# Root folder of the competition files (absolute path on the author's machine).
p1 = r'D:/Kaggle_competition/01_Spaceship_titanic'
# NOTE(review): both lookups take whatever os.listdir returns first, so the
# selected sub-folder and file depend on the listing order — confirm that this
# is the intended data file.
_subdir = p1 + "/" + os.listdir(p1)[0]
p2 = os.listdir(_subdir)[0]
path = _subdir + '/' + p2

In [None]:
# Load the dataset; the first column of the file is used as the row index.
data = pd.read_csv(path, index_col=0)
data

In [None]:
# Work on a new frame with a fresh 0..n-1 index; the index read from the file
# is discarded. reset_index(drop=True) returns a new object, so `data` is left
# untouched, and it does not rely on the old index being unnamed (the previous
# reset_index + drop(columns="index") raised KeyError for a named index).
df = data.reset_index(drop=True)

In [None]:
# Separate the target column from the features, then hold out 20% of the rows
# as a test split (fixed seed so the split is reproducible).
y = df["Transported"]
X = df.drop(columns="Transported")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
def train_model(model):
    """Fit ``model`` on the training split and return its test-set predictions.

    Reads the notebook-level ``X_train``, ``y_train`` and ``X_test`` variables.
    The object passed in is the one that gets fitted; no copy is made here.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    Whatever ``model.predict(X_test)`` returns.
    """
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
def metric(pred, model_name):
    """Build a one-column DataFrame of test-set scores for a model.

    Parameters
    ----------
    pred : predictions, scored against the notebook-level ``y_test``.
    model_name : label of the resulting column (converted with ``str``).

    Returns
    -------
    pandas.DataFrame
        Rows "F1", "Accuracy", "Recall", "Precision" (in that order) and a
        single column named after the model.
    """
    # (row label, scoring function) pairs, in display order.
    scorers = (
        ("F1", f1_score),
        ("Accuracy", accuracy_score),
        ("Recall", recall_score),
        ("Precision", precision_score),
    )
    scores = {
        label: scorer(y_true=y_test, y_pred=pred) for label, scorer in scorers
    }
    return pd.Series(scores, name=str(model_name)).to_frame()


In [None]:
def merge(df1, df2):
    """Join two frames side by side on their indexes.

    Only index labels present in both frames are kept (inner join).

    Parameters
    ----------
    df1 : left frame.
    df2 : right frame.

    Returns
    -------
    pandas.DataFrame holding the columns of ``df1`` followed by those of ``df2``.
    """
    return pd.merge(
        left=df1,
        right=df2,
        how="inner",
        left_index=True,
        right_index=True,
    )

In [None]:
def cf(pred):
    """Show the confusion matrix of ``pred`` against ``y_test`` as a heatmap.

    ``y_test`` is the notebook-level test target. Cells are annotated with
    their counts; the figure is displayed and nothing is returned.
    """
    matrix = confusion_matrix(y_true=y_test, y_pred=pred)
    heatmap_style = dict(
        annot=True,
        fmt=".0f",
        cmap="Blues",
        cbar=False,
        linewidths=0.8,
        linecolor="black",
    )
    ax = sns.heatmap(matrix, **heatmap_style)
    ax.set(
        xlabel='Prédictions',
        ylabel='Valeurs réelles',
        title="Confusion Matrix",
    )
    plt.show()
    


In [None]:
def learning_c(model):
    """Plot mean training and validation scores of ``model`` versus training size.

    Scores come from ``learning_curve`` run on the notebook-level ``X_train``
    and ``y_train`` with 5-fold cross-validation, at 10 evenly spaced
    training-set fractions from 0.1 to 1.0. The figure is displayed and
    nothing is returned.
    """
    fractions = np.linspace(0.1, 1.0, 10)
    sizes, fit_scores, val_scores = learning_curve(
        model, X_train, y_train, train_sizes=fractions, cv=5
    )

    plt.figure(figsize=(8, 6))
    # Average along axis 1 so that each training size yields a single point.
    plt.plot(sizes, fit_scores.mean(axis=1), label="Score d'entraînement moyen")
    plt.plot(sizes, val_scores.mean(axis=1), label='Score de validation moyen')
    plt.xlabel("Taille de l'ensemble d'entraînement")
    plt.ylabel('Score')
    plt.title("Courbes d'apprentissage")
    plt.legend()
    plt.show()


In [None]:
# Baseline: get the predictions of a most-frequent-class dummy classifier:
dummy = DummyClassifier(strategy="most_frequent",random_state=42)
yp_dummy = train_model(dummy)
# Build the dataframe holding the dummy's metrics:
df_dummy = metric(yp_dummy, "Dummy")

In [None]:
# Confusion matrix of the dummy baseline; its predictions are stored in
# `yp_dummy` (the previous name `ypred_dummy` was never defined).
cf(yp_dummy)

In [None]:
# Learning curves of the dummy baseline.
learning_c(dummy)

In [None]:
# Candidate classifiers, each created with random_state=42 and otherwise default settings.
svc = SVC(random_state=42)
lr = LogisticRegression(random_state=42)
sgdc = SGDClassifier(random_state=42)

In [None]:
# Get the SVC predictions on the test split:
yp_svc = train_model(svc)
# Build the dataframe holding the SVC metrics:
df_svc = metric(yp_svc, "SVC")
# Merge the results obtained with the dummy and the SVC:
merged_metrics = merge(df_dummy, df_svc)
cf(yp_svc)

In [None]:
# Learning curves of the SVC.
learning_c(svc)

In [None]:
# Get the logistic regression predictions on the test split:
yp_lr = train_model(lr)
# Build the dataframe holding the logistic regression metrics:
df_lr = metric(yp_lr, "LogisticRegression")
# Merge these results into the previously merged metrics:
merged_metrics = merge(merged_metrics, df_lr)
cf(yp_lr)

In [None]:
# Learning curves of the logistic regression.
learning_c(lr)

In [None]:
# Get the SGD classifier predictions on the test split:
yp_sgdc = train_model(sgdc)
# Build the dataframe holding the SGD classifier metrics:
df_sgdc = metric(yp_sgdc, "SGDClassifier")
# Merge these results into the previously merged metrics:
merged_metrics = merge(merged_metrics, df_sgdc)
cf(yp_sgdc)

In [None]:
# Learning curves of the SGD classifier.
learning_c(sgdc)

In [None]:
# Side-by-side metrics of all evaluated models (one column per model).
merged_metrics

#### Cross-validation

In [None]:
# Cross-validated scores of the SVC on the training split (cv and scoring left at their defaults).
cross_val_score(svc, X_train,y_train )