In [37]:
import sklearn
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
# ^^^ pyforest auto-imports - don't write above this line
import pyforest
import os
import warnings
import sklearn

#preparation des données
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

#modeles :
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier

#mesure performance : 
from sklearn import metrics
warnings.filterwarnings("ignore")

In [2]:
# Read the data: break the current working directory into its components.
# The separator is the Windows backslash, so this assumes a Windows-style path.
path1 = os.getcwd().split("\\")
path1

['D:',
 'etude_data_science',
 'Kaggle_competition',
 '02_Binary_Prediction_of_Smoker_Status_using_Bio-Signals',
 'notebook']

In [5]:
# Dataset folder: drop the last component of the working directory and
# append "dataset" (backslash-separated, like path1).
path2 = "{}\\dataset".format("\\".join(path1[:-1]))

In [6]:
# List the files available in the dataset directory built above.
os.listdir(path2)

['data_cleaned.csv',
 'playground-series-s3e24.zip',
 'sample_submission.csv',
 'test.csv',
 'train.csv']

In [7]:
# Load the cleaned dataset, using the first CSV column as the index.
# NOTE(review): the rendered preview shows an "Unnamed: 0" header - confirm it
# is only the index and not a leftover column that would end up in the features.
data = pd.read_csv("{}\\data_cleaned.csv".format(path2), index_col=0)
# Keep `data` untouched and work on an independent copy.
df = data.copy(deep=True)
df.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,hearing(left),hearing(right),Urine protein,dental caries,smoking
0,-3.606746,0.755803,-1.218238,-0.401488,-0.42785,-0.212524,0.734177,0.772676,-1.172399,0.307489,-0.006818,0.700179,1,1,1,0,1
1,-1.181505,4.401603,-2.490884,-1.117876,0.384461,3.960486,-6.225314,2.27448,-1.635772,3.977569,0.548055,1.063149,2,2,1,1,0
2,-2.731105,-1.048645,-0.828711,0.055921,0.090602,1.31663,1.122125,0.630216,0.379617,-0.711981,0.08553,0.998121,1,1,1,0,1
3,-1.369274,1.904712,0.788166,-1.374263,-0.530577,-0.64603,0.669578,0.086063,-0.155953,-0.352081,0.210692,0.88949,1,1,1,1,0
4,1.614225,-1.95846,-0.953814,-0.379936,0.165742,-0.792837,1.132931,1.293741,-0.738215,-0.599433,0.329744,-0.55903,1,1,1,0,1


In [8]:
# Work on a 1000-row sample of the dataframe while experimenting with models.
# Sampling from `data` (rather than re-sampling `df`) keeps this cell idempotent
# on re-run, and the fixed seed makes every downstream score repeatable.
# Comment this line out once the experiments are finished to train on the
# whole dataframe (`df` is then the full copy of `data` created above).
df = data.sample(n=1000, random_state=42)

In [21]:
# Features: every column except the target; target: the "smoking" column.
X = df.drop(columns="smoking")
y = df["smoking"]

In [22]:
# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Plan : 

## 1. Création des fonctions de modélisation et préparation des modèles

## 2. Entrainement des modèles

## 3. Résultats

## 4. Sélection des modèles et amélioration par GridSearchCV

## 5. Visualisation des performances

# 1. Création des fonctions de modélisation et préparation des modèles

In [18]:
# Candidate classifiers to compare, from the trivial baseline to an ensemble.
# The stochastic estimators (SGD, random forest) get the same seed as the
# train/test split so that repeated runs of the notebook give the same scores.
all_model = [DummyClassifier(),
             LogisticRegression(),
             KNeighborsClassifier(),
             SGDClassifier(random_state=42),
             SVC(),
             RandomForestClassifier(random_state=42)
            ]

In [50]:
### Création d'une fonction d'entrainement pour un modèle donné

def train_model(model):
    """Cross-validate ``model``, then fit it and predict on the test split.

    The estimator is scored with 5-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train`` and the mean of the fold scores is printed,
    rounded to two decimals. The same estimator object is then fitted on the
    whole training split.

    Parameters
    ----------
    model : estimator
        Object accepted by ``cross_val_score`` that exposes ``fit`` and
        ``predict``. It is fitted in place.

    Returns
    -------
    The predictions of the fitted model for the notebook-level ``X_test``.
    """
    fold_scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
    mean_score = fold_scores.mean()
    print("Score moyen validation croisée :", mean_score.round(2))

    # cross_val_score leaves `model` unfitted here, so fit it explicitly
    # before predicting on the held-out rows.
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [31]:
### Création d'une fonction qui renvoie l'ensemble des métriques :

def metric(prediction, model_name):
    """Summarise the main classification metrics of one model.

    Each metric is computed against the notebook-level ``y_test``.

    Parameters
    ----------
    prediction : array-like
        Predicted labels, compared with ``y_test``.
    model_name : object
        Label of the model; ``str(model_name)`` becomes the column name.

    Returns
    -------
    pandas.DataFrame
        A single column named after the model, indexed by
        ``"F1"``, ``"accuracy"``, ``"recall"`` and ``"precision"``.
    """
    scorers = {
        "F1": metrics.f1_score,
        "accuracy": metrics.accuracy_score,
        "recall": metrics.recall_score,
        "precision": metrics.precision_score,
    }
    # Dict insertion order fixes the row order of the resulting frame.
    values = {
        label: scorer(y_true=y_test, y_pred=prediction)
        for label, scorer in scorers.items()
    }
    return pd.Series(values, name=str(model_name)).to_frame()
    
    
    

In [32]:
### Fonction permettant de fusionner deux dataframes sur leur index :
def merge(df1, df2):
    """Join two dataframes side by side on their indexes.

    Only index labels present in both frames are kept (pandas' default
    inner join); the columns of ``df1`` come first, followed by ``df2``'s.
    """
    return df1.merge(df2, left_index=True, right_index=True)

In [None]:
### fonction pour créer une matrice de confusion : 

def matrice_confusion(prediction):
    """Draw the confusion matrix of ``prediction`` as an annotated heatmap.

    The matrix is computed against the notebook-level ``y_test``; rows are the
    true values and columns the predictions. The figure is shown immediately
    and nothing is returned.
    """
    counts = metrics.confusion_matrix(y_true=y_test, y_pred=prediction)

    fig, ax = plt.subplots(figsize=(4, 4))
    sns.heatmap(
        counts,
        annot=True,
        fmt=".0f",
        cmap="Blues",
        cbar=False,
        linewidths=0.8,
        linecolor="black",
        ax=ax,
    )
    ax.set_xlabel('Prédictions')
    ax.set_ylabel('Valeurs réelles')
    ax.set_title("Confusion Matrix")
    plt.show()


In [42]:
### Fonction pour obtenir la learning curve d'un modèle : 
def learning_curv(model):
    """Plot the learning curve of ``model`` on the training split.

    ``learning_curve`` is run with 5-fold cross-validation on the
    notebook-level ``X_train`` / ``y_train``. The training and validation
    scores are averaged over the folds for each training-set size and drawn
    on one figure, which is shown immediately.

    Parameters
    ----------
    model : estimator
        Object accepted by ``sklearn.model_selection.learning_curve``.

    Returns
    -------
    None
    """
    # Imported locally: the notebook's import cell does not bring
    # `learning_curve` into scope, so the call would otherwise raise a
    # NameError on a fresh kernel.
    from sklearn.model_selection import learning_curve

    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train, y_train, cv=5
    )

    # Mean score per training-set size (axis 1 holds the CV folds).
    train_scores_mean = train_scores.mean(axis=1)
    test_scores_mean = test_scores.mean(axis=1)

    # Draw both curves on the same axes.
    plt.figure(figsize=(4, 3))
    plt.plot(train_sizes, train_scores_mean, label='Score d\'entraînement moyen')
    plt.plot(train_sizes, test_scores_mean, label='Score de validation moyen')
    plt.xlabel('Taille de l\'ensemble d\'entraînement')
    plt.ylabel('Score')
    plt.title('Courbes d\'apprentissage')
    plt.legend()
    plt.show()
    


In [44]:
### Fonction pour obtenir la courbe ROC : 

def ROC(prediction):
    """Print the ROC AUC and plot the ROC curve against ``y_test``.

    Parameters
    ----------
    prediction : array-like
        Values handed to ``metrics.roc_curve`` as the score argument, compared
        with the notebook-level ``y_test``.
        NOTE(review): hard class labels (e.g. from ``model.predict``) give a
        single-threshold, three-point curve; continuous scores such as
        ``predict_proba(...)[:, 1]`` or ``decision_function`` output are
        needed for a full curve - confirm what callers pass.

    Returns
    -------
    None
    """
    FP_rate, TP_rate, _thresholds = metrics.roc_curve(y_test, prediction)
    roc_auc = metrics.auc(FP_rate, TP_rate)
    # Built-in round works whether `auc` returns a NumPy or a Python float.
    print("ROC_AUC =", round(float(roc_auc), 2))

    plt.figure(figsize=(6, 6))

    # Model curve. Plain matplotlib is used on purpose: seaborn's lineplot
    # sorts and mean-aggregates y values sharing the same x by default, which
    # would flatten the vertical steps of a ROC curve.
    # The AUC is a 0-1 value, so it is shown without a "%" suffix.
    plt.plot(FP_rate, TP_rate, color="orange", label=f"AUC = {roc_auc:.2f}")

    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], linestyle="--", color="r", label="Random Classifier")

    plt.ylabel("True Positive rate")
    plt.xlabel("False Positive rate")
    plt.title("ROC")
    plt.legend(loc="upper left")
    plt.show()