In [6]:
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [7]:
def preprocessing_df(train):
    """
    Preprocess a data set with the steps used when the models were built.

    The steps are, in order: pooling of the infrequent
    ``Policy_Sales_Channel`` / ``Region_Code`` levels, recoding of
    ``Gender`` and ``Vehicle_Damage``, one-hot encoding, winsorization and
    standardisation of the quantitative variables, and selection of the
    features expected by the model.

    Every data-dependent parameter (frequent levels, winsorization caps,
    scaling statistics) is computed from ``./groupe_2_train.csv``, so the
    result for a given row does not depend on the other rows being
    scored with it.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to preprocess. It must provide the columns
        "Policy_Sales_Channel", "Region_Code", "Gender", "Vehicle_Damage",
        "Vehicle_Age", "Age", "Annual_Premium", "Vintage" and
        "Previously_Insured". The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected features, in a fixed order.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    """

    base_data = pd.read_csv("./groupe_2_train.csv")

    # Work on a copy so the caller's frame is never altered.
    features = train.copy()

    ## Pooling of the infrequent Policy_Sales_Channel and Region_Code levels.
    ## The frequent levels are read from the training file (assumed to carry
    ## the same two columns as the frame being scored) so that the mapping is
    ## the same whatever batch is passed in; any level outside the most
    ## frequent ones, including one never seen in training, goes to the
    ## "other" code.
    pooling_rules = (("Policy_Sales_Channel", 10, 999),
                     ("Region_Code", 30, 888))
    for column, n_kept, other_code in pooling_rules:
        frequent_levels = base_data[column].value_counts().index[:n_kept]
        is_frequent = features[column].isin(frequent_levels)
        features[column] = features[column].where(is_frequent, other_code)

    ## Recoding of Gender and Vehicle_Damage
    features["Gender"] = features["Gender"].replace({"Male": 1, "Female": 0})
    features["Vehicle_Damage"] = features["Vehicle_Damage"]\
        .replace({"Yes": 1, "No": 0})

    ## One-hot encoding of Vehicle_Age, Policy_Sales_Channel and Region_Code
    features = features.astype({"Region_Code": str,
                                "Policy_Sales_Channel": str,
                                "Vehicle_Age": str})
    features = pd.get_dummies(features)
    features = features.rename(
        columns={"Vehicle_Age_< 1 Year": "Vehicle_Age_Inf 1 Year",
                 "Vehicle_Age_> 2 Years": "Vehicle_Age_Sup 2 Years"})

    ## Winsorization (upper tail only) then standardisation of the
    ## quantitative variables; the value is the quantile used as the cap.
    upper_quantiles = {"Age": .98, "Annual_Premium": .97, "Vintage": .98}
    for column, level in upper_quantiles.items():
        cap = np.quantile(base_data[column], level)
        features[column] = features[column].clip(upper=cap)

        # NOTE(review): the scaler is fitted on the raw training column,
        # not on its winsorized version -- confirm that this matches the
        # way the model was trained.
        scaler = StandardScaler().fit(base_data[[column]])
        features[column] = scaler.transform(features[[column]]).ravel()

    ## Feature selection
    selected_features = ["Age", "Annual_Premium", "Vintage",
                         "Gender", "Policy_Sales_Channel_26",
                         "Vehicle_Damage", "Policy_Sales_Channel_124",
                         "Region_Code_28", "Vehicle_Age_1-2 Year",
                         "Policy_Sales_Channel_152", "Region_Code_8",
                         "Policy_Sales_Channel_999", "Policy_Sales_Channel_157",
                         "Policy_Sales_Channel_154", "Policy_Sales_Channel_156",
                         "Region_Code_46", "Previously_Insured",
                         "Vehicle_Age_Sup 2 Years",
                         "Policy_Sales_Channel_160", "Region_Code_888"]

    # A dummy column only exists when its level occurs in the scored frame
    # (a small batch rarely contains every level): create the absent ones
    # filled with 0. Missing non-dummy columns still raise KeyError below.
    dummy_prefixes = ("Policy_Sales_Channel_", "Region_Code_", "Vehicle_Age_")
    for name in selected_features:
        if name not in features.columns and name.startswith(dummy_prefixes):
            features[name] = 0

    return features[selected_features]

In [8]:
def cross_sell(df):
    """
    Predict whether each person will subscribe to a car insurance or not.

    The frame is preprocessed with ``preprocessing_df``, scored with the
    model stored in ``./model_lgb.sav``, and the predictions are written
    to ``./groupe_2_prediction.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame must have all the columns of the test set handed out
        by the teacher, including "id".

    Returns
    -------
    pandas.DataFrame
        Frame indexed by "id" with a single "Response" column holding the
        model predictions.
    """
    db = preprocessing_df(df)

    # NOTE: joblib.load unpickles the file, which can run arbitrary code;
    # only load a model file that comes from a trusted source.
    path = "./model_lgb.sav"
    model = joblib.load(path)

    pred = model.predict(db)

    # Attach the predictions by position. Assigning a DataFrame/Series here
    # would align on index labels and give NaN or shifted values whenever
    # `df` does not carry a default 0..n-1 index (e.g. a filtered frame).
    dataF = pd.DataFrame(df["id"])
    dataF["Response"] = np.asarray(pred)
    dataF = dataF.set_index("id")

    ## Creation of the csv file
    dataF.to_csv("./groupe_2_prediction.csv")
    return dataF


In [9]:
# Load the test set to score; cross_sell reads its "id" column and passes
# the whole frame to preprocessing_df.
df = pd.read_csv("./groupe_2_test.csv")

In [10]:
# Score the test set: the call writes ./groupe_2_prediction.csv and the
# returned frame is displayed as the cell output.
cross_sell(df)

Unnamed: 0_level_0,Response
id,Unnamed: 1_level_1
1,1
2,1
3,0
4,0
5,0
...,...
9149,0
9150,1
9151,1
9152,1
