In [1]:
import numpy as np
# Print arrays of up to 10000 elements in full and use fixed-point notation
# instead of scientific notation for small floats.
np.set_printoptions(threshold=10000,suppress=True)
import pandas as pd
import warnings
import matplotlib.pyplot as plt
# Silences every warning category from this point on, for the whole session.
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
#############################################################################
# Partie 1 : Apprentissage supervisé : Feature engineering et Classification#
#############################################################################

In [3]:
###################
# Load the data   #
###################
import os

# The workbook location defaults to the original desktop path; set the
# ASSURANCE_DATA_PATH environment variable to run the notebook on a machine
# where the file lives elsewhere, without editing this cell.
chemin_donnees = os.environ.get("ASSURANCE_DATA_PATH", "C:/Users/HP/Desktop/AssuranceData.xlsx")
donnees = pd.read_excel(chemin_donnees)
donnees.head()

Unnamed: 0,Age,Revenu mensuel en euro,Cotisation annuelle en euros,Duree Contrat par jour,Type d Assurance,Profession,Situation Familiale,Client ou pas
0,44,2500,648.0,365,auto,Salarié,Marié,Oui
1,30,2100,1330.8,365,auto,Salarié,Marié,Oui
2,78,1500,441.6,365,auto,Retraité,Marié,Oui
3,30,2000,60.0,15,auto_temporaire,Salarié,Célibataire,Non
4,45,3000,1332.0,365,mutuelle,Cadre,Marié,Oui


In [4]:
# Share of each target class (proportions rather than raw counts).
donnees['Client ou pas'].value_counts(normalize=True)

Oui    0.574074
Non    0.425926
Name: Client ou pas, dtype: float64

In [5]:
# Convert the whole frame (features and target) to a NumPy array.
X = donnees.to_numpy()

In [6]:
# Split the feature columns of X by kind; the lists hold column positions in X.
# NOTE(review): these positions assume X has no extra leading index column —
# confirm against donnees.columns.
col_num = [0, 1, 2, 3]  # numeric feature columns
col_cat = [4, 5, 6]     # categorical feature columns
# Independent copies, so later processing never writes back into X.
X_num = np.array(X[:, col_num])
X_cat = np.array(X[:, col_cat])

In [7]:
# Standardise the numeric block; the fitted scaler stays available as SS.
# NOTE(review): both transformers in this cell are fitted on the full dataset,
# i.e. before the train/test split done later in the notebook, so held-out rows
# contribute to the scaling statistics and the category list — consider fitting
# on the training rows only.
SS = StandardScaler()
X_num_normalisee = SS.fit_transform(X_num)
# One-hot encode the categorical block and convert the sparse result to a dense array.
X_cat_bin = (
    OneHotEncoder()
    .fit_transform(X_cat)
    .toarray()
)

In [8]:
# Final design matrix: scaled numeric columns followed by the one-hot columns.
XX = np.hstack([X_num_normalisee, X_cat_bin])

In [9]:
# Target = last column of X. Basic slicing returns a view, so any in-place
# write into Y also changes X.
Y=X[:,-1]

In [10]:
# Encode the target: 0 for 'Non', 1 for any other label.
# The labels are read from X and Y is rebound to a new array, so X is left
# untouched and re-running this cell always gives the same result (an
# element-wise in-place rewrite of Y would map every label to 1 on a second run).
Y = np.where(X[:, -1] == 'Non', 0, 1)

In [11]:
# Cast the encoded labels to 64-bit floats, then display the dtype as a check.
Y = Y.astype(np.float64)
Y.dtype

dtype('float64')

In [12]:
#Importer les librairies nécessaires pour faire nos algorithmes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier , AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score,precision_score, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [13]:
# Hold out half of the rows for the final evaluation so the reported result is
# not biased by the training data; the fixed seed makes the split reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    XX,
    Y,
    test_size=0.5,
    random_state=1,
)

In [14]:
# Candidate models keyed by a short display name; every estimator that accepts
# a seed here is given random_state=1.
clfs = {
    'RF': RandomForestClassifier(n_estimators=200, random_state=1),      # random forest
    'KNN': KNeighborsClassifier(n_neighbors=10),                         # k-nearest neighbours
    'MLP': MLPClassifier(hidden_layer_sizes=(20, 10), random_state=1),   # two hidden layers
    'BGC': BaggingClassifier(n_estimators=200, random_state=1),          # bagging
    'AB': AdaBoostClassifier(n_estimators=200, random_state=1),          # AdaBoost
    'AC': DecisionTreeClassifier(criterion='gini', random_state=1),      # tree, Gini criterion
    'AID3': DecisionTreeClassifier(criterion='entropy', random_state=1), # tree, entropy criterion
    'NBS': GaussianNB(),                                                 # Gaussian naive Bayes
    'LR': LogisticRegression(),                                          # logistic regression
}

In [15]:
from sklearn.model_selection import KFold, cross_validate # ,cross_val_score
# 10-fold shuffled splitter with a fixed seed.
# NOTE(review): kf is not passed to any cross-validation call in the cells
# shown here (they all use cv=5) — confirm whether it is still needed.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
import time

def run_classifiers(clfs,Xtrain,Ytrain,cv=5):
    """Cross-validate each classifier and report a combined accuracy/precision score.

    Every estimator in ``clfs`` is cross-validated once, collecting accuracy and
    precision on the same folds. The reported score is the arithmetic mean of
    the fold-averaged accuracy and the fold-averaged precision.

    Parameters
    ----------
    clfs : dict
        Maps a display name to a scikit-learn classifier.
    Xtrain : array-like
        Feature matrix used for cross-validation.
    Ytrain : array-like
        Target aligned with ``Xtrain``.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``cross_validate``; the default keeps the
        5-fold setup this function always used.

    Returns
    -------
    dict
        Display name -> combined score, in the iteration order of ``clfs``.
        The same scores are also printed, one line per classifier.
    """
    combined_scores = {}
    for name, clf in clfs.items():
        # A single pass yields both metrics on identical folds, so each model
        # is fitted once per fold instead of once per fold and per metric.
        fold_results = cross_validate(clf, Xtrain, Ytrain, cv=cv, scoring=['accuracy', 'precision'])
        score = (np.mean(fold_results['test_accuracy']) + np.mean(fold_results['test_precision'])) / 2
        combined_scores[name] = score
        print("\n \n the score for {0} is: {1:.3f}".format(name, score))
    return combined_scores

In [16]:
# Compare all candidate models by cross-validation on the training split only.
run_classifiers(clfs,Xtrain,Ytrain)


 
 the score for RF is: 0.957

 
 the score for KNN is: 0.721

 
 the score for MLP is: 0.747

 
 the score for BGC is: 0.951

 
 the score for AB is: 0.905

 
 the score for AC is: 0.944

 
 the score for AID3 is: 0.951

 
 the score for NBS is: 0.777

 
 the score for LR is: 0.744


In [17]:
# Hyper-parameter search for the random forest, selected on 5-fold accuracy.
Notre_modele = RandomForestClassifier(random_state=1)
param_grid = {
    'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'criterion': ('entropy', 'gini'),
}
grid = GridSearchCV(
    estimator=Notre_modele,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(Xtrain, Ytrain)
print(grid.best_params_)

{'criterion': 'entropy', 'n_estimators': 200}


In [18]:
# Keep the estimator built with the best parameter combination found by the grid search.
Notre_modele=grid.best_estimator_

In [19]:
# Fit the selected model on the training split and evaluate it on the held-out split.
# NOTE(review): if the grid search already refitted its best estimator on
# (Xtrain, Ytrain), this fit repeats that work — confirm before removing.
Notre_modele.fit(Xtrain, Ytrain)
YNotre_modele = Notre_modele.predict(Xtest)
# Reported score = mean of test accuracy and test precision, as a percentage.
exactitude_test = accuracy_score(Ytest, YNotre_modele)
precision_test = precision_score(Ytest, YNotre_modele)
print("the score is {0:.2f}%".format(((exactitude_test + precision_test) / 2) * 100))

the score is 99.38%
