Prétraitement des données & formation sur les modèles

In [18]:
import numpy as np
import pandas as pd

X_train = pd.read_csv('../data_input/train.csv')
X_predict = pd.read_csv('../data_input/predict.csv')

1. Traitement les données en double

In [19]:
# Supprimer les lignes en double
X_train=X_train.drop_duplicates()
print(X_train.shape)

X_predict=X_predict.drop_duplicates()
print(X_predict.shape)

(58864, 129)
(500, 128)


2. Traitement de la valeur manquante

In [20]:
# Séparez les labels
y_train = X_train['Response']
del X_train['Response']

In [21]:
def description(df):
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    # Comptez le nombre de valeurs manquantes dans chaque colonne
    summary['Missing'] = df.isnull().sum().values    
    # Comptez le nombre de valeurs différentes dans chaque colonne, à l'exclusion des valeurs nulles
    summary['Uniques'] = df.nunique().values
    return summary

In [22]:
des_train = description(X_train)
des_train['Missing'] = des_train['Missing']/X_train.shape[0]
print(des_train[des_train['Missing']!=0].sort_values(by=['Missing'],ascending=False)[['Name','Missing']])

# si le taux manquant > 70%, nous supprimons cette caractère
cols = des_train[des_train['Missing']>=0.7]['Name']

print (cols)

X_train = X_train.drop(cols, axis = 1)

                   Name   Missing
48   Medical_History_10  0.990639
70   Medical_History_32  0.981313
62   Medical_History_24  0.935954
24        InsuredInfo_8  0.879162
53   Medical_History_15  0.750917
25        InsuredInfo_9  0.749354
38        Family_Hist_5  0.703979
36        Family_Hist_3  0.576651
35        Family_Hist_2  0.482689
30  Insurance_History_5  0.427715
37        Family_Hist_4  0.323203
16    Employment_Info_6  0.182658
39    Medical_History_1  0.149650
14    Employment_Info_4  0.114077
11    Employment_Info_1  0.000323
24         InsuredInfo_8
25         InsuredInfo_9
38         Family_Hist_5
48    Medical_History_10
53    Medical_History_15
62    Medical_History_24
70    Medical_History_32
Name: Name, dtype: object


In [23]:
print(X_train.shape)

(58864, 121)


In [24]:
# pour l'autre caractères manquante, nous remplaçons NaN par la valeur de Moyenne
cols = des_train[des_train['Missing']<0.7]['Name']
X_train[cols] = X_train[cols].fillna(X_train[cols].mean())

print(X_train.isnull().sum().values)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


In [25]:
des_predict = description(X_predict)

des_predict['Missing'] = des_predict['Missing']/X_predict.shape[0]
print(des_predict[des_predict['Missing']!=0].sort_values(by=['Missing'],ascending=False)[['Name','Missing']])

cols = des_predict[des_predict['Missing']>=0.7]['Name']
print ("*****Caractère supprimée*****")
print (cols)

X_predict = X_predict.drop(cols, axis = 1)

                   Name  Missing
48   Medical_History_10    0.988
70   Medical_History_32    0.986
62   Medical_History_24    0.942
24        InsuredInfo_8    0.902
53   Medical_History_15    0.756
25        InsuredInfo_9    0.742
38        Family_Hist_5    0.718
36        Family_Hist_3    0.578
35        Family_Hist_2    0.468
30  Insurance_History_5    0.432
37        Family_Hist_4    0.310
16    Employment_Info_6    0.202
39    Medical_History_1    0.160
14    Employment_Info_4    0.124
*****Caractère supprimée*****
24         InsuredInfo_8
25         InsuredInfo_9
38         Family_Hist_5
48    Medical_History_10
53    Medical_History_15
62    Medical_History_24
70    Medical_History_32
Name: Name, dtype: object


In [26]:
print(X_predict.shape)
# on laisse tomber la même caractères !!!

(500, 121)


In [27]:
cols = des_predict[des_predict['Missing']<0.7]['Name']
X_predict[cols] = X_predict[cols].fillna(X_predict[cols].mean())
print(X_predict.isnull().sum().values)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


3. Normalisation et standardisation des données

In [28]:
X_train.info()

des = description(X_train)
print(des[des['dtypes']=='object'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58864 entries, 0 to 58880
Columns: 121 entries, Product_Info_1 to Medical_Keyword_48
dtypes: float64(13), int64(106), object(2)
memory usage: 54.8+ MB
              Name  dtypes  Missing  Uniques
1   Product_Info_2  object        0       19
23   InsuredInfo_7  object        0        2


In [29]:
# Numérisation des caractères : one-hot code
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)

X_train = vec.fit_transform(X_train.to_dict(orient='record'))

In [30]:
X_predict.info()

des = description(X_predict)
print(des[des['dtypes']=='object'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Columns: 121 entries, Product_Info_1 to Medical_Keyword_48
dtypes: float64(13), int64(106), object(2)
memory usage: 476.6+ KB
              Name  dtypes  Missing  Uniques
1   Product_Info_2  object        0       16
23   InsuredInfo_7  object        0        2


In [31]:
X_predict = vec.transform(X_predict.to_dict(orient='record'))

In [32]:
# Standardization des données

from sklearn.preprocessing import StandardScaler

ss= StandardScaler()
X_train = ss.fit_transform(X_train)
X_predict = ss.transform(X_predict)

4. Sélection des caractères

5. Formation de modèle

In [35]:
#Split data set
'''
from sklearn.model_selection import train_test_split

digits_data = X_train
digits_target = y_train

X_train,X_test,y_train,y_test = train_test_split(digits_data,digits_target,test_size=0.2,random_state=42)
'''

# Parce que le type de distribution label n'est pas uniforme,
# j'ai utilisé l'échantillonnage stratifié

digits_data = X_train
digits_target = y_train

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

for train_index, test_index in split.split(digits_data, digits_target):
    X_train, X_test = digits_data[train_index], digits_data[test_index]
    y_train, y_test = digits_target[train_index], digits_target[test_index]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


PCA dimensionality reduction

In [None]:
from sklearn.decomposition import PCA

estimator = PCA(n_components=100)
pca_X_train = estimator.fit_transform(X_train)
pca_X_test = estimator.transform(X_test)

5.1 LinearSVC

In [78]:
from sklearn.svm import LinearSVC

lsvc = LinearSVC()
lsvc.fit(X_train,y_train)

y_predict = lsvc.predict(X_test)

print(lsvc.score(X_test,y_test))

from sklearn.metrics import classification_report

print(classification_report(y_test,y_predict))

In [None]:
lsvc.fit(pca_X_train,y_train)

pca_y_predict = lsvc.predict(pca_X_test)

print(lsvc.score(pca_X_test,y_test))

from sklearn.metrics import classification_report

print(classification_report(y_test,pca_y_predict))

5.2 GBDT

In [80]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_predict = gbc.predict(X_test)

print(gbc.score(X_test, y_test))

from sklearn.metrics import classification_report
 
print(classification_report(y_test, gbc_y_predict))

0.5454792473337409
              precision    recall  f1-score   support

           1       0.51      0.24      0.33      1540
           2       0.46      0.26      0.33      1624
           3       0.19      0.03      0.05       252
           4       0.35      0.12      0.18       355
           5       0.57      0.56      0.57      1347
           6       0.41      0.52      0.46      2784
           7       0.45      0.35      0.39      1987
           8       0.66      0.89      0.75      4832

    accuracy                           0.55     14721
   macro avg       0.45      0.37      0.38     14721
weighted avg       0.52      0.55      0.52     14721



In [22]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(pca_X_train, y_train)
gbc_y_predict = gbc.predict(pca_X_test)

print(gbc.score(pca_X_test, y_test))

from sklearn.metrics import classification_report

print(classification_report(y_test, gbc_y_predict))

0.4677761738982763
              precision    recall  f1-score   support

           1       0.40      0.26      0.31      1241
           2       0.38      0.21      0.27      1336
           3       0.11      0.02      0.03       206
           4       0.17      0.03      0.05       279
           5       0.45      0.28      0.35      1060
           6       0.33      0.41      0.37      2186
           7       0.35      0.21      0.26      1575
           8       0.58      0.87      0.69      3894

    accuracy                           0.47     11777
   macro avg       0.35      0.28      0.29     11777
weighted avg       0.43      0.47      0.43     11777

