Prétraitement des données & entraînement des modèles

In [1]:
import numpy as np
import pandas as pd

# Load the training data and the data to predict on from CSV files.
X_train = pd.read_csv('../data_input/train.csv')
X_predict = pd.read_csv('../data_input/predict.csv')

1. Traitement des données en double

In [2]:
# Drop exact duplicate rows (keeping the first occurrence) from both frames,
# then report the remaining shapes: training set first, prediction set second.
X_train = X_train.drop_duplicates(keep='first')
X_predict = X_predict.drop_duplicates(keep='first')

print(X_train.shape, X_predict.shape, sep='\n')

(58864, 129)
(500, 128)


2. Traitement des valeurs manquantes

In [3]:
# Split the target off the feature frame: pop() returns the 'Response' column
# and removes it from X_train in one step.
y_train = X_train.pop('Response')

In [4]:
def description(df):
    """Build a per-column summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order, default integer index)
        with the columns:

        * ``Name``    - column name
        * ``dtypes``  - column dtype
        * ``Missing`` - number of null values in the column
        * ``Uniques`` - number of distinct non-null values in the column
    """
    # dtypes is indexed by column name; reset_index() exposes those names as an
    # 'index' column, which is then renamed to 'Name'.
    names_and_types = (
        df.dtypes.to_frame(name='dtypes')
        .reset_index()
        .rename(columns={'index': 'Name'})
    )[['Name', 'dtypes']]

    # Both counts come back in column order, so they can be attached positionally.
    return names_and_types.assign(
        Missing=df.isna().sum().to_numpy(),
        Uniques=df.nunique().to_numpy(),
    )

In [5]:
# Summarise the training features and turn the missing counts into missing rates.
des_train = description(X_train)
des_train['Missing'] = des_train['Missing'].div(X_train.shape[0])

# Report every feature with at least one missing value, highest rate first.
print(
    des_train.loc[des_train['Missing'].ne(0)]
    .sort_values(by='Missing', ascending=False)
    .loc[:, ['Name', 'Missing']]
)

# Features whose missing rate is 70% or more are removed.
cols = des_train.loc[des_train['Missing'].ge(0.7), 'Name']

print(cols)

X_train = X_train.drop(columns=cols)

                   Name   Missing
48   Medical_History_10  0.990639
70   Medical_History_32  0.981313
62   Medical_History_24  0.935954
24        InsuredInfo_8  0.879162
53   Medical_History_15  0.750917
25        InsuredInfo_9  0.749354
38        Family_Hist_5  0.703979
36        Family_Hist_3  0.576651
35        Family_Hist_2  0.482689
30  Insurance_History_5  0.427715
37        Family_Hist_4  0.323203
16    Employment_Info_6  0.182658
39    Medical_History_1  0.149650
14    Employment_Info_4  0.114077
11    Employment_Info_1  0.000323
24         InsuredInfo_8
25         InsuredInfo_9
38         Family_Hist_5
48    Medical_History_10
53    Medical_History_15
62    Medical_History_24
70    Medical_History_32
Name: Name, dtype: object


In [6]:
# Shape of the training features after the high-missing-rate columns were dropped.
print(X_train.shape)

(58864, 121)


In [7]:
# For the features that were kept, replace NaN with the column mean.
# `cols` also contains the two object-dtype (text) columns, so the mean is
# restricted to numeric columns: pandas >= 2.0 raises TypeError on
# DataFrame.mean() over text columns instead of silently skipping them.
# fillna() with a Series only touches the columns named in its index, so the
# text columns are left unchanged.
cols = des_train[des_train['Missing']<0.7]['Name']
X_train[cols] = X_train[cols].fillna(X_train[cols].mean(numeric_only=True))

# Remaining null count per column (expected to be all zeros).
print(X_train.isnull().sum().values)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


In [8]:
# Summarise the prediction features and report their missing rates, highest first.
des_predict = description(X_predict)

des_predict['Missing'] = des_predict['Missing']/X_predict.shape[0]
print(des_predict[des_predict['Missing']!=0].sort_values(by=['Missing'],ascending=False)[['Name','Missing']])

# Drop exactly the features that are no longer present in X_train instead of
# re-applying the 70% threshold to the prediction set. The two sets have
# different missing rates and some of them sit close to the threshold, so a
# separate threshold test could keep or drop a different set of columns and
# leave X_predict with features the model was not trained on.
cols = des_predict.loc[~des_predict['Name'].isin(X_train.columns), 'Name']
print ("*****Caractère supprimée*****")
print (cols)

X_predict = X_predict.drop(cols, axis = 1)

                   Name  Missing
48   Medical_History_10    0.988
70   Medical_History_32    0.986
62   Medical_History_24    0.942
24        InsuredInfo_8    0.902
53   Medical_History_15    0.756
25        InsuredInfo_9    0.742
38        Family_Hist_5    0.718
36        Family_Hist_3    0.578
35        Family_Hist_2    0.468
30  Insurance_History_5    0.432
37        Family_Hist_4    0.310
16    Employment_Info_6    0.202
39    Medical_History_1    0.160
14    Employment_Info_4    0.124
*****Caractère supprimée*****
24         InsuredInfo_8
25         InsuredInfo_9
38         Family_Hist_5
48    Medical_History_10
53    Medical_History_15
62    Medical_History_24
70    Medical_History_32
Name: Name, dtype: object


In [9]:
print(X_predict.shape)
# The same features end up being dropped as for the training set.

(500, 121)


In [10]:
# Impute the prediction set with the *training* column means so both sets are
# filled with the same values. X_train has already been mean-imputed at this
# point, which leaves its column means unchanged.
# numeric_only=True is required because X_train still holds object-dtype
# columns; pandas >= 2.0 raises TypeError on DataFrame.mean() otherwise.
train_means = X_train.mean(numeric_only=True)

# fillna() with a Series fills each column from the entry with the same name;
# columns without an entry (the text columns) are left as they are.
X_predict = X_predict.fillna(train_means)

# Remaining null count per column (expected to be all zeros).
print(X_predict.isnull().sum().values)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


3. Numérisation des données

In [11]:
# Print the dtype / memory overview, then list the columns whose dtype is object.
X_train.info()

des = description(X_train)
print(des.loc[des['dtypes'].eq('object')])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58864 entries, 0 to 58880
Columns: 121 entries, Product_Info_1 to Medical_Keyword_48
dtypes: float64(13), int64(106), object(2)
memory usage: 54.8+ MB
              Name  dtypes  Missing  Uniques
1   Product_Info_2  object        0       19
23   InsuredInfo_7  object        0        2


In [12]:
# Encode the two text features: integer codes first, then one-hot columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# The same LabelEncoder object is refit for each column, so after this cell it
# only remembers the categories of the last column it was fitted on.
label_encode = LabelEncoder()
X_train['Product_Info_2'] = label_encode.fit_transform(X_train['Product_Info_2'])
X_train['InsuredInfo_7'] = label_encode.fit_transform(X_train['InsuredInfo_7'])

# OneHotEncoder's `categorical_features` argument has been removed from
# scikit-learn (and `sparse` has been renamed), so the column selection is done
# with a ColumnTransformer instead:
#   * the one-hot columns come first and the untouched columns are appended to
#     their right, the same layout the old `categorical_features` option gave;
#   * sparse_threshold=0 forces a dense array, as `sparse=False` did.
# Columns are selected by name rather than by position.
one_hot = ColumnTransformer(
    transformers=[('one_hot', OneHotEncoder(), ['Product_Info_2', 'InsuredInfo_7'])],
    remainder='passthrough',
    sparse_threshold=0,
)

# From here on X_train is a NumPy array, no longer a DataFrame.
X_train = one_hot.fit_transform(X_train)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [13]:
# Print the dtype / memory overview, then list the columns whose dtype is object.
X_predict.info()

des = description(X_predict)
print(des.loc[des['dtypes'].eq('object')])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Columns: 121 entries, Product_Info_1 to Medical_Keyword_48
dtypes: float64(13), int64(106), object(2)
memory usage: 476.6+ KB
              Name  dtypes  Missing  Uniques
1   Product_Info_2  object        0       16
23   InsuredInfo_7  object        0        2


In [14]:
# Encode the prediction set with the category codes of the *training* data.
# Refitting the encoders on X_predict gives the prediction set its own code
# table and a different number of one-hot columns whenever its categories
# differ from the training set's (Product_Info_2 has 16 distinct values here
# versus 19 in the training data).
#
# The shared `label_encode` object is refit for every column it encodes, so the
# training mapping of Product_Info_2 is no longer available at this point; both
# mappings are rebuilt from the training file. LabelEncoder orders its classes
# by sorted unique value, so fitting on the same column reproduces the codes
# that were used for X_train.
train_categories = pd.read_csv('../data_input/train.csv',
                               usecols=['Product_Info_2', 'InsuredInfo_7'])

for col in ['Product_Info_2', 'InsuredInfo_7']:
    label_encode = LabelEncoder().fit(train_categories[col])
    # Raises ValueError if X_predict contains a category never seen in training.
    X_predict[col] = label_encode.transform(X_predict[col])

# transform (not fit_transform): reuse the one-hot layout fitted on the training set.
X_predict = one_hot.transform(X_predict)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


4. Sélection des caractéristiques

In [None]:
from sklearn import feature_selection

# Remove low-variance features (variance threshold 0.005).
# NOTE(review): the selector is fitted and applied to X_train only. X_predict is
# never passed through `sele.transform`, so any column removed here is still
# present in X_predict. TODO: apply the fitted selector to X_predict as well,
# once X_predict is guaranteed to have the same columns as X_train at this point.
sele = feature_selection.VarianceThreshold(threshold=0.005)

X_train = sele.fit_transform(X_train)

5. Standardisation des données

In [15]:
# Standardise the features.

from sklearn.preprocessing import StandardScaler

ss= StandardScaler()
X_train = ss.fit_transform(X_train)
# NOTE(review): fit_transform refits the scaler on X_predict, so the prediction
# set is scaled with its own statistics instead of the ones learned from X_train
# on the line above. ss.transform(X_predict) would reuse the training fit, but it
# requires X_predict to have exactly the same columns as X_train here - TODO
# confirm that and switch.
X_predict = ss.fit_transform(X_predict)


6. Entraînement du modèle

In [16]:
# Hold out 20% of the training data for evaluation.
# The label distribution is not uniform, so the split is stratified on the
# labels to keep the class proportions the same in both parts.
from sklearn.model_selection import train_test_split

digits_data, digits_target = X_train, y_train

# X_train / y_train are rebound to the 80% training part of the split.
X_train, X_test, y_train, y_test = train_test_split(
    digits_data,
    digits_target,
    test_size=0.2,
    random_state=42,
    stratify=digits_target,
)

PCA dimensionality reduction

In [17]:
# Disabled experiment: the PCA code below is wrapped in a string literal, so it
# is not executed; the notebook only echoes the string as the cell's value.
'''
from sklearn.decomposition import PCA

estimator = PCA(n_components=100)
pca_X_train = estimator.fit_transform(X_train)
pca_X_test = estimator.transform(X_test)
'''

'\nfrom sklearn.decomposition import PCA\n\nestimator = PCA(n_components=100)\npca_X_train = estimator.fit_transform(X_train)\npca_X_test = estimator.transform(X_test)\n'

6.1 LinearSVC

In [18]:
# Disabled experiment: the LinearSVC training / evaluation code below is wrapped
# in a string literal, so it is not executed.
'''
from sklearn.svm import LinearSVC

lsvc = LinearSVC()
lsvc.fit(X_train,y_train)

y_predict = lsvc.predict(X_test)

print(lsvc.score(X_test,y_test))

from sklearn.metrics import classification_report

print(classification_report(y_test,y_predict))
'''

'\nfrom sklearn.svm import LinearSVC\n\nlsvc = LinearSVC()\nlsvc.fit(X_train,y_train)\n\ny_predict = lsvc.predict(X_test)\n\nprint(lsvc.score(X_test,y_test))\n\nfrom sklearn.metrics import classification_report\n\nprint(classification_report(y_test,y_predict))\n'

In [19]:
# Disabled experiment: LinearSVC on the PCA-reduced features, wrapped in a string
# literal so it is not executed. It uses `lsvc`, `pca_X_train` and `pca_X_test`,
# which are only assigned inside the other disabled cells.
'''
lsvc.fit(pca_X_train,y_train)

pca_y_predict = lsvc.predict(pca_X_test)

print(lsvc.score(pca_X_test,y_test))

from sklearn.metrics import classification_report

print(classification_report(y_test,pca_y_predict))
'''

'\nlsvc.fit(pca_X_train,y_train)\n\npca_y_predict = lsvc.predict(pca_X_test)\n\nprint(lsvc.score(pca_X_test,y_test))\n\nfrom sklearn.metrics import classification_report\n\nprint(classification_report(y_test,pca_y_predict))\n'

6.2 GBDT

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Gradient-boosted trees with default hyper-parameters. The seed is fixed (same
# value as the train/test split) so that re-running the notebook trains the
# same model and reproduces the reported scores.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
gbc_y_predict = gbc.predict(X_test)

# Mean accuracy on the held-out split.
print(gbc.score(X_test, y_test))

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, gbc_y_predict))

0.543531810073898
              precision    recall  f1-score   support

           1       0.49      0.23      0.31      1231
           2       0.48      0.26      0.34      1299
           3       0.27      0.04      0.08       202
           4       0.35      0.13      0.19       284
           5       0.56      0.57      0.57      1078
           6       0.41      0.53      0.46      2227
           7       0.46      0.36      0.40      1589
           8       0.66      0.88      0.75      3863

    accuracy                           0.54     11773
   macro avg       0.46      0.37      0.39     11773
weighted avg       0.52      0.54      0.51     11773



In [21]:
# Disabled experiment: gradient boosting on the PCA-reduced features, wrapped in
# a string literal so it is not executed. It uses `pca_X_train` / `pca_X_test`,
# which are only assigned inside the disabled PCA cell.
'''
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(pca_X_train, y_train)
gbc_y_predict = gbc.predict(pca_X_test)

print(gbc.score(pca_X_test, y_test))

from sklearn.metrics import classification_report

print(classification_report(y_test, gbc_y_predict))
'''

'\nfrom sklearn.ensemble import GradientBoostingClassifier\ngbc = GradientBoostingClassifier()\ngbc.fit(pca_X_train, y_train)\ngbc_y_predict = gbc.predict(pca_X_test)\n\nprint(gbc.score(pca_X_test, y_test))\n\nfrom sklearn.metrics import classification_report\n\nprint(classification_report(y_test, gbc_y_predict))\n'