Prétraitement des données & entraînement des modèles

In [1]:
import numpy as np
import pandas as pd

# Load the training set from disk.
X_train = pd.read_csv('../data_input/train.csv')
# Load the rows to predict on.
# NOTE(review): presumably this file has no 'Response' column (the label is
# filled with -1 after the two frames are combined) -- confirm against the data.
X_predict = pd.read_csv('../data_input/predict.csv')

1. Traitement des données en double

In [2]:
# Remove exact duplicate rows from the training set
X_train = X_train.drop_duplicates()

# Show the (rows, columns) shape left after de-duplication
print(X_train.shape)

(58864, 129)


2. Resampling

In [3]:
from sklearn.utils import resample

# Split the training rows into the minority classes (3 and 4, the two rarest
# labels in the value_counts() printed below) and all the remaining classes.
MINORITY_CLASSES = [3, 4]
MAJORITY_CLASSES = [1, 2, 5, 6, 7, 8]

X_train_minority = X_train[X_train.Response.isin(MINORITY_CLASSES)]
X_train_majority = X_train[X_train.Response.isin(MAJORITY_CLASSES)]

# Class distribution before resampling
print(X_train.Response.value_counts())

# Oversample the minority rows with replacement. Classes 3 and 4 are pooled,
# so the 12000 drawn rows are shared between them in proportion to their
# original frequencies; the seed is fixed for reproducibility.
# NOTE(review): this oversampling happens before the train/test split done
# later in the notebook, so copies of the same minority row can land on both
# sides of that split and make the scores of classes 3 and 4 look better than
# they are. Consider splitting first and resampling only the training part.
X_train_minority_upsampled = resample(X_train_minority, replace=True,
                                      n_samples=12000,
                                      random_state=42)

# Recombine the untouched majority rows with the oversampled minority rows
X_train_upsampled = pd.concat([X_train_majority, X_train_minority_upsampled])

# Class distribution after resampling
print(X_train_upsampled.Response.value_counts())

X_train = X_train_upsampled

8    19318
6    11134
7     7944
2     6494
1     6157
5     5389
4     1419
3     1009
Name: Response, dtype: int64
8    19318
6    11134
7     7944
4     7062
2     6494
1     6157
5     5389
3     4938
Name: Response, dtype: int64


3. Combiner train et predict

In [4]:
# Stack the training rows on top of the rows to predict so that every
# preprocessing step below is applied to both in one pass.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
all_data = pd.concat([X_train, X_predict], sort=False)

# Rows without a label get the sentinel -1, then the column is cast back to
# int (the NaN values introduced by the concatenation force it to float).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection, which is deprecated chained
# assignment and has no effect under pandas copy-on-write.
all_data['Response'] = all_data['Response'].fillna(-1).astype(int)

4. Traitement des valeurs manquantes

In [5]:
def description(df):
    """Build a per-column summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in column order, with a default integer
        index and the columns:

        * ``Name``    -- the column label
        * ``dtypes``  -- the column dtype
        * ``Missing`` -- number of null values in the column
        * ``Uniques`` -- number of distinct values, nulls excluded
    """
    # The summary is assembled directly from arrays rather than through
    # reset_index(), so it does not depend on the columns index being unnamed
    # (reset_index() names the new column after the index, not 'index') and
    # never assigns into a slice of another frame.
    return pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.to_numpy(),
        'Missing': df.isnull().sum().to_numpy(),
        'Uniques': df.nunique().to_numpy(),
    })

In [6]:
des_all_data = description(all_data)
# Convert the absolute missing counts into a per-column missing ratio
des_all_data['Missing'] = des_all_data['Missing'] / all_data.shape[0]
print(des_all_data[des_all_data['Missing'] != 0].sort_values(by=['Missing'], ascending=False)[['Name', 'Missing']])

# Drop every feature whose missing ratio is at least 50%
cols = des_all_data[des_all_data['Missing'] >= 0.5]['Name']
print(cols)

all_data = all_data.drop(cols, axis=1)

# For the remaining features, replace NaN with the column median.
# numeric_only=True is required because the selection also contains string
# columns: since pandas 2.0, median() raises TypeError on those instead of
# silently skipping them. Non-numeric columns are therefore left untouched.
# NOTE(review): the medians are computed on training and prediction rows
# together (and after oversampling) -- confirm this is intended.
cols = des_all_data[des_all_data['Missing'] < 0.5]['Name']
medians = all_data[cols].median(numeric_only=True)
all_data[cols] = all_data[cols].fillna(medians)

# Remaining null count per column
print(all_data.isnull().sum().values)

                   Name   Missing
48   Medical_History_10  0.991006
70   Medical_History_32  0.976224
62   Medical_History_24  0.934417
24        InsuredInfo_8  0.881006
25        InsuredInfo_9  0.747911
38        Family_Hist_5  0.712356
53   Medical_History_15  0.661759
36        Family_Hist_3  0.587269
35        Family_Hist_2  0.472960
30  Insurance_History_5  0.441482
37        Family_Hist_4  0.315423
16    Employment_Info_6  0.189901
39    Medical_History_1  0.160381
14    Employment_Info_4  0.111872
11    Employment_Info_1  0.000276
24         InsuredInfo_8
25         InsuredInfo_9
38         Family_Hist_5
48    Medical_History_10
62    Medical_History_24
70    Medical_History_32
Name: Name, dtype: object
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0]


In [7]:
# Show the (rows, columns) shape of the combined frame at this point
print(all_data.shape)

(68936, 123)


5. Création de nouvelles variables

In [8]:
# Columns whose name starts with 'Medical_Keyword_'; they are summed per row
# below to produce a single count feature.
med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]

# Derived features:
#   Product_Info_2_char / _num -- first and second character of Product_Info_2
#   BMI_Age                    -- product of BMI and Ins_Age
#   Med_Keywords_Count         -- row-wise sum over the Medical_Keyword_* columns
all_data = all_data.assign(
    Product_Info_2_char=all_data['Product_Info_2'].str[0],
    Product_Info_2_num=all_data['Product_Info_2'].str[1],
    BMI_Age=all_data['BMI'] * all_data['Ins_Age'],
    Med_Keywords_Count=all_data[med_keyword_columns].sum(axis=1),
)

6. Standardisation des données

In [9]:
# Candidate discrete variables listed by the original author:
#   Medical_History_1, Medical_History_10, Medical_History_15,
#   Medical_History_24, Medical_History_32
# Only 'Medical_History_1' is standardized here.

from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D input, hence the reshape to a single column
mh1_values = all_data['Medical_History_1'].values.reshape(-1, 1)

ss = StandardScaler()
# fit() returns the scaler itself, so mh1_scale_param is the fitted `ss`.
# The scaler is fitted once and then applied; the earlier version fitted it a
# second time and passed the fitted estimator as the (ignored) `y` argument
# of fit_transform.
mh1_scale_param = ss.fit(mh1_values)
all_data['Medical_History_1'] = ss.transform(mh1_values).ravel()


7. Numérisation des données

In [10]:
# Overall dtype / memory overview of the combined frame
all_data.info()

# List the columns that are still stored as 'object' and therefore still
# need to be converted to numbers.
des_all_data = description(all_data)
is_object_column = des_all_data['dtypes'] == 'object'
print(des_all_data.loc[is_object_column])


<class 'pandas.core.frame.DataFrame'>
Int64Index: 68936 entries, 0 to 499
Columns: 127 entries, Product_Info_1 to Med_Keywords_Count
dtypes: float64(15), int64(108), object(4)
memory usage: 67.3+ MB
                    Name  dtypes  Missing  Uniques
1         Product_Info_2  object        0       19
23         InsuredInfo_7  object        0        2
123  Product_Info_2_char  object        0        5
124   Product_Info_2_num  object        0        8


In [11]:
# Turn the string columns into integers with label encoding: each distinct
# value of a column is mapped to one integer code (this is not one-hot
# encoding -- no extra columns are created).

from sklearn.preprocessing import LabelEncoder

label_encode = LabelEncoder()
# The same encoder object is re-fitted for every column, so after the loop it
# only holds the classes of the last column.
for column_name in ('Product_Info_2', 'InsuredInfo_7',
                    'Product_Info_2_char', 'Product_Info_2_num'):
    all_data[column_name] = label_encode.fit_transform(all_data[column_name])


8. Réduction de dimension par ACP (PCA)

In [12]:
# from sklearn.decomposition import PCA

# pca_all_data = all_data
# y_pca = pca_all_data['Response']
# del pca_all_data['Response']

# estimator = PCA(n_components='mle')
# pca_all_data = estimator.fit_transform(pca_all_data)

# print(estimator.n_components_)

# print(estimator.explained_variance_ratio_)

# pca_all_data = np.insert(pca_all_data, 125, values=y_pca, axis=1)

# all_data = pd.DataFrame(pca_all_data).rename(columns={125: 'Response'})

9. Sélection des caractéristiques

In [13]:
from sklearn import feature_selection

# Remove every feature whose variance is below 0.005
sele = feature_selection.VarianceThreshold(threshold=0.005)

# Separate the label from the features. drop() returns a new frame, so
# all_data keeps its 'Response' column until it is rebuilt below.
y_pca = all_data['Response']
features = all_data.drop(columns=['Response'])

fs_all_data = sele.fit_transform(features)

# Rebuild the frame from the surviving features, keeping their names, and put
# the label back as the last column. get_support() gives the boolean mask of
# kept features, so nothing depends on a hard-coded column count (the earlier
# np.insert(..., 121, ...) raised as soon as fewer than 121 features
# survived). to_numpy() attaches the labels by position, since the new frame
# has a fresh integer index, and keeps them as integers instead of casting
# them to float.
kept_columns = features.columns[sele.get_support()]
all_data = pd.DataFrame(fs_all_data, columns=kept_columns)
all_data['Response'] = y_pca.to_numpy()

10. Partitionnement de l'ensemble de données

In [14]:
from sklearn.model_selection import train_test_split

all_data = pd.DataFrame(all_data)

# Separate the labelled rows (Response > 0) from the rows to predict
# (Response < 0, i.e. the -1 sentinel).
has_label = all_data['Response'] > 0
train = all_data[has_label].copy()
predict = all_data[all_data['Response'] < 0].copy()

# pop() returns the label column and removes it from `train`, so
# `digits_data` (the same object as `train`) holds only the features.
digits_target = train.pop('Response')
digits_data = train

# The label distribution is not uniform, so the hold-out split is stratified
# on the label: 75% train / 25% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    digits_data,
    digits_target,
    test_size=0.25,
    random_state=42,
    stratify=digits_target,
)

11. Entraînement des modèles

LinearSVC

In [15]:
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

# Fixed seed so the fitted model, and therefore the scores below, are the
# same on every run.
lsvc = LinearSVC(random_state=42)
lsvc.fit(X_train, y_train)

y_predict = lsvc.predict(X_test)

# Mean accuracy on the hold-out set
print(lsvc.score(X_test, y_test))

# Per-class precision / recall / F1 on the hold-out set
print(classification_report(y_test, y_predict))


0.43380676836752585
              precision    recall  f1-score   support

         1.0       0.39      0.13      0.19      1539
         2.0       0.25      0.49      0.33      1624
         3.0       0.47      0.29      0.36      1234
         4.0       0.40      0.75      0.52      1766
         5.0       0.39      0.01      0.02      1347
         6.0       0.32      0.01      0.02      2784
         7.0       0.29      0.33      0.31      1986
         8.0       0.58      0.84      0.68      4829

    accuracy                           0.43     17109
   macro avg       0.39      0.36      0.30     17109
weighted avg       0.41      0.43      0.36     17109





RandomForest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fixed seed so the bootstrap samples and feature sub-sampling, and therefore
# the scores below, are the same on every run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)

# Mean accuracy on the hold-out set
print(rfc.score(X_test, y_test))

# Per-class precision / recall / F1 on the hold-out set
print(classification_report(y_test, rfc_y_predict))



0.5893389444152201
              precision    recall  f1-score   support

         1.0       0.29      0.22      0.25      1539
         2.0       0.30      0.23      0.26      1624
         3.0       0.92      0.99      0.95      1234
         4.0       0.93      0.98      0.95      1766
         5.0       0.52      0.46      0.49      1347
         6.0       0.42      0.45      0.43      2784
         7.0       0.39      0.29      0.33      1986
         8.0       0.68      0.82      0.74      4829

    accuracy                           0.59     17109
   macro avg       0.56      0.56      0.55     17109
weighted avg       0.56      0.59      0.57     17109



GBDT

In [17]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Fixed seed so the fitted model, and therefore the scores below, are the
# same on every run.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
gbc_y_predict = gbc.predict(X_test)

# Mean accuracy on the hold-out set
print(gbc.score(X_test, y_test))

# Per-class precision / recall / F1 on the hold-out set
print(classification_report(y_test, gbc_y_predict))

0.5933719095213046
              precision    recall  f1-score   support

         1.0       0.46      0.19      0.27      1539
         2.0       0.47      0.24      0.32      1624
         3.0       0.82      0.55      0.66      1234
         4.0       0.74      0.78      0.76      1766
         5.0       0.58      0.55      0.57      1347
         6.0       0.48      0.53      0.50      2784
         7.0       0.45      0.41      0.43      1986
         8.0       0.65      0.90      0.75      4829

    accuracy                           0.59     17109
   macro avg       0.58      0.52      0.53     17109
weighted avg       0.58      0.59      0.57     17109

