Prétraitement des données & formation sur les modèles

In [1]:
import numpy as np
import pandas as pd

X_train = pd.read_csv('../data_input/train.csv')
X_predict = pd.read_csv('../data_input/predict.csv')

# combiner train et predict
all_data = X_train.append(X_predict,sort=None)

all_data['Response'].fillna(-1, inplace=True)

# corrige le dtype sur la colonne label
all_data['Response'] = all_data['Response'].astype(int)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


1. Traitement les données en double

In [2]:
# Supprimer les lignes en double
all_data = all_data.drop_duplicates()

print(all_data.shape)

(59364, 129)


2. Traitement de la valeur manquante

In [3]:
def description(df):
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    # Comptez le nombre de valeurs manquantes dans chaque colonne
    summary['Missing'] = df.isnull().sum().values    
    # Comptez le nombre de valeurs différentes dans chaque colonne, à l'exclusion des valeurs nulles
    summary['Uniques'] = df.nunique().values
    return summary

In [4]:
des_all_data = description(all_data)
des_all_data['Missing'] = des_all_data['Missing']/all_data.shape[0]
print(des_all_data[des_all_data['Missing']!=0].sort_values(by=['Missing'],ascending=False)[['Name','Missing']])

# si le taux manquant > 70%, nous supprimons cette caractère
cols = des_all_data[des_all_data['Missing'] >= 0.7]['Name']
print(cols)

all_data = all_data.drop(cols, axis=1)

# pour l'autre caractères manquante, nous remplaçons NaN par la valeur de Moyenne / Median
cols = des_all_data[des_all_data['Missing'] < 0.7]['Name']

# all_data[cols] = all_data[cols].fillna(all_data[cols].mean())
all_data[cols] = all_data[cols].fillna(all_data[cols].median())

print(all_data.isnull().sum().values)

                   Name   Missing
32   Medical_History_10  0.990617
56   Medical_History_32  0.981352
47   Medical_History_24  0.936005
29        InsuredInfo_8  0.879354
37   Medical_History_15  0.750960
30        InsuredInfo_9  0.749293
11        Family_Hist_5  0.704097
9         Family_Hist_3  0.576663
8         Family_Hist_2  0.482565
18  Insurance_History_5  0.427751
10        Family_Hist_4  0.323091
6     Employment_Info_6  0.182821
31    Medical_History_1  0.149737
4     Employment_Info_4  0.114160
1     Employment_Info_1  0.000320
11         Family_Hist_5
29         InsuredInfo_8
30         InsuredInfo_9
32    Medical_History_10
37    Medical_History_15
47    Medical_History_24
56    Medical_History_32
Name: Name, dtype: object
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]


In [5]:
print(all_data.shape)

(59364, 122)


3. create any new variables

In [6]:
all_data['Product_Info_2_char'] = all_data.Product_Info_2.str[0]
all_data['Product_Info_2_num'] = all_data.Product_Info_2.str[1]
all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']
med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]
all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)

4. Numérisation des données

In [7]:
all_data.info()

des_all_data = description(all_data)
print(des_all_data[des_all_data['dtypes']=='object'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59364 entries, 0 to 499
Columns: 126 entries, BMI to Med_Keywords_Count
dtypes: float64(14), int64(108), object(4)
memory usage: 57.5+ MB
                    Name  dtypes  Missing  Uniques
27         InsuredInfo_7  object        0        2
114       Product_Info_2  object        0       19
122  Product_Info_2_char  object        0        5
123   Product_Info_2_num  object        0        8


In [8]:
# Numérisation des caractères : one-hot code

from sklearn.preprocessing import LabelEncoder

label_encode = LabelEncoder()
all_data['Product_Info_2'] = label_encode.fit_transform(all_data['Product_Info_2'])
all_data['InsuredInfo_7'] = label_encode.fit_transform(all_data['InsuredInfo_7'])
all_data['Product_Info_2_char'] = label_encode.fit_transform(all_data['Product_Info_2_char'])
all_data['Product_Info_2_num'] = label_encode.fit_transform(all_data['Product_Info_2_num'])

from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder(categorical_features=[27, 114, 122, 123], sparse=False)
all_data = one_hot.fit_transform(all_data)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


5. PCA dimensionality reduction

In [9]:
# from sklearn.decomposition import PCA

# estimator = PCA(n_components='mle')
# pca_all_data = estimator.fit_transform(all_data)

# print(estimator.n_components_)

# print(estimator.explained_variance_ratio_)

6. Sélection des caractères

In [10]:
# from sklearn import feature_selection

# # si le variance < 0.005 , nous supprimons cette caractère
# sele = feature_selection.VarianceThreshold(threshold=0.005)

# X_train = sele.fit_transform(X_train)

7. Standardisation des données

In [11]:
# Standardization des données

# from sklearn.preprocessing import StandardScaler

# ss= StandardScaler()
# X_train = ss.fit_transform(X_train)
# X_predict = ss.fit_transform(X_predict)


In [12]:
all_data = pd.DataFrame(all_data)

# split train and test
train = all_data[all_data[152] > 0].copy()
predict = all_data[all_data[152] < 0].copy()

digits_data = train
digits_target = train[152]
del digits_data[152]
del predict[152]

7. Formation de modèle

In [13]:
# Split data set
# Parce que le type de distribution label n'est pas uniforme,
# j'ai utilisé l'échantillonnage stratifié


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(digits_data, digits_target, test_size=0.25,
                                                    random_state=42, stratify=digits_target)

5.1 LinearSVC

In [14]:
'''
from sklearn.svm import LinearSVC

lsvc = LinearSVC()
lsvc.fit(X_train,y_train)

y_predict = lsvc.predict(X_test)

print(lsvc.score(X_test,y_test))

from sklearn.metrics import classification_report

print(classification_report(y_test,y_predict))
'''

'\nfrom sklearn.svm import LinearSVC\n\nlsvc = LinearSVC()\nlsvc.fit(X_train,y_train)\n\ny_predict = lsvc.predict(X_test)\n\nprint(lsvc.score(X_test,y_test))\n\nfrom sklearn.metrics import classification_report\n\nprint(classification_report(y_test,y_predict))\n'

In [15]:
'''
lsvc.fit(pca_X_train,y_train)

pca_y_predict = lsvc.predict(pca_X_test)

print(lsvc.score(pca_X_test,y_test))

from sklearn.metrics import classification_report

print(classification_report(y_test,pca_y_predict))
'''

'\nlsvc.fit(pca_X_train,y_train)\n\npca_y_predict = lsvc.predict(pca_X_test)\n\nprint(lsvc.score(pca_X_test,y_test))\n\nfrom sklearn.metrics import classification_report\n\nprint(classification_report(y_test,pca_y_predict))\n'

5.2 GBDT

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_predict = gbc.predict(X_test)

print(gbc.score(X_test, y_test))

from sklearn.metrics import classification_report
 
print(classification_report(y_test, gbc_y_predict))

0.5447132372927426
              precision    recall  f1-score   support

         1.0       0.47      0.22      0.30      1539
         2.0       0.47      0.26      0.34      1624
         3.0       0.37      0.06      0.11       252
         4.0       0.35      0.11      0.17       355
         5.0       0.58      0.56      0.57      1347
         6.0       0.41      0.52      0.46      2783
         7.0       0.46      0.37      0.41      1986
         8.0       0.66      0.88      0.75      4830

    accuracy                           0.54     14716
   macro avg       0.47      0.37      0.39     14716
weighted avg       0.52      0.54      0.52     14716



In [17]:
'''
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(pca_X_train, y_train)
gbc_y_predict = gbc.predict(pca_X_test)

print(gbc.score(pca_X_test, y_test))

from sklearn.metrics import classification_report

print(classification_report(y_test, gbc_y_predict))
'''

'\nfrom sklearn.ensemble import GradientBoostingClassifier\ngbc = GradientBoostingClassifier()\ngbc.fit(pca_X_train, y_train)\ngbc_y_predict = gbc.predict(pca_X_test)\n\nprint(gbc.score(pca_X_test, y_test))\n\nfrom sklearn.metrics import classification_report\n\nprint(classification_report(y_test, gbc_y_predict))\n'