Data Preprocessing & Model Training

1. Missing value processing

In [108]:
import numpy as np
import pandas as pd

In [109]:
# Load train.csv and predict.csv into DataFrames (paths are relative to the current working directory).
X_train = pd.read_csv('../data_input/train.csv')
X_predict = pd.read_csv('../data_input/predict.csv')

In [110]:
# Split the target off the feature frame: pop() returns the 'Response'
# column and removes it from X_train in a single step.
y_train = X_train.pop('Response')

In [111]:
def description(df):
    """Build a per-column summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order, default integer index)
        with the fields:

        * ``Name``    - the column label
        * ``dtypes``  - the column dtype
        * ``Missing`` - number of null entries in the column
        * ``Uniques`` - number of distinct values, nulls not counted
    """
    # dtypes is a Series indexed by column label; reset_index() moves those
    # labels into a regular column called 'index', which is renamed to 'Name'.
    dtype_frame = df.dtypes.to_frame(name='dtypes').reset_index()
    summary = dtype_frame.rename(columns={'index': 'Name'})[['Name', 'dtypes']]
    # .values drops the label index so the counts are assigned positionally.
    return summary.assign(
        Missing=df.isnull().sum().values,
        Uniques=df.nunique().values,
    )

In [112]:
# Turn the raw null counts into a missing ratio per column and list the
# affected columns, worst first.
des_train = description(X_train)
des_train['Missing'] = des_train['Missing'] / len(X_train)
print(des_train.loc[des_train['Missing'] != 0, ['Name', 'Missing']].sort_values(by='Missing', ascending=False))

# Features whose missing ratio is >= 70% are dropped
cols = des_train.loc[des_train['Missing'] >= 0.7, 'Name']
print(cols)

X_train = X_train.drop(columns=cols)

                   Name   Missing
48   Medical_History_10  0.990642
70   Medical_History_32  0.981318
62   Medical_History_24  0.935939
24        InsuredInfo_8  0.879197
53   Medical_History_15  0.750972
25        InsuredInfo_9  0.749410
38        Family_Hist_5  0.703996
36        Family_Hist_3  0.576621
35        Family_Hist_2  0.482702
30  Insurance_History_5  0.427642
37        Family_Hist_4  0.323177
16    Employment_Info_6  0.182623
39    Medical_History_1  0.149607
14    Employment_Info_4  0.114078
11    Employment_Info_1  0.000323
24         InsuredInfo_8
25         InsuredInfo_9
38         Family_Hist_5
48    Medical_History_10
53    Medical_History_15
62    Medical_History_24
70    Medical_History_32
Name: Name, dtype: object


In [113]:
# Shape of the training features after the high-missing-ratio columns were dropped
print(X_train.shape)

(58881, 121)


In [114]:
# For the remaining features (missing ratio < 70%), replace NaN with the column mean.
# `cols` also contains the object-typed columns (Product_Info_2, InsuredInfo_7), for
# which a mean is undefined: pandas >= 2.0 raises TypeError on them unless
# numeric_only=True is passed (older versions dropped them silently with a warning).
# Those columns have no missing values, so skipping them loses nothing.
cols = des_train[des_train['Missing']<0.7]['Name']
X_train[cols] = X_train[cols].fillna(X_train[cols].mean(numeric_only=True))

In [115]:
# Sanity check: remaining null count per column after imputation
print(X_train.isna().sum().to_numpy())

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


In [116]:
# Report the missing ratio per column of the prediction set (diagnostic only).
des_predict = description(X_predict)

des_predict['Missing'] = des_predict['Missing']/X_predict.shape[0]
print(des_predict[des_predict['Missing']!=0].sort_values(by=['Missing'],ascending=False)[['Name','Missing']])

# Drop the features that were removed from the TRAINING set (ratios kept in des_train),
# rather than re-deriving the list from this small sample. Several ratios sit close to
# the 0.7 threshold, so a per-dataset decision could leave X_predict with a different
# feature set than the one the encoder/scaler are fitted on.
cols = des_train[des_train['Missing']>=0.7]['Name']
print (cols)

# errors='ignore': a training-dropped column that is absent here needs no action.
X_predict = X_predict.drop(columns=cols, errors='ignore')

                   Name  Missing
48   Medical_History_10    0.988
70   Medical_History_32    0.986
62   Medical_History_24    0.942
24        InsuredInfo_8    0.902
53   Medical_History_15    0.756
25        InsuredInfo_9    0.742
38        Family_Hist_5    0.718
36        Family_Hist_3    0.578
35        Family_Hist_2    0.468
30  Insurance_History_5    0.432
37        Family_Hist_4    0.310
16    Employment_Info_6    0.202
39    Medical_History_1    0.160
14    Employment_Info_4    0.124
24         InsuredInfo_8
25         InsuredInfo_9
38         Family_Hist_5
48    Medical_History_10
53    Medical_History_15
62    Medical_History_24
70    Medical_History_32
Name: Name, dtype: object


In [117]:
print(X_predict.shape)
# Same column count as X_train: the identical set of features was dropped from both frames.

(500, 121)


In [118]:
# For the remaining features, replace NaN with the TRAINING-set column means.
# Imputation statistics must come from the training data only, exactly like the
# DictVectorizer and StandardScaler below, which are fitted on X_train and merely
# applied to X_predict. X_train is still a DataFrame at this point and is already
# mean-imputed, which leaves its column means unchanged.
# numeric_only=True skips the object-typed columns, whose mean is undefined
# (pandas >= 2.0 raises TypeError otherwise).
cols = X_predict.columns.intersection(X_train.columns)
X_predict[cols] = X_predict[cols].fillna(X_train[cols].mean(numeric_only=True))

In [119]:
# Sanity check: remaining null count per column after imputation
print(X_predict.isna().sum().to_numpy())

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


2. Abnormal value processing (duplicate rows)

In [120]:
# Remove duplicate rows
X_train=X_train.drop_duplicates()
# The labels were split off before this step, so they must be filtered to the
# surviving rows; otherwise X_train and y_train end up with different lengths
# and the row/label pairing is lost once X_train becomes a plain array.
y_train = y_train.loc[X_train.index]
assert len(X_train) == len(y_train), "features and labels are misaligned"
print(X_train.shape)

# NOTE(review): this also removes duplicated rows from the set to predict on,
# so those records get no prediction of their own - confirm this is intended.
X_predict=X_predict.drop_duplicates()
print(X_predict.shape)

(58849, 121)
(500, 121)


3. Feature encoding and standardization

In [123]:
# Overview of the training frame, followed by the object-typed columns
# (the ones that still hold non-numeric values).
X_train.info()

des = description(X_train)
print(des.loc[des['dtypes'] == 'object'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58849 entries, 0 to 58880
Columns: 121 entries, Product_Info_1 to Medical_Keyword_48
dtypes: float64(13), int64(106), object(2)
memory usage: 54.8+ MB
              Name  dtypes  Missing  Uniques
1   Product_Info_2  object        0       19
23   InsuredInfo_7  object        0        2


In [125]:
# Feature encoding: DictVectorizer one-hot encodes the string-valued features
# and passes numeric features through unchanged.
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)

# orient must be spelled 'records': the abbreviation 'record' was deprecated in
# pandas 1.x and is rejected with a ValueError from pandas 2.0 on.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))

In [126]:
# Overview of the prediction frame, followed by the object-typed columns
# (the ones that still hold non-numeric values).
X_predict.info()

des = description(X_predict)
print(des.loc[des['dtypes'] == 'object'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Columns: 121 entries, Product_Info_1 to Medical_Keyword_48
dtypes: float64(13), int64(106), object(2)
memory usage: 476.6+ KB
              Name  dtypes  Missing  Uniques
1   Product_Info_2  object        0       16
23   InsuredInfo_7  object        0        2


In [127]:
# Encode with the vectorizer fitted on the training set so both matrices share
# the same columns. orient must be 'records' ('record' raises ValueError on pandas >= 2.0).
X_predict = vec.transform(X_predict.to_dict(orient='records'))

In [128]:
# Data standardization: learn the per-feature scaling on the training matrix
# only, then apply that same scaling to both matrices.

from sklearn.preprocessing import StandardScaler

ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_predict = ss.transform(X_predict)