Data Preprocessing & Model Training

1. Missing value processing

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the labelled training rows and the rows that still need a prediction.
X_train, X_predict = (
    pd.read_csv(csv_path)
    for csv_path in ('../data_input/train.csv', '../data_input/predict.csv')
)

In [3]:
# Separate the labels: pop() returns the 'Response' column and removes it
# from the feature frame in a single step.
y_train = X_train.pop('Response')

In [4]:
def description(df):
    """Summarise every column of ``df`` in one row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order, default integer index)
        with the columns:

        * ``Name``    - the column label
        * ``dtypes``  - the column dtype
        * ``Missing`` - number of null values in the column
        * ``Uniques`` - number of distinct values, excluding nulls
    """
    # Build the summary straight from df.columns instead of going through
    # reset_index(): that route only works while the generated column is
    # literally called 'index', which is not the case when df.columns is named.
    return pd.DataFrame({
        'Name': df.columns.tolist(),
        'dtypes': df.dtypes.values,
        # Count the number of missing values in each column
        'Missing': df.isnull().sum().values,
        # Count the number of different values in each column, excluding null values
        'Uniques': df.nunique().values,
    })

In [5]:
# Turn the per-column missing counts into a fraction of the training rows.
des_train = description(X_train)
des_train['Missing'] = des_train['Missing'].div(X_train.shape[0])

# Report only the columns that actually have gaps, worst first.
print(
    des_train.loc[des_train['Missing'] != 0, ['Name', 'Missing']]
    .sort_values(by='Missing', ascending=False)
)

# Features with a missing rate of 70% or more are dropped.
cols = des_train.loc[des_train['Missing'] >= 0.7, 'Name']
print(cols)

X_train = X_train.drop(columns=cols)

                   Name   Missing
48   Medical_History_10  0.990642
70   Medical_History_32  0.981318
62   Medical_History_24  0.935939
24        InsuredInfo_8  0.879197
53   Medical_History_15  0.750972
25        InsuredInfo_9  0.749410
38        Family_Hist_5  0.703996
36        Family_Hist_3  0.576621
35        Family_Hist_2  0.482702
30  Insurance_History_5  0.427642
37        Family_Hist_4  0.323177
16    Employment_Info_6  0.182623
39    Medical_History_1  0.149607
14    Employment_Info_4  0.114078
11    Employment_Info_1  0.000323
24         InsuredInfo_8
25         InsuredInfo_9
38         Family_Hist_5
48    Medical_History_10
53    Medical_History_15
62    Medical_History_24
70    Medical_History_32
Name: Name, dtype: object


In [6]:
# Show (rows, columns) of the training frame at this point in the notebook.
print(X_train.shape)

(58881, 121)


In [7]:
# For the remaining features (missing rate below 70%), replace NaN with the
# column mean.
# numeric_only=True is needed because `cols` also contains the text columns:
# without it DataFrame.mean() raises TypeError on pandas >= 2.0. Text columns
# get no mean and are therefore left untouched by fillna.
cols = des_train[des_train['Missing']<0.7]['Name']
X_train[cols] = X_train[cols].fillna(X_train[cols].mean(numeric_only=True))

In [8]:
# Per-column count of remaining nulls in the training frame.
print(X_train.isnull().sum().values)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


In [9]:
# Same missing-rate report as for the training data, now for the rows to score.
des_predict = description(X_predict)
des_predict['Missing'] = des_predict['Missing'].div(X_predict.shape[0])

# Report only the columns that actually have gaps, worst first.
print(
    des_predict.loc[des_predict['Missing'] != 0, ['Name', 'Missing']]
    .sort_values(by='Missing', ascending=False)
)

# Features with a missing rate of 70% or more are dropped.
cols = des_predict.loc[des_predict['Missing'] >= 0.7, 'Name']
print(cols)

X_predict = X_predict.drop(columns=cols)

                   Name  Missing
48   Medical_History_10    0.988
70   Medical_History_32    0.986
62   Medical_History_24    0.942
24        InsuredInfo_8    0.902
53   Medical_History_15    0.756
25        InsuredInfo_9    0.742
38        Family_Hist_5    0.718
36        Family_Hist_3    0.578
35        Family_Hist_2    0.468
30  Insurance_History_5    0.432
37        Family_Hist_4    0.310
16    Employment_Info_6    0.202
39    Medical_History_1    0.160
14    Employment_Info_4    0.124
24         InsuredInfo_8
25         InsuredInfo_9
38         Family_Hist_5
48    Medical_History_10
53    Medical_History_15
62    Medical_History_24
70    Medical_History_32
Name: Name, dtype: object


In [10]:
print(X_predict.shape)
# The column count matches the training frame: the 70% rule dropped the same
# features from both data sets.

(500, 121)


In [11]:
# For the remaining features (missing rate below 70%), replace NaN with a
# column mean.
# The fill values come from the training data, so the rows to score are
# prepared with training statistics only. Any column the training frame
# cannot supply a mean for falls back to the prediction set's own mean.
# numeric_only=True is needed because the frames also contain text columns:
# without it DataFrame.mean() raises TypeError on pandas >= 2.0.
cols = des_predict[des_predict['Missing']<0.7]['Name']
train_means = X_train.mean(numeric_only=True)
X_predict[cols] = (
    X_predict[cols]
    .fillna(train_means)
    .fillna(X_predict[cols].mean(numeric_only=True))
)

In [12]:
# Per-column count of remaining nulls in the prediction frame.
print(X_predict.isnull().sum().values)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


2. Abnormal value (outlier) processing

In [13]:
# Remove duplicate rows (currently disabled)
# X_train=X_train.drop_duplicates()
# print(X_train.shape)

# X_predict=X_predict.drop_duplicates()
# print(X_predict.shape)

3. Data normalization and standardization

In [14]:
# Overall dtype / memory summary of the training frame.
X_train.info()

# List the text (object) columns, which still need to be encoded numerically.
des = description(X_train)
print(des.loc[des['dtypes'].eq('object')])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58881 entries, 0 to 58880
Columns: 121 entries, Product_Info_1 to Medical_Keyword_48
dtypes: float64(13), int64(106), object(2)
memory usage: 54.4+ MB
              Name  dtypes  Missing  Uniques
1   Product_Info_2  object        0       19
23   InsuredInfo_7  object        0        2


In [15]:
# Feature digitization: one-hot encoding.
# Each row is turned into a {column: value} dict; DictVectorizer then produces
# a dense numeric matrix from those dicts.
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)

# 'records' is the full orient name. The 'record' abbreviation was deprecated
# in pandas 1.x and is rejected with a ValueError by pandas >= 2.0.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))

In [16]:
# Overall dtype / memory summary of the prediction frame.
X_predict.info()

# List the text (object) columns, which still need to be encoded numerically.
des = description(X_predict)
print(des.loc[des['dtypes'].eq('object')])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Columns: 121 entries, Product_Info_1 to Medical_Keyword_48
dtypes: float64(13), int64(106), object(2)
memory usage: 472.8+ KB
              Name  dtypes  Missing  Uniques
1   Product_Info_2  object        0       16
23   InsuredInfo_7  object        0        2


In [17]:
# Encode the prediction rows with the vectorizer already fitted on the
# training data, so both matrices share the same feature columns.
# 'records' is the full orient name. The 'record' abbreviation was deprecated
# in pandas 1.x and is rejected with a ValueError by pandas >= 2.0.
X_predict = vec.transform(X_predict.to_dict(orient='records'))

In [18]:
# Data standardization: learn the scaling on the training matrix, then apply
# that same scaling to both the training and the prediction matrices.

from sklearn.preprocessing import StandardScaler

ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_predict = ss.transform(X_predict)

Model Training

In [20]:
# Split data set: hold out 20% of the labelled rows for evaluation.
from sklearn.model_selection import train_test_split

# Keep handles on the full feature matrix and label vector before X_train and
# y_train are rebound to the training portion of the split.
digits_data, digits_target = X_train, y_train

X_train, X_test, y_train, y_test = train_test_split(
    digits_data,
    digits_target,
    test_size=0.2,
    random_state=42,
)

In [21]:
# PCA dimensionality reduction: project the features onto 20 components.
from sklearn.decomposition import PCA

# random_state pins the result: depending on the scikit-learn version, the
# default 'auto' solver may choose randomized SVD for a matrix of this size,
# which otherwise gives slightly different components on every run. The seed
# matches the one used for train_test_split.
estimator = PCA(n_components=20, random_state=42)
# Fit the projection on the training split only, then reuse it for the held-out split.
pca_X_train = estimator.fit_transform(X_train)
pca_X_test = estimator.transform(X_test)


In [22]:
# LinearSVC trained on the full (standardized) feature set.
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

# LinearSVC's solver uses a random number generator while fitting, so an
# unseeded model can score slightly differently on every run. The seed matches
# the one used for train_test_split.
lsvc = LinearSVC(random_state=42)
lsvc.fit(X_train,y_train)

y_predict = lsvc.predict(X_test)

# Mean accuracy on the held-out split, followed by per-class metrics.
print(lsvc.score(X_test,y_test))

print(classification_report(y_test,y_predict))

0.4821261781438397
              precision    recall  f1-score   support

           1       0.40      0.22      0.28      1241
           2       0.37      0.22      0.27      1336
           3       0.00      0.00      0.00       206
           4       0.11      0.01      0.01       279
           5       0.46      0.31      0.37      1060
           6       0.36      0.41      0.39      2186
           7       0.37      0.22      0.28      1575
           8       0.57      0.91      0.70      3894

    accuracy                           0.48     11777
   macro avg       0.33      0.29      0.29     11777
weighted avg       0.43      0.48      0.44     11777



  'precision', 'predicted', average, warn_for)


In [23]:
# Refit the same LinearSVC instance on the PCA-reduced features.
# Note: this overwrites the model fitted on the full feature set, so from here
# on `lsvc` expects PCA-transformed input.
from sklearn.metrics import classification_report

y_predict = lsvc.fit(pca_X_train, y_train).predict(pca_X_test)

# Mean accuracy on the held-out split, followed by per-class metrics.
print(lsvc.score(pca_X_test, y_test))

print(classification_report(y_test, y_predict))

0.42311284707480684
              precision    recall  f1-score   support

           1       0.34      0.22      0.26      1241
           2       0.35      0.20      0.25      1336
           3       0.00      0.00      0.00       206
           4       0.00      0.00      0.00       279
           5       0.32      0.05      0.09      1060
           6       0.27      0.34      0.30      2186
           7       0.33      0.03      0.06      1575
           8       0.50      0.93      0.65      3894

    accuracy                           0.42     11777
   macro avg       0.26      0.22      0.20     11777
weighted avg       0.36      0.42      0.34     11777



  'precision', 'predicted', average, warn_for)
