In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings(action='once')

In [2]:
# Read the raw train and test splits; the generator yields them in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw_data/train.csv', '../data/raw_data/test.csv')
)
# Show (rows, columns) for both frames as the cell output.
(train.shape, test.shape)

((260753, 299), (173836, 298))

In [3]:
# Derive a weekday-name feature from the quote date.
#
# The source column `Original_Quote_Date` is deliberately left untouched so this
# cell can be re-run safely: overwriting it with integer weekday codes would make
# a second run parse those integers as epoch offsets and yield the same weekday
# for every row.

# Maps the integer codes returned by `Series.dt.dayofweek` to English day names.
transform_num_day_of_week = {0: 'Monday',
                             1: 'Tuesday',
                             2: 'Wednesday',
                             3: 'Thursday',
                             4: 'Friday',
                             5: 'Saturday',
                             6: 'Sunday'}

for frame in (train, test):
    # errors='coerce' turns unparseable dates into NaT; those rows get NaN in the
    # new column because NaN has no entry in the mapping.
    quote_weekday = pd.to_datetime(frame['Original_Quote_Date'], errors='coerce').dt.dayofweek
    frame['Original_Quote_day_of_week'] = quote_weekday.map(transform_num_day_of_week)

In [4]:
# Impute PersonalField84 and PropertyField29 with a new class (999) in both frames.
#
# The result is assigned back to the column rather than calling
# `frame[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# which pandas warns about and which does not update the frame under Copy-on-Write.
for frame in (train, test):
    for col in ('PersonalField84', 'PropertyField29'):
        frame[col] = frame[col].fillna(999)

In [5]:
# Columns removed from both frames before the remaining columns are imputed and encoded.
col_to_drop = ['Original_Quote_Date','QuoteNumber','SalesField8']
train.drop(columns=col_to_drop, inplace=True)
test.drop(columns=col_to_drop, inplace=True)
# Show the shapes after the drop as the cell output.
(train.shape, test.shape)

((260753, 297), (173836, 296))

In [6]:
# Impute remaining missing values with the most frequent value of the column in
# `train`; the same value is applied to `test` so both frames share one fill value.
for col in test.columns:
    if train[col].isnull().any() or test[col].isnull().any():
        # value_counts() sorts by frequency, so the first index entry is the mode.
        mode = train[col].value_counts().index[0]
        # Assign back instead of `fillna(..., inplace=True)` on the selected column:
        # the in-place form works on a temporary Series and does not update the
        # frame under pandas Copy-on-Write.
        train[col] = train[col].fillna(mode)
        test[col] = test[col].fillna(mode)

In [7]:
# try target encoding in this iteration!
from category_encoders import TargetEncoder

# Separate the label from the predictors; the test frame is used as-is.
y_train = train['QuoteConversion_Flag']
X_train = train.drop(columns='QuoteConversion_Flag')
X_test = test

# Every column present in `test` is target-encoded.
# NOTE(review): the encoder is fit on the full training set and then applied to
# those same rows — confirm this is acceptable with respect to target leakage.
te = TargetEncoder(cols=list(test.columns))
te.fit(X_train, y_train)
X_train_encoded = te.transform(X_train)
X_test_encoded = te.transform(X_test)

  return f(*args, **kwds)


In [8]:
# Re-attach the label as the last column of the encoded training predictors.
new_test = X_test_encoded
new_train = pd.concat(objs=[X_train_encoded, y_train], axis='columns')

In [12]:
# Display (rows, columns) of the encoded train and test frames.
tuple(encoded.shape for encoded in (new_train, new_test))

((260753, 297), (173836, 296))

In [9]:
# Persist the encoded frames without the index column.
for encoded, out_path in (
    (new_train, '../data/featured_data/train.csv'),
    (new_test, '../data/featured_data/test.csv'),
):
    pd.DataFrame(encoded).to_csv(out_path, index=False)