# <span style="color:orange;">1.</span> Imports

## <span style="color:orange;">1.1.</span>  Libraries


In [58]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler

## <span style="color:orange;">1.2.</span>  Load Data


In [59]:
# Read the interim training CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../src/data/interim/train_processed_01.csv')

# <span style="color:orange;">2.</span>  Split Into Training and Test Set

In [60]:
# Hold out 20% of the rows as a test set. Every column except 'response'
# is a feature; 'response' itself is the target. The fixed random_state
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='response'),
    df['response'],
    test_size=0.2,
    random_state=32,
)

# Train Test Split


In [61]:
# Show the dtype of every column in the training feature frame.
X_train.dtypes

gender                        object
age                            int64
driving_license                int64
region_code                  float64
previously_insured             int64
vehicle_age                   object
vehicle_damage                 int64
annual_premium               float64
policy_sales_channel         float64
vintage                        int64
hi_customer_profitability    float64
age_binned                    object
dtype: object

# <span style="color:orange;">3.</span> Data Preparation

## <span style="color:orange;">3.1.</span> Rescaling

In [62]:
def _fit_scale_and_save(scaler, column, train, test, pkl_path):
    """Fit ``scaler`` on one training column, scale both splits, and persist it.

    The scaler is fitted on ``train[[column]]`` only, then applied to both
    ``train`` and ``test``. The result is written in place to a new
    ``'<column>_scaled'`` column of each frame. The fitted scaler is pickled
    to ``pkl_path`` right after fitting, so the file always holds the
    parameters for this column even if the scaler object is refitted later.

    Parameters
    ----------
    scaler : object exposing ``fit_transform`` / ``transform``
    column : str
        Name of the column to scale.
    train, test : pandas.DataFrame
        Frames that receive the new scaled column (mutated in place).
    pkl_path : str
        Destination of the pickled, fitted scaler.
    """
    scaled_column = f'{column}_scaled'
    train.loc[:, scaled_column] = scaler.fit_transform(train[[column]])
    test.loc[:, scaled_column] = scaler.transform(test[[column]])
    # Context manager guarantees the file is flushed and closed.
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(scaler, pkl_file)


rs = RobustScaler()
mms = MinMaxScaler()

# NOTE: `mms` and `rs` are each reused for two columns. This is safe only
# because every fitted state is pickled before the next fit overwrites it.

# age - MinMaxScaler
_fit_scale_and_save(mms, 'age', X_train, X_test, '../src/parameters/age_scaler.pkl')

# annual_premium - RobustScaler
_fit_scale_and_save(rs, 'annual_premium', X_train, X_test, '../src/parameters/annual_premium_scaler.pkl')

# vintage - MinMaxScaler
_fit_scale_and_save(mms, 'vintage', X_train, X_test, '../src/parameters/vintage_scaler.pkl')

# hi_customer_profitability - RobustScaler
_fit_scale_and_save(rs, 'hi_customer_profitability', X_train, X_test, '../src/parameters/hi_customer_profitability_scaler.pkl')

## <span style="color:orange;">3.2.</span> Encoding

In [63]:
# gender - Binary Encoding (male -> 1, female -> 0).
# Any value outside the mapping becomes NaN.
X_train['gender'] = X_train['gender'].map({'male': 1, 'female': 0})
X_test['gender'] = X_test['gender'].map({'male': 1, 'female': 0})

# region_code - Target Encoding
# Mean response per region, learned from the training split only.
target_encode_region_code = pd.concat([X_train, y_train], axis=1).groupby('region_code')['response'].mean()
X_train['region_code'] = X_train['region_code'].map(target_encode_region_code)
# A region absent from the training split has no learned mean and would map
# to NaN; fall back to the overall training response rate instead.
X_test['region_code'] = X_test['region_code'].map(target_encode_region_code).fillna(y_train.mean())
# Context manager guarantees the file is flushed and closed.
with open('../src/parameters/region_code_scaler.pkl', 'wb') as pkl_file:
    pickle.dump(target_encode_region_code, pkl_file)

# vehicle_age - One Hot Encoding
X_train = pd.get_dummies(X_train, columns=['vehicle_age'], prefix='vehicle_age')
X_test = pd.get_dummies(X_test, columns=['vehicle_age'], prefix='vehicle_age')
# The two splits are encoded independently, so a category present in only one
# of them would give the frames different columns. Align test to train:
# missing dummies are added as 0, dummies unseen in training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# policy_sales_channel - Frequency Encoding
# Relative frequency of each channel, learned from the training split only.
frequency_encode_sales_channel = X_train['policy_sales_channel'].value_counts(normalize=True)
X_train['policy_sales_channel'] = X_train['policy_sales_channel'].map(frequency_encode_sales_channel)
# A channel never seen in training has an observed training frequency of 0.
X_test['policy_sales_channel'] = X_test['policy_sales_channel'].map(frequency_encode_sales_channel).fillna(0)
with open('../src/parameters/policy_sales_channel_scaler.pkl', 'wb') as pkl_file:
    pickle.dump(frequency_encode_sales_channel, pkl_file)