In [1]:
### importing packages ###
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from datetime import datetime, date, time

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from matplotlib.pyplot import figure
from pylab import rcParams
import joblib

In [3]:
import random
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency, chisquare, lognorm, kstest, shapiro, normaltest, chi2, levene, ttest_ind, t

#### importing pickled dataset

In [4]:
#### import the pickled X_train dataset ####
# Load the pickled training frame from the working directory. Every feature
# cell below rebinds X_train_eda with additional columns attached.
# NOTE(review): joblib.load unpickles arbitrary objects and can execute code;
# only load files that come from a trusted source.
X_train_eda = joblib.load("X_train_eda.pkl")
X_train_eda.shape

(381109, 17)

#### Feature Engineering

#### 1. f_region_cd_cnt

In [5]:
#### row count per region_cd ####
# Lookup table: one row per distinct region_cd plus the number of rows carrying it.
df_region_cd_cnt = (
    X_train_eda.groupby('region_cd')
    .size()
    .reset_index(name='f_region_cd_cnt')
)
print(df_region_cd_cnt.shape)

# attach "f_region_cd_cnt" to each row through its region_cd
X_train_eda = X_train_eda.merge(df_region_cd_cnt, how='inner', on=['region_cd'])
X_train_eda.shape

(53, 2)


(381109, 18)

#### 2. f_vehicle_age_p

In [6]:
#### share of rows per vehicle_age ####
# Lookup table: group size divided by the total number of rows in the frame.
df_vehicle_age_p = (
    X_train_eda.groupby('vehicle_age')
    .size()
    .div(len(X_train_eda))
    .reset_index(name='f_vehicle_age_p')
)
print(df_vehicle_age_p.shape)

# attach "f_vehicle_age_p" to each row through its vehicle_age
X_train_eda = X_train_eda.merge(df_vehicle_age_p, how='inner', on=['vehicle_age'])
X_train_eda.shape

(3, 2)


(381109, 19)

#### 3. f_policy_channel_p

In [7]:
#### share of rows per sales_channel_rare ####
# Lookup table: group size divided by the total number of rows in the frame.
df_policy_channel_p = (
    X_train_eda.groupby('sales_channel_rare')
    .size()
    .div(len(X_train_eda))
    .reset_index(name='f_policy_channel_p')
)
print(df_policy_channel_p.shape)

# attach "f_policy_channel_p" to each row through its sales_channel_rare
X_train_eda = X_train_eda.merge(df_policy_channel_p, how='inner', on=['sales_channel_rare'])
X_train_eda.shape

(10, 2)


(381109, 20)

#### 4. f_vintage_cnt

In [8]:
#### row count per vintage ####
# Lookup table of raw counts (not probabilities, despite the "_p" in the variable name).
df_vintage_cnt_p = (
    X_train_eda.groupby('vintage')
    .size()
    .reset_index(name='f_vintage_cnt')
)
print(df_vintage_cnt_p.shape)

# attach "f_vintage_cnt" to each row through its vintage
X_train_eda = X_train_eda.merge(df_vintage_cnt_p, how='inner', on=['vintage'])
X_train_eda.shape

(290, 2)


(381109, 21)

#### 5. f_age_bins_p

In [9]:
#### share of rows per age_bins ####
# Lookup table: group size divided by the total number of rows in the frame.
df_age_bins_p = (
    X_train_eda.groupby('age_bins')
    .size()
    .div(len(X_train_eda))
    .reset_index(name='f_age_bins_p')
)
print(df_age_bins_p.shape)

# attach "f_age_bins_p" to each row through its age_bins
X_train_eda = X_train_eda.merge(df_age_bins_p, how='inner', on=['age_bins'])
X_train_eda.shape


(4, 2)


(381109, 22)

#### 6. f_age_bins_premium_median

In [10]:
#### median annual_premium_o per age_bins ####
# Lookup table: one row per age_bins value with the median premium of that group.
df_age_bins_premium_me = (
    X_train_eda.groupby('age_bins')['annual_premium_o']
    .median()
    .reset_index(name='f_age_bins_premium_me')
)
print(df_age_bins_premium_me.shape)

# attach "f_age_bins_premium_me" to each row through its age_bins
X_train_eda = X_train_eda.merge(df_age_bins_premium_me, how='inner', on=['age_bins'])
X_train_eda.shape


(4, 2)


(381109, 23)

#### 7. f_vehicle_age_premium_std

In [11]:
#### std of annual_premium_o per vehicle_age ####
# Lookup table: one row per vehicle_age value with the premium standard deviation of that group.
df_vehicle_age_premium_s = (
    X_train_eda.groupby('vehicle_age')['annual_premium_o']
    .std()
    .reset_index(name='f_vehicle_age_premium_std')
)
print(df_vehicle_age_premium_s.shape)

# attach "f_vehicle_age_premium_std" to each row through its vehicle_age
X_train_eda = X_train_eda.merge(df_vehicle_age_premium_s, how='inner', on=['vehicle_age'])
X_train_eda.shape


(3, 2)


(381109, 24)

#### 8. f_channel_premium_std

In [12]:
#### std of annual_premium_o per sales_channel_rare ####
# Lookup table: one row per sales_channel_rare value with the premium standard deviation of that group.
df_channel_premium_s = (
    X_train_eda.groupby('sales_channel_rare')['annual_premium_o']
    .std()
    .reset_index(name='f_channel_premium_std')
)
print(df_channel_premium_s.shape)

# attach "f_channel_premium_std" to each row through its sales_channel_rare
X_train_eda = X_train_eda.merge(df_channel_premium_s, how='inner', on=['sales_channel_rare'])
X_train_eda.shape


(10, 2)


(381109, 25)

#### 9. f_region_premium_std

In [13]:
#### std of annual_premium_o per region_cd_rare ####
# Lookup table: one row per region_cd_rare value with the premium standard deviation of that group.
# NOTE(review): the column is named "f_region_premium_m" although it holds a std;
# the name is kept as-is because the pickled outputs expose it.
df_region_premium_s = (
    X_train_eda.groupby('region_cd_rare')['annual_premium_o']
    .std()
    .reset_index(name='f_region_premium_m')
)
print(df_region_premium_s.shape)

# attach "f_region_premium_m" to each row through its region_cd_rare
X_train_eda = X_train_eda.merge(df_region_premium_s, how='inner', on=['region_cd_rare'])
X_train_eda.shape


(26, 2)


(381109, 26)

#### 10. f_licence_premium_mean

In [14]:
#### mean annual_premium_o per driving_licence ####
# Lookup table: one row per driving_licence value with the mean premium of that group.
df_licence_premium_m = (
    X_train_eda.groupby('driving_licence')['annual_premium_o']
    .mean()
    .reset_index(name='f_licence_premium_m')
)
print(df_licence_premium_m.shape)

# attach "f_licence_premium_m" to each row through its driving_licence
X_train_eda = X_train_eda.merge(df_licence_premium_m, how='inner', on=['driving_licence'])
X_train_eda.shape

(2, 2)


(381109, 27)

#### 11. f_g_age_bins_premium_std

In [15]:
#### std of annual_premium_o per (gender, age_bins) ####
# Lookup table: one row per (gender, age_bins) pair with the standard deviation
# of annual_premium_o inside that group.
# The previous version called .size() here, so the column named
# "f_g_age_bins_premium_std" actually stored the group row count; .std() makes
# the values match the feature name.
df_g_age_bins_premium_s = pd.DataFrame(X_train_eda.groupby(['gender', 'age_bins'])['annual_premium_o'].std())
df_g_age_bins_premium_s = df_g_age_bins_premium_s.reset_index()
df_g_age_bins_premium_s = df_g_age_bins_premium_s.rename(columns = {'annual_premium_o' : 'f_g_age_bins_premium_std'})
print(df_g_age_bins_premium_s.shape)

# adding new column "f_g_age_bins_premium_std"
X_train_eda = pd.merge(X_train_eda, df_g_age_bins_premium_s, how = 'inner', on= ['gender', 'age_bins'] )
X_train_eda.shape


(8, 3)


(381109, 28)

#### 12. f_v_dam_age_premium_mean

In [16]:
#### mean annual_premium_o per (vehicle_damage, vehicle_age) ####
# Lookup table: one row per key pair with the mean premium of that group.
df_v_dam_age_premium_m = (
    X_train_eda.groupby(['vehicle_damage', 'vehicle_age'])['annual_premium_o']
    .mean()
    .reset_index(name='f_v_dam_age_premium_m')
)
print(df_v_dam_age_premium_m.shape)

# attach "f_v_dam_age_premium_m" to each row through both keys
X_train_eda = X_train_eda.merge(
    df_v_dam_age_premium_m, how='inner', on=['vehicle_damage', 'vehicle_age']
)
X_train_eda.shape

(6, 3)


(381109, 29)

#### 13. f_insured_dam_premium_std

In [17]:
#### std of annual_premium_o per (previously_insured, vehicle_damage) ####
# Lookup table: one row per key pair with the premium standard deviation of that group.
df_insured_dam_premium_s = (
    X_train_eda.groupby(['previously_insured', 'vehicle_damage'])['annual_premium_o']
    .std()
    .reset_index(name='f_insured_dam_premium_s')
)
print(df_insured_dam_premium_s.shape)

# attach "f_insured_dam_premium_s" to each row through both keys
X_train_eda = X_train_eda.merge(
    df_insured_dam_premium_s, how='inner', on=['previously_insured', 'vehicle_damage']
)
X_train_eda.shape


(4, 3)


(381109, 30)

#### 14. f_age_dist_cnt

In [18]:
#### row count per age ####
# Lookup table: one row per distinct age plus the number of rows carrying it.
df_age_dist_cnt = (
    X_train_eda.groupby('age')
    .size()
    .reset_index(name='f_age_dist_cnt')
)
print(df_age_dist_cnt.shape)

# attach "f_age_dist_cnt" to each row through its age
X_train_eda = X_train_eda.merge(df_age_dist_cnt, how='inner', on=['age'])
X_train_eda.shape

(66, 2)


(381109, 31)

#### 15. f_premium_more_10k

In [19]:
#### flag: annual_premium_o above 10000 ####
# 1 where the premium is strictly greater than 10000, otherwise 0.
X_train_eda = X_train_eda.assign(
    f_premium_more_10k=np.where(X_train_eda['annual_premium_o'].gt(10000), 1, 0)
)
X_train_eda.shape

(381109, 32)

#### 16. f_licence_bins_mean

In [20]:
#### mean annual_premium_o per (driving_licence, age_bins) ####
# Lookup table: one row per key pair with the mean premium of that group.
df_dl_bins_premium_m = (
    X_train_eda.groupby(['driving_licence', 'age_bins'])['annual_premium_o']
    .mean()
    .reset_index(name='f_licence_bins_mean')
)
print(df_dl_bins_premium_m.shape)

# attach "f_licence_bins_mean" to each row through both keys
X_train_eda = X_train_eda.merge(
    df_dl_bins_premium_m, how='inner', on=['driving_licence', 'age_bins']
)
X_train_eda.shape

(8, 3)


(381109, 33)

#### 17. f_insure_vehicle_age_dam_mean

In [21]:
#### mean annual_premium_o per (previously_insured, vehicle_age, vehicle_damage) ####
# Lookup table: one row per key triple with the mean premium of that group.
df_insure_vehicle_age_dam_m = (
    X_train_eda.groupby(['previously_insured', 'vehicle_age', 'vehicle_damage'])['annual_premium_o']
    .mean()
    .reset_index(name='f_insure_v_age_dam_premium_m')
)
print(df_insure_vehicle_age_dam_m.shape)

# attach "f_insure_v_age_dam_premium_m" to each row through all three keys
X_train_eda = X_train_eda.merge(
    df_insure_vehicle_age_dam_m,
    how='inner',
    on=['previously_insured', 'vehicle_age', 'vehicle_damage'],
)
X_train_eda.shape
# author's note: "mean with bimodal" — presumably about the premium distribution; TODO confirm

(12, 4)


(381109, 34)

#### 18. f_region_channel_premium_std

In [22]:
#### std of annual_premium_o per (region_cd_rare, sales_channel_rare) ####
# Lookup table: one row per key pair with the premium standard deviation of that group.
df_reg_cha_premium_s = (
    X_train_eda.groupby(['region_cd_rare', 'sales_channel_rare'])['annual_premium_o']
    .std()
    .reset_index(name='f_reg_cha_premium_std')
)
print(df_reg_cha_premium_s.shape)

# attach "f_reg_cha_premium_std" to each row through both keys
X_train_eda = X_train_eda.merge(
    df_reg_cha_premium_s, how='inner', on=['region_cd_rare', 'sales_channel_rare']
)
X_train_eda.shape


(260, 3)


(381109, 35)

#### 19. f_insure_region_premium_std

In [23]:
#### std of annual_premium_o per (previously_insured, region_cd_rare) ####
# Lookup table: one row per key pair with the premium standard deviation of that group.
# NOTE(review): the column is named "f_insure_region_premium_m" although it holds a std;
# the name is kept as-is because the pickled outputs expose it.
df_insure_reg_premium_s = (
    X_train_eda.groupby(['previously_insured', 'region_cd_rare'])['annual_premium_o']
    .std()
    .reset_index(name='f_insure_region_premium_m')
)
print(df_insure_reg_premium_s.shape)

# attach "f_insure_region_premium_m" to each row through both keys
X_train_eda = X_train_eda.merge(
    df_insure_reg_premium_s, how='inner', on=['previously_insured', 'region_cd_rare']
)
X_train_eda.shape

(52, 3)


(381109, 36)

#### 20. f_insure_channel_premium_std

In [24]:
#### std of annual_premium_o per (previously_insured, sales_channel_rare) ####
# Lookup table: one row per key pair with the premium standard deviation of that group.
df_insure_cha_premium_s = (
    X_train_eda.groupby(['previously_insured', 'sales_channel_rare'])['annual_premium_o']
    .std()
    .reset_index(name='f_insure_cha_premium_s')
)
print(df_insure_cha_premium_s.shape)

# attach "f_insure_cha_premium_s" to each row through both keys
X_train_eda = X_train_eda.merge(
    df_insure_cha_premium_s, how='inner', on=['previously_insured', 'sales_channel_rare']
)
X_train_eda.shape

(20, 3)


(381109, 37)

#### 21. f_region_cd_rare_p

In [25]:
#### share of rows per region_cd_rare ####
# Lookup table: group size divided by the total number of rows in the frame.
df_region_cd_p = (
    X_train_eda.groupby('region_cd_rare')
    .size()
    .div(len(X_train_eda))
    .reset_index(name='f_region_cd_rare_p')
)
print(df_region_cd_p.shape)

# attach "f_region_cd_rare_p" to each row through its region_cd_rare
X_train_eda = X_train_eda.merge(df_region_cd_p, how='inner', on=['region_cd_rare'])
X_train_eda.shape

(26, 2)


(381109, 38)

#### 22. f_policy_channel_cnt

In [26]:
#### row count per policy_sales_channel ####
# Lookup table: one row per distinct policy_sales_channel plus the number of rows carrying it.
df_policy_cnt = (
    X_train_eda.groupby('policy_sales_channel')
    .size()
    .reset_index(name='f_channel_cnt')
)
print(df_policy_cnt.shape)

# attach "f_channel_cnt" to each row; unlike the other feature cells this join is a left join
X_train_eda = X_train_eda.merge(df_policy_cnt, how='left', on=['policy_sales_channel'])
X_train_eda.shape

(155, 2)


(381109, 39)

#### pickling the variables ####

In [27]:
#### pickling the train dataset ####
# Print the final shape, then write the frame (original columns plus every
# engineered column added above) to "X_train_fe.pkl" in the working directory.
print(X_train_eda.shape)
joblib.dump(X_train_eda, "X_train_fe.pkl")

(381109, 39)


['X_train_fe.pkl']

In [28]:
#### Pickling Feature Engineering variables (part 1) ####
# The lookup tables are stored as one tuple; anything that unpacks the file
# depends on this exact order.
joblib.dump(
    (
        df_region_cd_cnt,
        df_vehicle_age_p,
        df_policy_channel_p,
        df_vintage_cnt_p,
        df_age_bins_p,
        df_age_bins_premium_me,
        df_vehicle_age_premium_s,
        df_channel_premium_s,
        df_region_premium_s,
        df_licence_premium_m,
    ),
    "fe_1.pkl",
)

['fe_1.pkl']

In [29]:
#### Pickling Feature Engineering variables (part 2) ####
# The remaining lookup tables are stored as one tuple; anything that unpacks
# the file depends on this exact order.
joblib.dump(
    (
        df_g_age_bins_premium_s,
        df_v_dam_age_premium_m,
        df_insured_dam_premium_s,
        df_age_dist_cnt,
        df_dl_bins_premium_m,
        df_insure_vehicle_age_dam_m,
        df_reg_cha_premium_s,
        df_insure_reg_premium_s,
        df_insure_cha_premium_s,
        df_region_cd_p,
        df_policy_cnt,
    ),
    "fe_2.pkl",
)

['fe_2.pkl']