# 0. IMPORTS

Imports of essential Python libraries, the dataset and helper functions.

## 0.1. Python Libraries

In [1]:
import inflection

import pandas     as pd
import numpy      as np
import seaborn    as sns
sns.set_style('darkgrid')
import scikitplot as skplt

from matplotlib        import pyplot          as plt
from matplotlib.pyplot import figure
from sklearn           import preprocessing   as pp
from sklearn           import model_selection as ms
from sklearn           import ensemble        as en
from sklearn           import neighbors       as nh
from sklearn           import linear_model    as lm

## 0.2. Loading the [Dataset](https://www.kaggle.com/datasets/anmolkumar/health-insurance-cross-sell-prediction?select=test.csv)

In [2]:
# Read the training CSV (path is relative to the notebook's working directory)
# into `df_raw`, then display its first rows as a quick sanity check.
df_raw = pd.read_csv('dataset/train.csv')
df_raw.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


# 1. DATA DESCRIPTION

This section contains relevant information about the dataset.

In [3]:
# Work on a copy so that `df_raw` keeps the data exactly as it was loaded.
df1 = df_raw.copy()

In [5]:
# Convert every column name to snake_case.
#
# The names are read from the frame itself instead of a hand-typed list:
# assigning a hard-coded list to `df1.columns` is positional, so a CSV with a
# different column order or count would be mislabelled silently (or fail).
# `inflection.underscore` leaves already snake_cased names untouched, so this
# cell can be re-run safely.
snakecase = inflection.underscore

cols_old = list( df1.columns )
cols_new = [ snakecase( col ) for col in cols_old ]

df1.columns = cols_new

## 1.1. Data Preview

In [6]:
# Preview the first rows to confirm the column names are now snake_case.
df1.head()

Unnamed: 0,id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage,response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


## 1.2. Data Dimension and Data Types

In [7]:
# Report the dataset dimensions (rows first, then columns).
n_rows, n_cols = df1.shape
print( f'Number of Rows: {n_rows}' )
print( f'Number of Columns: {n_cols}' )

Number of Rows: 381109
Number of Columns: 12


In [8]:
# Data type pandas inferred for each column.
df1.dtypes

id                        int64
gender                   object
age                       int64
driving_license           int64
region_code             float64
previously_insured        int64
vehicle_age              object
vehicle_damage           object
annual_premium          float64
policy_sales_channel    float64
vintage                   int64
response                  int64
dtype: object

In [9]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   gender                381109 non-null  object 
 2   age                   381109 non-null  int64  
 3   driving_license       381109 non-null  int64  
 4   region_code           381109 non-null  float64
 5   previously_insured    381109 non-null  int64  
 6   vehicle_age           381109 non-null  object 
 7   vehicle_damage        381109 non-null  object 
 8   annual_premium        381109 non-null  float64
 9   policy_sales_channel  381109 non-null  float64
 10  vintage               381109 non-null  int64  
 11  response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


## 1.3. Check NA

In [10]:
# Number of missing values in each column.
df1.isnull().sum()

id                      0
gender                  0
age                     0
driving_license         0
region_code             0
previously_insured      0
vehicle_age             0
vehicle_damage          0
annual_premium          0
policy_sales_channel    0
vintage                 0
response                0
dtype: int64

## 1.4. Data Descriptive

There are 2 types of data:

- Numerical data: numbers
- Categorical data: everything else

In [11]:
# Split the columns by dtype: numeric columns get summary statistics here,
# everything else is treated as categorical.
num_attributes = df1.select_dtypes( include=['int64', 'float64'] )
cat_attributes = df1.select_dtypes( exclude=['int64', 'float64', 'datetime64[ns]'] )

# One row per numeric attribute, one column per statistic.
# Vectorized DataFrame reductions are used instead of `apply( min )` /
# `apply( max )`, which run the Python builtins element by element.
# `ddof=0` (population standard deviation) is stated explicitly because it is
# what `np.std` computes; `describe()` uses the sample version (ddof=1), which
# is why the two tables differ slightly in the `std` column.
m = pd.DataFrame( {
    'min':      num_attributes.min(),
    'max':      num_attributes.max(),
    'range':    num_attributes.max() - num_attributes.min(),
    'mean':     num_attributes.mean(),
    'median':   num_attributes.median(),
    'std':      num_attributes.std( ddof=0 ),
    'skew':     num_attributes.skew(),
    'kurtosis': num_attributes.kurtosis(),
} ).rename_axis( 'attributes' ).reset_index()

m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,id,1.0,381109.0,381108.0,190555.0,190555.0,110016.69187,9.443274e-16,-1.2
1,age,20.0,85.0,65.0,38.822584,36.0,15.511591,0.672539,-0.565655
2,driving_license,0.0,1.0,1.0,0.997869,1.0,0.046109,-21.59518,464.354302
3,region_code,0.0,52.0,52.0,26.388807,28.0,13.229871,-0.1152664,-0.867857
4,previously_insured,0.0,1.0,1.0,0.45821,0.0,0.498251,0.1677471,-1.971871
5,annual_premium,2630.0,540165.0,537535.0,30564.389581,31669.0,17213.132474,1.766087,34.004569
6,policy_sales_channel,1.0,163.0,162.0,112.034295,133.0,54.203924,-0.9000081,-0.97081
7,vintage,10.0,299.0,289.0,154.347397,154.0,83.671194,0.003029517,-1.200688
8,response,0.0,1.0,1.0,0.122563,0.0,0.327935,2.301906,3.298788


In [12]:
# Standard pandas summary (one row per numeric column), rounded to 2 decimals.
df1.describe().T.round( 2 )

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,381109.0,190555.0,110016.84,1.0,95278.0,190555.0,285832.0,381109.0
age,381109.0,38.82,15.51,20.0,25.0,36.0,49.0,85.0
driving_license,381109.0,1.0,0.05,0.0,1.0,1.0,1.0,1.0
region_code,381109.0,26.39,13.23,0.0,15.0,28.0,35.0,52.0
previously_insured,381109.0,0.46,0.5,0.0,0.0,0.0,1.0,1.0
annual_premium,381109.0,30564.39,17213.16,2630.0,24405.0,31669.0,39400.0,540165.0
policy_sales_channel,381109.0,112.03,54.2,1.0,29.0,133.0,152.0,163.0
vintage,381109.0,154.35,83.67,10.0,82.0,154.0,227.0,299.0
response,381109.0,0.12,0.33,0.0,0.0,0.0,0.0,1.0


In [13]:
# Number of distinct values in each categorical column; `dropna=False` counts a
# missing value as its own level, matching what `unique()` would report.
cat_attributes.nunique( dropna=False )

gender            2
vehicle_age       3
vehicle_damage    2
dtype: int64

In [14]:
# Checkpoint: write the renamed dataset to disk without the index column.
df1.to_csv('dataset/df1.csv', index=False)

# 2. FEATURE ENGINEERING

Feature engineering refers to the manipulation (addition, deletion, combination, mutation) of a dataset to improve machine learning model training, leading to better performance and greater accuracy.

3 manipulations were done:
1. 'vehicle_age' column modified
2. 'vehicle_damage' column modified
3. Division of our dataset into 3 parts: train, validation and test
<br>


- Training Dataset (70%): Used to train the machine learning models
- Validation Dataset (20%): Used to tune the hyperparameters
- Test Dataset (10%): Used to evaluate how effective the models are

In [15]:
# Start this section from the CSV checkpoint written at the end of section 1.
df2 = pd.read_csv('dataset/df1.csv')

## 2.1. Vehicle Age

In [16]:
# Recode the raw vehicle-age labels as snake_case category names.
#
# Each raw label is listed explicitly and any value not in the mapping is passed
# through unchanged. This makes the cell safe to re-run: the previous catch-all
# `else` branch relabelled every already-recoded row as 'below_1_year' on a
# second execution, and would also have hidden any unexpected category.
vehicle_age_labels = { '> 2 Years': 'over_2_years',
                       '1-2 Year':  'between_1_2_year',
                       '< 1 Year':  'below_1_year' }

df2['vehicle_age'] = df2['vehicle_age'].map( lambda x: vehicle_age_labels.get( x, x ) )

## 2.2. Vehicle Damage

In [17]:
# Encode vehicle damage as a binary flag: 'Yes' -> 1, 'No' -> 0.
#
# Values not in the mapping (including the 0/1 produced by an earlier run) are
# passed through unchanged, so re-running this cell no longer turns every row
# into 0 the way the previous `1 if x == 'Yes' else 0` did.
vehicle_damage_flags = { 'Yes': 1, 'No': 0 }

df2['vehicle_damage'] = df2['vehicle_damage'].map( lambda x: vehicle_damage_flags.get( x, x ) )

In [18]:
# Separate the predictors from the target column.
X = df2.drop( 'response', axis=1 )
y = df2['response'].copy()

# Fixed seed for both splits: without `random_state` the rows assigned to each
# dataset (and therefore the CSVs saved below and every downstream result)
# change on every run of the notebook.
SPLIT_SEED = 42

# Split the dataset into REMAINING (90%) and TEST (10%), stratified on the target
X_remaining, X_test, y_remaining, y_test = ms.train_test_split( X, y, test_size=0.10, stratify=y, shuffle=True, random_state=SPLIT_SEED )

df_test = pd.concat( [X_test, y_test], axis=1 ) # test dataset

# Split REMAINING into TRAIN and VALIDATION, again stratified on the target.
# 0.222 of the remaining 90% is roughly 20% of the full dataset.
X_train, X_val, y_train, y_val = ms.train_test_split( X_remaining, y_remaining, test_size=0.222, stratify=y_remaining, shuffle=True, random_state=SPLIT_SEED )

# NOTE: `df2` is rebound to the TRAIN split here, so re-running this cell without
# re-running the cells above would split the training data a second time.
df2 = pd.concat( [X_train, y_train], axis=1 ) # train dataset

df_val = pd.concat( [X_val, y_val], axis=1 ) # validation dataset

In [19]:
# Row counts of each split and of the three splits combined.
n_train, n_val, n_test = df2.shape[0], df_val.shape[0], df_test.shape[0]
total = n_train + n_val + n_test

# Size of each split.
print( f'TRAIN      DATASET | Number of Rows: {n_train}, Number of Columns: {df2.shape[1]}' )
print( f'VALIDATION DATASET | Number of Rows: {n_val} , Number of Columns: {df_val.shape[1]}' )
print( f'TEST       DATASET | Number of Rows: {n_test} , Number of Columns: {df_test.shape[1]}' )

# Share of the full dataset held by each split, as a percentage with 2 decimals.
pct_train = round( 100*n_train/total, 2 )
pct_val   = round( 100*n_val/total, 2 )
pct_test  = round( 100*n_test/total, 2 )

print( f'\nTrain     : {pct_train} %\nValidation: {pct_val} %\nTest      : {pct_test}  %' )

TRAIN      DATASET | Number of Rows: 266852, Number of Columns: 12
VALIDATION DATASET | Number of Rows: 76146 , Number of Columns: 12
TEST       DATASET | Number of Rows: 38111 , Number of Columns: 12

Train     : 70.02 %
Validation: 19.98 %
Test      : 10.0  %


In [20]:
# train 
# Persist the three splits (train, validation, test) as CSV files without the
# index column, in that order.
for frame, path in ( ( df2,     'dataset/df2.csv' ),
                     ( df_val,  'dataset/df_val.csv' ),
                     ( df_test, 'dataset/df_test.csv' ) ):
    frame.to_csv( path, index=False )