# Binary Classification of Insurance Selling - Model Selection

This workbook builds on the previous one, with the aim of improving the ROC AUC score achieved there.

url: https://www.kaggle.com/competitions/playground-series-s4e7/overview

I intend to train several different models, reusing the preprocessing and upsampling techniques outlined in the first workbook.

# Lib Imports

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

In [12]:
import os

# Pick the dataset locations with a single filesystem check: use the
# /kaggle/input/ copies when that directory exists, otherwise fall back to
# CSV files in the current working directory.
if os.path.exists('/kaggle/input/'):
    TRAIN_DATASET_DIR = '/kaggle/input/playground-series-s4e7/train.csv'
    TEST_DATASET_DIR = '/kaggle/input/playground-series-s4e7/test.csv'
else:
    TRAIN_DATASET_DIR = 'train.csv'
    TEST_DATASET_DIR = 'test.csv'

# Echo the resolved paths so the run records which copy of the data was used.
print(f"train dataset dir: {TRAIN_DATASET_DIR}")
print(f"test dataset dir: {TEST_DATASET_DIR}")

train dataset dir: /kaggle/input/playground-series-s4e7/train.csv
test dataset dir: /kaggle/input/playground-series-s4e7/test.csv


In [18]:
# Name of the target column in the training data.
Y_COLUMN = 'Response'
# Fraction of rows held out by train_test_split for evaluation.
TEST_SIZE = 0.25
# Seed shared by SMOTE and train_test_split so runs are repeatable.
RANDOM_STATE = 32

In [13]:
def preprocess_dataframe(df, vehicle_age_categories=('1-2 Year', '< 1 Year', '> 2 Years')):
    """Encode the categorical columns of ``df`` in place.

    Transformations applied to ``df`` (the frame is mutated; nothing is returned):

    * ``Gender``: ``'Male'`` -> 0, any other value -> 1.
    * ``Vehicle_Damage``: ``'Yes'`` -> 1, any other value -> 0.
    * ``Vehicle_Age``: replaced by one 0/1 indicator column per label.
    * ``Vehicle_Age`` and ``id`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Gender``, ``Vehicle_Damage``, ``Vehicle_Age`` and ``id``.
    vehicle_age_categories : sequence of str, optional
        Labels that always get an indicator column, in this order, even when a
        label does not occur in ``df``. This keeps the feature schema identical
        between frames (e.g. train vs. test) regardless of which labels each
        one happens to contain. Labels found in ``df`` but not listed here keep
        their own column, appended after the expected ones.

    Raises
    ------
    KeyError
        If one of the required columns is missing, which includes calling the
        function a second time on an already-processed frame.
    """
    # Vectorized comparisons instead of a Python-level lambda per row.
    df['Gender'] = df['Gender'].ne('Male').astype(int)
    df['Vehicle_Damage'] = df['Vehicle_Damage'].eq('Yes').astype(int)

    vehicle_age_one_hot = pd.get_dummies(df['Vehicle_Age'])
    unexpected_labels = [
        label for label in vehicle_age_one_hot.columns
        if label not in vehicle_age_categories
    ]
    # Force a fixed column set/order; absent categories become all-zero columns.
    vehicle_age_one_hot = vehicle_age_one_hot.reindex(
        columns=[*vehicle_age_categories, *unexpected_labels], fill_value=0
    ).astype(int)

    # Assign the raw array (positional) rather than the frame itself, so the
    # result does not depend on index alignment between the two objects.
    df[vehicle_age_one_hot.columns] = vehicle_age_one_hot.to_numpy()
    df.drop(['Vehicle_Age', 'id'], axis=1, inplace=True)

## 1. Data Loading and Preprocessing

In [14]:
# dd.read_csv reads the file in partitions and .compute() concatenates them
# with their per-partition indexes, so row labels can repeat across the frame.
# Reset to a unique 0..n-1 index so label-based operations stay unambiguous.
train_df = dd.read_csv(TRAIN_DATASET_DIR).compute().reset_index(drop=True)
train_df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [15]:
# dd.read_csv reads the file in partitions and .compute() concatenates them
# with their per-partition indexes, so row labels can repeat across the frame.
# Reset to a unique 0..n-1 index so label-based operations stay unambiguous.
test_df = dd.read_csv(TEST_DATASET_DIR).compute().reset_index(drop=True)
test_df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
1,11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
2,11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
3,11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
4,11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


In [16]:
# preprocess_dataframe mutates train_df in place (encodes Gender / Vehicle_Damage,
# one-hot encodes Vehicle_Age, drops 'Vehicle_Age' and 'id'). Because the source
# columns are dropped, this cell cannot be re-run without reloading train_df.
preprocess_dataframe(train_df)
train_df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,1-2 Year,< 1 Year,> 2 Years
0,0,21,1,35.0,0,1,65101.0,124.0,187,0,1,0,0
1,0,43,1,28.0,0,1,58911.0,26.0,288,1,0,0,1
2,1,25,1,14.0,1,0,38043.0,152.0,254,0,0,1,0
3,1,35,1,1.0,0,1,2630.0,156.0,76,0,1,0,0
4,1,36,1,15.0,1,0,31951.0,152.0,294,0,1,0,0


## 2. Upsampling and Train Test Split

In [19]:
# Feature matrix: every column except the target.
X = train_df.drop(columns=[Y_COLUMN])
# Target vector.
y = train_df[Y_COLUMN]

In [20]:
smote_sampler = SMOTE(random_state=RANDOM_STATE)
X_smo, y_smo = smote_sampler.fit_resample(X, y)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_smo, y_smo, test_size=TEST_SIZE, random_state=RANDOM_STATE)

## 3. Preprocessor Definition