<a href="https://colab.research.google.com/github/Mark-Barbaric/Kaggle/blob/AP-39-Train-different-classifier-models-for-Binary-Classification-Kaggle-Comp/binary_classification_of_insurance_selling%20/bc_insurance_selling_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'playground-series-s4e7:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F73291%2F8930475%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240729%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240729T073119Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5c6517b2aff4e77d7fbda9ed5122ef8aefd686572c315e1a7b79465816002fe8cf1ce4ed13137f9b83772d04851018388df101d05c20652a53d66588df2925dcd22a87cd896a9d7ead4efab51839bbd0203cd7204c88fdcb14da9f460bb882f7cecfc8188426c5837bb548bbfe7c127dfd1e68de2b7d0671f72747df365c9160cef31bdd2bd8f9766666ce6b46215d679e62599d158137662a4659233689481d02e98ba9ff69a142d4aa42b3a4e4f6ec883f349726c989b46e48db13947a15a3f1248391ddad952982c915851265aaf5c203a92ef20d403b8172b10d090e949f71ffe097534939248d52565bc9af0065ec19b1cf437bacb2008f2fc7a073e491'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading playground-series-s4e7, 259402906 bytes compressed
Data source import complete.


# Binary Classification of Insurance Selling - Model Selection

The aim of this workbook is to build on the previous workbook in order to improve the roc_auc score achieved.

url: https://www.kaggle.com/competitions/playground-series-s4e7/overview

I intend to train multiple different models using the same preprocessing and upsampling techniques outlined in the first workbook.

# Lib Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
import scipy.stats as stats

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

In [4]:
import os

TRAIN_DATASET_DIR = '/kaggle/input/playground-series-s4e7/train.csv' if os.path.exists('/kaggle/input/') else 'train.csv'
TEST_DATASET_DIR = '/kaggle/input/playground-series-s4e7/test.csv' if os.path.exists('/kaggle/input/') else 'test.csv'

print(f"train dataset dir: {TRAIN_DATASET_DIR}")
print(f"test dataset dir: {TEST_DATASET_DIR}")

train dataset dir: /kaggle/input/playground-series-s4e7/train.csv
test dataset dir: /kaggle/input/playground-series-s4e7/test.csv


In [5]:
RANDOM_STATE = 32
Y_COLUMN = 'Response'
TEST_SIZE = 0.25

In [6]:
def preprocess_dataframe(df):
    df['Gender'] = df['Gender'].apply(lambda x: 0 if x == 'Male' else 1)
    df['Vehicle_Damage'] = df['Vehicle_Damage'].apply(lambda x: 1 if x == 'Yes' else 0)
    vehicle_age_one_hot = pd.get_dummies(df['Vehicle_Age'])
    df[vehicle_age_one_hot.columns] = vehicle_age_one_hot.values.astype(int)
    df.drop(['Vehicle_Age', 'id'], axis=1, inplace=True)

## 1. Data Loading and Preprocessing

In [7]:
train_df = dd.read_csv(TRAIN_DATASET_DIR)
train_df = train_df.compute()
train_df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [8]:
test_df = dd.read_csv(TEST_DATASET_DIR)
test_df = test_df.compute()
test_df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
1,11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
2,11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
3,11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
4,11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


In [9]:
preprocess_dataframe(train_df)
train_df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,1-2 Year,< 1 Year,> 2 Years
0,0,21,1,35.0,0,1,65101.0,124.0,187,0,1,0,0
1,0,43,1,28.0,0,1,58911.0,26.0,288,1,0,0,1
2,1,25,1,14.0,1,0,38043.0,152.0,254,0,0,1,0
3,1,35,1,1.0,0,1,2630.0,156.0,76,0,1,0,0
4,1,36,1,15.0,1,0,31951.0,152.0,294,0,1,0,0


## 2. Upsampling and Train Test Split

In [10]:
y = train_df[Y_COLUMN]
X = train_df.drop(Y_COLUMN, axis=1)

In [11]:
X.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,1-2 Year,< 1 Year,> 2 Years
0,0,21,1,35.0,0,1,65101.0,124.0,187,1,0,0
1,0,43,1,28.0,0,1,58911.0,26.0,288,0,0,1
2,1,25,1,14.0,1,0,38043.0,152.0,254,0,1,0
3,1,35,1,1.0,0,1,2630.0,156.0,76,1,0,0
4,1,36,1,15.0,1,0,31951.0,152.0,294,1,0,0


In [12]:
smote_sampler = SMOTE(random_state=RANDOM_STATE, n_jobs=-1)
X_smo, y_smo = smote_sampler.fit_resample(X, y)



In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_smo, y_smo, test_size=TEST_SIZE, random_state=RANDOM_STATE)

## 3. Preprocessor Definition

In [14]:
NUMERICAL_COLS = [col for col in train_df.columns if train_df[col].dtype != 'object' and col != Y_COLUMN]

In [15]:
numerical_transformer = Pipeline(
    steps=[
        ('ss', StandardScaler())
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, NUMERICAL_COLS)
    ],
    remainder='passthrough'
)

## 4. Model Training

In [16]:
xgb_classifier = XGBClassifier(random_state=RANDOM_STATE, subsample=0.5, max_depth=10, learning_rate=0.01, n_estimators=150)

In [17]:
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb_classifier', xgb_classifier)
    ])

In [18]:
pipeline.fit(X_train, y_train)

In [19]:
y_pred = pipeline.predict(X_test)

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.75      0.82   2521970
           1       0.78      0.92      0.85   2522900

    accuracy                           0.83   5044870
   macro avg       0.84      0.83      0.83   5044870
weighted avg       0.84      0.83      0.83   5044870



In [21]:
roc_auc_score(y_test, y_pred)

0.8317330825407419