[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DeutscheAktuarvereinigung/Python_fuer_Aktuare/blob/main/examples/ClaimPrediction/insurance-claim-prediction-notebook.ipynb) [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/DeutscheAktuarvereinigung/Python_fuer_Aktuare/blob/main/examples/ClaimPrediction/insurance-claim-prediction-notebook.ipynb)

In [None]:
# Import the required libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
#warnings.filterwarnings('ignore')

In [None]:
# Show up to 100 rows and 100 columns when pandas displays a DataFrame

DISPLAY_LIMIT = 100
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, DISPLAY_LIMIT)

In [None]:
# Load the train and test CSV files from the project's GitHub repository.
# (When working with a local copy, read './train.csv' and './test.csv' instead.)

train_url = 'https://raw.githubusercontent.com/DeutscheAktuarvereinigung/Python_fuer_Aktuare/refs/heads/main/examples/ClaimPrediction/train.csv'
test_url = 'https://raw.githubusercontent.com/DeutscheAktuarvereinigung/Python_fuer_Aktuare/refs/heads/main/examples/ClaimPrediction/test.csv'

train_df = pd.read_csv(train_url)
test_df = pd.read_csv(test_url)

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Preview the first rows of the test data
test_df.head()

## Data Insights and EDA

In [None]:
# Show the (rows, columns) shape of the train and the test dataset

train_df.shape, test_df.shape

In [None]:
# Summary of the training data: columns, non-null counts and dtypes

train_df.info()

### Comments:

- The summary shows that the dataset has no null values.
- There are 28 Categorical features and 16 numerical features.

In [None]:
# Count the missing values per column in the training data

train_df.isnull().sum()

In [None]:
# Count the missing values per column in the test data
test_df.isnull().sum()

### Comments:
- The dataset has no missing values.

In [None]:
# Count the rows of the training data that duplicate an earlier row

train_df.duplicated().sum()

### Comments:
- We can see there is no duplicate data in the dataset.

In [None]:
# View the statistical summary of the numerical variables

train_df.describe()

In [None]:
# Split the training columns by dtype: object columns are treated as
# categorical features, float64 / int64 columns as numerical features.
categorical = train_df.select_dtypes(include=[object])
numerical = train_df.select_dtypes(include=[np.float64, np.int64])

# Report the number of columns and the column names of each group
for label, subset in (("Categorical Features in DataSet:", categorical),
                      ("Numerical Features in DataSet:", numerical)):
    print(label, subset.shape[1])
    print(subset.columns)

In [None]:
# Class counts of the is_claim column
train_df['is_claim'].value_counts()

- The target column `is_claim` is imbalanced. We will handle this before feeding the data to the model.

## Univariate Analysis

### - Numerical Features

In [None]:
# Histogram of every numerical column, arranged in a two-column grid.
# The number of grid rows is derived from the number of columns, so the cell
# keeps working when the set of numerical columns changes.
numeric_cols = list(numerical.columns)
N_PLOT_COLS = 2
n_plot_rows = max(1, -(-len(numeric_cols) // N_PLOT_COLS))  # ceiling division

fig, axes = plt.subplots(n_plot_rows, N_PLOT_COLS, figsize=(10, 15), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, numeric_cols):
    sns.histplot(train_df[column], color='green', ax=ax)
    ax.set_title(column.title(), weight='bold')

# Hide grid cells that have no column to show
for ax in flat_axes[len(numeric_cols):]:
    ax.set_visible(False)

# Lay out the figure once, after all panels are drawn
fig.tight_layout()

### - Categorical Features

In [None]:
# policy_id is left out of the count plots (presumably a per-policy identifier).
# errors='ignore' keeps the cell re-runnable after the column is already gone.
categorical = categorical.drop(columns='policy_id', errors='ignore')

# Count plot of every remaining categorical column in a two-column grid whose
# row count is derived from the number of columns.
categorical_cols = list(categorical.columns)
N_PLOT_COLS = 2
n_plot_rows = max(1, -(-len(categorical_cols) // N_PLOT_COLS))  # ceiling division

fig, axes = plt.subplots(n_plot_rows, N_PLOT_COLS, figsize=(15, 25), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, categorical_cols):
    sns.countplot(data=train_df, x=column, ax=ax)
    ax.set_title(column.title(), weight='bold')

# Hide grid cells that have no column to show
for ax in flat_axes[len(categorical_cols):]:
    ax.set_visible(False)

# Lay out the figure once, after all panels are drawn
fig.tight_layout()

In [None]:
# Work on copies so that the loaded frames train_df / test_df stay unchanged
train_cpy, test_cpy = train_df.copy(), test_df.copy()

In [None]:
# Remove the policy_id column from both working copies
train_cpy = train_cpy.drop(columns=['policy_id'])
test_cpy = test_cpy.drop(columns=['policy_id'])

In [None]:
ordinal_col = ['max_torque', 'max_power', 'transmission_type', 'steering_type']

# Ordinal-encode the two type columns. .map returns a numeric column directly;
# .replace with a str -> int mapping relies on pandas' deprecated silent
# downcasting and may leave an object column that get_dummies would later
# one-hot encode again.
train_cpy['transmission_type'] = train_cpy['transmission_type'].map({'Manual': 1, 'Automatic': 2})
train_cpy['steering_type'] = train_cpy['steering_type'].map({'Manual': 1, 'Power': 2, 'Electric': 3})
# .map turns categories missing from the mappings into NaN - fail early and clearly.
assert train_cpy[['transmission_type', 'steering_type']].notna().all().all(), \
    'unexpected category in transmission_type / steering_type'

# Split max_torque at '@' into its two numeric parts.
# Assumes values of the form '<value>Nm@<value>rpm' (TODO confirm against the data):
# the slices strip a 2-character and a 3-character unit suffix.
torque_parts = train_cpy['max_torque'].astype(str).str.split('@', expand=True)
train_cpy['max_torque_Nm'] = torque_parts[0].str[:-2].astype(float)
train_cpy['max_torque_rpm'] = torque_parts[1].str[:-3].astype(int)

# Split max_power at '@' in the same way.
# Assumes values of the form '<value>bhp@<value>rpm' (TODO confirm against the data).
power_parts = train_cpy['max_power'].astype(str).str.split('@', expand=True)
train_cpy['max_power_bhp'] = power_parts[0].str[:-3].astype(float)
train_cpy['max_power_rpm'] = power_parts[1].str[:-3].astype(int)

# The combined text columns are no longer needed
train_cpy = train_cpy.drop(columns=['max_torque', 'max_power'])

In [None]:
# Apply the same feature engineering to the test data.

# Ordinal-encode the two type columns. .map returns a numeric column directly;
# .replace with a str -> int mapping relies on pandas' deprecated silent
# downcasting and may leave an object column that get_dummies would later
# one-hot encode again.
test_cpy['transmission_type'] = test_cpy['transmission_type'].map({'Manual': 1, 'Automatic': 2})
test_cpy['steering_type'] = test_cpy['steering_type'].map({'Manual': 1, 'Power': 2, 'Electric': 3})
# .map turns categories missing from the mappings into NaN - fail early and clearly.
assert test_cpy[['transmission_type', 'steering_type']].notna().all().all(), \
    'unexpected category in transmission_type / steering_type'

# Split max_torque at '@' into its two numeric parts.
# Assumes values of the form '<value>Nm@<value>rpm' (TODO confirm against the data):
# the slices strip a 2-character and a 3-character unit suffix.
torque_parts = test_cpy['max_torque'].astype(str).str.split('@', expand=True)
test_cpy['max_torque_Nm'] = torque_parts[0].str[:-2].astype(float)
test_cpy['max_torque_rpm'] = torque_parts[1].str[:-3].astype(int)

# Split max_power at '@' in the same way.
# Assumes values of the form '<value>bhp@<value>rpm' (TODO confirm against the data).
power_parts = test_cpy['max_power'].astype(str).str.split('@', expand=True)
test_cpy['max_power_bhp'] = power_parts[0].str[:-3].astype(float)
test_cpy['max_power_rpm'] = power_parts[1].str[:-3].astype(int)

# The combined text columns are no longer needed
test_cpy = test_cpy.drop(columns=['max_torque', 'max_power'])

In [None]:
# One-hot encode the remaining categorical columns.
# Encoding train and test independently can produce different dummy columns
# (a level missing from one file, or a different first level removed by
# drop_first). Giving both frames the same categorical dtype, with the levels
# observed in the training data, makes get_dummies emit identical columns for
# both. Test values not seen in training become NaN and get all-zero dummies.
categorical_cols = train_cpy.select_dtypes(exclude=['number', 'bool']).columns
for col in categorical_cols:
    shared_dtype = pd.CategoricalDtype(categories=pd.Categorical(train_cpy[col]).categories)
    train_cpy[col] = train_cpy[col].astype(shared_dtype)
    test_cpy[col] = test_cpy[col].astype(shared_dtype)

train_cpy = pd.get_dummies(train_cpy, drop_first=True)
test_cpy = pd.get_dummies(test_cpy, drop_first=True)

In [None]:
# Separate the is_claim label from the features of the training data
y = train_cpy['is_claim']
X = train_cpy.drop(columns=['is_claim'])

# Feature frame built from the test file, and the policy ids of its rows
X_test = test_cpy.copy()
policy_id = test_df['policy_id']

In [None]:
## SMOTE to handle imbalanced data

from collections import Counter

from imblearn.over_sampling import SMOTE

# sampling_strategy=0.8: oversample the minority class until it has 80 % as
# many samples as the majority class.
sm = SMOTE(random_state=12, sampling_strategy=0.8)

X_sm, y_sm = sm.fit_resample(X, y)

# Report the resampled shapes explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print('Resampled feature / label shapes {} {}'.format(X_sm.shape, y_sm.shape))
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(y_sm)))

## Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL data first and oversample only the training part.
# Resampling before the split leaks information: SMOTE interpolates between
# neighbouring minority samples, so synthetic training rows would be built
# from rows of the hold-out set, and the hold-out set itself would contain
# synthetic rows. The scores would then be over-optimistic.
# stratify=y keeps the class ratio identical in both parts.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the training part only; the hold-out set keeps the real class distribution
X_train, y_train = sm.fit_resample(X_train_raw, y_train_raw)

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score,f1_score

## Model Building
Takes some time, approx 1 - 2 minutes.

In [None]:
# Random forest classifier. The variable keeps the name `dt` because the
# following cells reference it.
dt = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    max_depth=12,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,  # build the 1000 trees on all CPU cores instead of one
)

# Train Model
dt.fit(X_train, y_train)

# Mean accuracy on the training split and on the hold-out split
train_accuracy = dt.score(X_train, y_train)
test_accuracy = dt.score(X_test, y_test)

print('Accuracy for Training set is')
print(100 * train_accuracy)
print('----------------------------------')
print('Accuracy for Testing set is')
print(100 * test_accuracy)

In [None]:
# Predicted labels for the hold-out split produced by train_test_split
pred = dt.predict(X_test)

## Model Evaluation

In [None]:
from sklearn.metrics import classification_report

# F1 score of the hold-out predictions, followed by the per-class report
holdout_f1 = f1_score(y_test, pred)
holdout_report = classification_report(y_test, pred)

print(holdout_f1)
print('\n')
print(holdout_report)

## Predict Test Data

In [None]:
# Predict the real test file. X_test was reassigned by train_test_split and now
# holds the hold-out split, so the features are rebuilt from the encoded test
# frame. reindex puts them in the column set and order of the training
# features X; dummy columns missing from the test frame are filled with 0.
X_submission = test_cpy.reindex(columns=X.columns, fill_value=0)
y_pred = dt.predict(X_submission)

In [None]:
# Display the array of predicted labels
y_pred
