# Travel Insurance predictions

## Import libraries

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Prepare the dataset

In [89]:
# Loading the dataset
DATA_DIR = '../_data'
FILE_NAME = 'travel_insurance.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
insurance = pd.read_csv(data_path, index_col=None)

# Creating creating binary features
# insurance['HasClaim'] = (insurance['Claim'] == 'Yes').astype('int')

## Explore the data structure

In [90]:
insurance.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,,41


In [91]:
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63326 entries, 0 to 63325
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Agency                63326 non-null  object 
 1   Agency Type           63326 non-null  object 
 2   Distribution Channel  63326 non-null  object 
 3   Product Name          63326 non-null  object 
 4   Claim                 63326 non-null  object 
 5   Duration              63326 non-null  int64  
 6   Destination           63326 non-null  object 
 7   Net Sales             63326 non-null  float64
 8   Commision (in value)  63326 non-null  float64
 9   Gender                18219 non-null  object 
 10  Age                   63326 non-null  int64  
dtypes: float64(2), int64(2), object(7)
memory usage: 5.3+ MB


In [92]:
insurance.describe()

Unnamed: 0,Duration,Net Sales,Commision (in value),Age
count,63326.0,63326.0,63326.0,63326.0
mean,49.317074,40.702018,9.809992,39.969981
std,101.791566,48.845637,19.804388,14.01701
min,-2.0,-389.0,0.0,0.0
25%,9.0,18.0,0.0,35.0
50%,22.0,26.53,0.0,36.0
75%,53.0,48.0,11.55,43.0
max,4881.0,810.0,283.5,118.0


### Check for missing values

In [93]:
insurance.isnull().sum()

Agency                      0
Agency Type                 0
Distribution Channel        0
Product Name                0
Claim                       0
Duration                    0
Destination                 0
Net Sales                   0
Commision (in value)        0
Gender                  45107
Age                         0
dtype: int64

#### Handle missing values

In [94]:
insurance.fillna(value={'Gender': 'Unknown'}, inplace=True)  # Replace missing values in 'Gender' column with 'Unknown'

#### Encode categorical variables

In [95]:
from sklearn.preprocessing import LabelEncoder

# Creating creating binary features
insurance['Claim'] = (insurance['Claim'] == 'Yes').astype('int')

label_encoder = LabelEncoder()
insurance['Gender'] = label_encoder.fit_transform(insurance['Gender'])  # Encode 'Gender' column as numerical

insurance.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,-29.0,9.57,0,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,-29.0,9.57,0,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,65,AUSTRALIA,-49.5,29.7,2,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,60,AUSTRALIA,-39.6,23.76,2,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,79,ITALY,-19.8,11.88,2,41


## Prepare data for modeling

### Split the datset 

In [96]:
# Perform one-hot encoding on categorical columns
categorical_cols = ['Agency', 'Agency Type', 'Distribution Channel', 'Product Name', 'Destination']
insurance_encoded = pd.get_dummies(insurance, columns=categorical_cols)

# Split the data into features (X - input variables) and target (y)
# X = insurance.drop('Claim', axis=1)
# y = insurance['Claim']
X = insurance_encoded.drop('Claim', axis=1)
y = insurance_encoded['Claim']

from sklearn.model_selection import train_test_split
# 80-20 split
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

## Classification Model

### Random Forest

In [99]:
# fit random forest model
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_train)

# look at the training accuracy score
from sklearn.metrics import accuracy_score
accuracy_rf = accuracy_score(y_true=y_train, y_pred=y_pred_rf)
print('Accuracy:', accuracy_rf)

Accuracy: 0.9983418870904066
