# Insurance Prediction

In [110]:
# python libraries
import pandas as pd
import numpy as np
import plotly.express as px

# ml libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Silence all warnings for the rest of the session: the "ignore" action is registered
# without a category or module filter, so it applies to every warning raised afterwards.
import warnings
warnings.filterwarnings("ignore")


In [111]:
# Location of the travel-insurance CSV. NOTE: this is an absolute, machine-specific
# path; adjust it when running the notebook anywhere else.
DATA_PATH = "C:/Users/HAZAL/OneDrive/Masaüstü/Projeler/travel_insurance_prediction/TravelInsurancePrediction.csv"

# Load the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,0,31,Government Sector,Yes,400000,6,1,No,No,0
1,1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0


In [112]:
# Remove the leftover "Unnamed: 0" column, which carries no predictive information
# (in the preview above it simply repeats the row number).
# The result is rebound instead of dropping in place, and errors="ignore" makes the
# cell safe to re-run: a second execution no longer raises KeyError once the column is gone.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [113]:
# Count the missing values in each column before any rows are removed.
missing_counts = data.isnull().sum()

# Drop every row that contains a missing value. The recorded output shows zero
# missing values in all columns, so this is a no-op for this dataset.
data = data.dropna()

# A notebook only displays a cell's last expression, so the counts go last;
# previously they were computed mid-cell and silently discarded.
missing_counts

Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           0
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
TravelInsurance        0
dtype: int64

In [98]:
# Print a structural summary of the frame: index range, column names,
# non-null counts per column, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1987 non-null   int64 
 1   Employment Type      1987 non-null   object
 2   GraduateOrNot        1987 non-null   object
 3   AnnualIncome         1987 non-null   int64 
 4   FamilyMembers        1987 non-null   int64 
 5   ChronicDiseases      1987 non-null   int64 
 6   FrequentFlyer        1987 non-null   object
 7   EverTravelledAbroad  1987 non-null   object
 8   TravelInsurance      1987 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 139.8+ KB


In [99]:
# List the distinct values of every text (object-dtype) column so that we know
# which categories have to be encoded later on.
object_columns = data.select_dtypes(include='object').columns

for column_name in object_columns:
    distinct_values = data[column_name].unique()
    print(column_name)      # name of the object-typed column
    print(distinct_values)  # its unique values, in order of first appearance

Employment Type
['Government Sector' 'Private Sector/Self Employed']
GraduateOrNot
['Yes' 'No']
FrequentFlyer
['No' 'Yes']
EverTravelledAbroad
['No' 'Yes']


In [100]:
# Histogram of customer age, with bars coloured by whether travel insurance was bought,
# to see how age relates to the purchase decision.
figure = px.histogram(
    data,
    x="Age",
    color="TravelInsurance",
    title="Factors Affecting Purchase of Travel Insurance: Age",
)
figure.show()

In [101]:
# Histogram of employment type, with bars coloured by whether travel insurance was bought,
# to see how the kind of employment relates to the purchase decision.
figure = px.histogram(
    data,
    x="Employment Type",
    color="TravelInsurance",
    title="Factors Affecting Purchase of Travel Insurance: Employment Type",
)
figure.show()

In [102]:
# Histogram of annual income, with bars coloured by whether travel insurance was bought,
# to see how income relates to the purchase decision.
figure = px.histogram(
    data,
    x="AnnualIncome",
    color="TravelInsurance",
    title="Factors Affecting Purchase of Travel Insurance: Income",
)
figure.show()

In [114]:
# Encode the two-valued text columns as 0/1 integers so the model can consume them.
# This is binary (label) encoding, not one-hot encoding: each column is rewritten
# in place rather than expanded into one indicator column per category.
binary_encodings = {
    "GraduateOrNot": {"No": 0, "Yes": 1},
    "FrequentFlyer": {"No": 0, "Yes": 1},
    "EverTravelledAbroad": {"No": 0, "Yes": 1},
    "Employment Type": {"Government Sector": 0, "Private Sector/Self Employed": 1},
}

for column, encoding in binary_encodings.items():
    # Series.map turns every value that is not a key of the mapping into NaN, so
    # re-running this cell on an already-encoded column would wipe it out.
    # Columns that are already numeric are therefore left untouched.
    if pd.api.types.is_numeric_dtype(data[column]):
        continue
    data[column] = data[column].map(encoding)

In [115]:
# Split the frame into the target vector and the feature matrix, both as NumPy arrays.
features = data.drop(columns=["TravelInsurance"])  # every column except the target
x = np.array(features)                             # independent variables
y = np.array(data["TravelInsurance"])              # dependent variable (0/1 purchase flag)

In [117]:
# Hold out 10% of the rows as a test set; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=42)

# Standardize the features (zero mean, unit variance per column).
# The scaler is fitted on the training rows only, and those same statistics are then
# applied to the test rows. Refitting on the test set (fit_transform) would scale the
# test features differently from the data the model was trained on and would leak
# test-set information into preprocessing.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [119]:
# Train a shallow decision tree and predict on the held-out rows.
# max_depth=3 keeps the tree small; random_state pins the tie-breaking between equally
# good splits so that a fresh Restart & Run All reproduces the same tree
# (the seed matches the one used for the train/test split).
model = DecisionTreeClassifier(max_depth=3, random_state=42)
model.fit(x_train, y_train)          # learn the mapping from x_train to y_train
predictions = model.predict(x_test)  # predicted labels for the test features

In [107]:
# Evaluate the predictions against the true test labels.

# Overall fraction of test rows that were classified correctly.
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

# Per-class precision, recall, F1 score and support, plus the averaged figures.
report = classification_report(y_test, predictions)
print(f'Classification Report:\n{report}')


Accuracy: 0.8542713567839196
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.99      0.90       135
           1       0.97      0.56      0.71        64

    accuracy                           0.85       199
   macro avg       0.90      0.78      0.81       199
weighted avg       0.87      0.85      0.84       199

