In [1]:
import pandas as pd
import numpy as np

In [25]:
df = pd.read_csv("travel-insurance.csv") #read the file
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,0,31,Government Sector,Yes,400000,6,1,No,No,0
1,1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0


In [8]:
df.info() #check the missing values, data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           1987 non-null   int64 
 1   Age                  1987 non-null   int64 
 2   Employment Type      1987 non-null   object
 3   GraduateOrNot        1987 non-null   object
 4   AnnualIncome         1987 non-null   int64 
 5   FamilyMembers        1987 non-null   int64 
 6   ChronicDiseases      1987 non-null   int64 
 7   FrequentFlyer        1987 non-null   object
 8   EverTravelledAbroad  1987 non-null   object
 9   TravelInsurance      1987 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 155.4+ KB


Model training

In [10]:
df_copy = df.copy()

In [24]:
# Get the list of columns (features) of the categorical (object, string) type.
cat_cols = [col for col in df.columns if df[col].dtypes == "O"]
# Convert data features to dummy varables
data = pd.get_dummies(data=df_copy, columns=cat_cols)
data

Unnamed: 0.1,Unnamed: 0,Age,AnnualIncome,FamilyMembers,ChronicDiseases,TravelInsurance,Employment Type_Government Sector,Employment Type_Private Sector/Self Employed,GraduateOrNot_No,GraduateOrNot_Yes,FrequentFlyer_No,FrequentFlyer_Yes,EverTravelledAbroad_No,EverTravelledAbroad_Yes
0,0,31,400000,6,1,0,1,0,0,1,1,0,1,0
1,1,31,1250000,7,0,0,0,1,0,1,1,0,1,0
2,2,34,500000,4,1,1,0,1,0,1,1,0,1,0
3,3,28,700000,3,1,0,0,1,0,1,1,0,1,0
4,4,28,700000,8,1,0,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1982,1982,33,1500000,4,0,1,0,1,0,1,0,1,0,1
1983,1983,28,1750000,5,1,0,0,1,0,1,1,0,0,1
1984,1984,28,1150000,6,1,0,0,1,0,1,1,0,1,0
1985,1985,34,1000000,6,0,1,0,1,0,1,0,1,0,1


In [42]:
feature_data = data.columns.tolist()     # Get the list of data features from column names
feature_data.remove("TravelInsurance")         # Remove the label column "TravelInsurance" from the data features
X = data[feature_data].values 

y = data.TravelInsurance.values  

In [43]:
X.shape

(1987, 13)

In [28]:
y.shape

(1987,)

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

Scenario 1: With StandardScaler

In [39]:
# Standardize the data using Standard scaler
from sklearn.preprocessing import StandardScaler
normalizer = StandardScaler()
X_normal_train = normalizer.fit_transform(X_train)     
X_normal_test = normalizer.transform(X_test) 

In [40]:
# Initialize and train Gaussian Naive Bayes model using X_normal_train (data features) and y_train (data label)
from sklearn.naive_bayes import GaussianNB
naive_model = GaussianNB()
naive_model.fit(X_normal_train, y_train)

GaussianNB()

In [41]:
# Impport libraries to calculate evaluation metrics: precision, recall, f1 score.
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Make prediction on the test data
predicted_label = naive_model.predict(X_normal_test)

# Calculate evaluation metrics by comparing the prediction with the data label y_test
print(precision_score(predicted_label, y_test))
print(recall_score(predicted_label, y_test))
print(f1_score(predicted_label, y_test))
print(classification_report(predicted_label, y_test))

0.5514018691588785
0.6941176470588235
0.6145833333333333
              precision    recall  f1-score   support

           0       0.86      0.78      0.82       427
           1       0.55      0.69      0.61       170

    accuracy                           0.75       597
   macro avg       0.71      0.73      0.72       597
weighted avg       0.78      0.75      0.76       597



Scenario 2: Without StandardScaler

In [33]:
naive_model = GaussianNB()
naive_model.fit(X_train, y_train)

GaussianNB()

In [34]:
predicted_label = naive_model.predict(X_test)

# Calculate evaluation metrics by comparing the prediction with the data label y_test
print(precision_score(predicted_label, y_test))
print(recall_score(predicted_label, y_test))
print(f1_score(predicted_label, y_test))
print(classification_report(predicted_label, y_test))

0.5373831775700935
0.7718120805369127
0.6336088154269972
              precision    recall  f1-score   support

           0       0.91      0.78      0.84       448
           1       0.54      0.77      0.63       149

    accuracy                           0.78       597
   macro avg       0.72      0.78      0.74       597
weighted avg       0.82      0.78      0.79       597

