# Load the dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("travel-insurance.csv")
df = df.drop(columns=["Unnamed: 0"], axis=1)
df.head()

Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,Government Sector,Yes,400000,6,1,No,No,0
1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0


# Analyze the dataset

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1987 non-null   int64 
 1   Employment Type      1987 non-null   object
 2   GraduateOrNot        1987 non-null   object
 3   AnnualIncome         1987 non-null   int64 
 4   FamilyMembers        1987 non-null   int64 
 5   ChronicDiseases      1987 non-null   int64 
 6   FrequentFlyer        1987 non-null   object
 7   EverTravelledAbroad  1987 non-null   object
 8   TravelInsurance      1987 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 139.8+ KB


In [4]:
# Preprocess data, determine feature x and label y
feature_names = df.columns.tolist()     
feature_names.remove("TravelInsurance")       
feature_data = df[feature_names]         
target_data = df.TravelInsurance

In [5]:
# Label encode columns
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
feature_data['Employment Type'] = label_encoder.fit_transform(feature_data['Employment Type'])
feature_data['GraduateOrNot'] = label_encoder.fit_transform(feature_data['GraduateOrNot'])
feature_data['FrequentFlyer'] = label_encoder.fit_transform(feature_data['FrequentFlyer'])
feature_data['EverTravelledAbroad'] = label_encoder.fit_transform(feature_data['EverTravelledAbroad'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data['Employment Type'] = label_encoder.fit_transform(feature_data['Employment Type'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data['GraduateOrNot'] = label_encoder.fit_transform(feature_data['GraduateOrNot'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data['Frequ

In [6]:
feature_data

Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad
0,31,0,1,400000,6,1,0,0
1,31,1,1,1250000,7,0,0,0
2,34,1,1,500000,4,1,0,0
3,28,1,1,700000,3,1,0,0
4,28,1,1,700000,8,1,1,0
...,...,...,...,...,...,...,...,...
1982,33,1,1,1500000,4,0,1,1
1983,28,1,1,1750000,5,1,0,1
1984,28,1,1,1150000,6,1,0,0
1985,34,1,1,1000000,6,0,1,1


In [7]:
feature_data.shape

(1987, 8)

In [8]:
target_data.shape

(1987,)

# Split the dataset into training, validation and testing set

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature_data, target_data, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [10]:
from sklearn.preprocessing import StandardScaler
normalizer = StandardScaler()
X_normal_train = normalizer.fit_transform(X_train) 
X_normal_val = normalizer.transform(X_val)      
X_normal_test = normalizer.transform(X_test) 

# Train a Gaussian Naive Bayes model

In [11]:
# Initialize and train Gaussian Naive Bayes model using X_normal_train (data features) and y_train (data label)
from sklearn.naive_bayes import GaussianNB
naive_model = GaussianNB()
naive_model.fit(X_normal_train, y_train)

# Find the best set of hyperparameters

In [12]:
# Load the libraries
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from numpy import vstack, hstack, row_stack, concatenate

grid_search={"C":[0.01, 0.1, 1]} # Define the values of hyperparameter C we want to try
logmodel=LogisticRegression() # Initialize the logistic regression model
logmodel_cv=GridSearchCV(logmodel, grid_search, cv=5) # Set up GridSearchCV to find the best value of hyperparameter C, with 5-fold cross validation, i.e., cv=5.

# When using GridSearchCV, you do not need to have validation set.
# We only need to split the dataset into training and test set, then use the training set for GridSearchCV which will split into folds for training and validataion purpose.
X_normal_train_val, y_train_val = vstack((X_normal_train, X_normal_val)), concatenate((y_train, y_val))

 # Train the model using GridSearchCV
logmodel_cv.fit(X_normal_train_val, y_train_val)

In [13]:
logmodel_cv.best_params_ # Show the best value of C

{'C': 1}

In [14]:
logmodel_cv.best_score_ # Show the model performance with the best value of C

0.7671547328532032

In [15]:
# Now take the best hyperparameter (C=1) and test it on the test set to double check the accuracy
logmodel = LogisticRegression(C=logmodel_cv.best_params_['C'])               
logmodel.fit(X_normal_train, y_train)            
logmodel.score(X_normal_test, y_test)             # Test accuracy of the 'best' hyperparameter C=1

0.7788944723618091

# Train a Mixed Naive Bayes model

In [16]:
# Initialize categorical Naive Bayes model
from sklearn.naive_bayes import CategoricalNB
model = CategoricalNB()

# Train the model using X_train and y_train
model.fit(X_train, y_train)

validation_accuracy = model.score(X_val, y_val)
print("Validation Accuracy (Categorical NB):", validation_accuracy)

Validation Accuracy (Categorical NB): 0.7767295597484277


In [17]:
from mixed_naive_bayes import MixedNB

# Assuming the categorical features are at indices 1 and 2
categorical_features_indices = [1, 2]

# Encode categorical features
label_encoders = []
for feature_idx in categorical_features_indices:
    label_encoder = LabelEncoder()
    feature_column = feature_data.iloc[:, feature_idx]
    feature_column_encoded = label_encoder.fit_transform(feature_column)
    feature_data.iloc[:, feature_idx] = feature_column_encoded
    label_encoders.append(label_encoder)

# Use feature_data and target_data to train model
clf = MixedNB(categorical_features=categorical_features_indices)
clf.fit(feature_data.values, target_data)
predictions = clf.predict(feature_data.values)
print("Predictions:", predictions)
validation_accuracy = clf.score(X_val.values, y_val)
print("Validation Accuracy (Mixed NB):", validation_accuracy)


Predictions: [0 1 0 ... 0 0 0]
Validation Accuracy (Mixed NB): 0.7169811320754716


# Evaluate models

In [18]:
# Evaluate Gaussian Naive Bayes model performance
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
predicted_label = naive_model.predict(X_normal_test)
print(precision_score(predicted_label, y_test))
print(recall_score(predicted_label, y_test))
print(f1_score(predicted_label, y_test))
print(classification_report(predicted_label, y_test))

0.5379310344827586
0.7647058823529411
0.631578947368421
              precision    recall  f1-score   support

           0       0.91      0.77      0.83       296
           1       0.54      0.76      0.63       102

    accuracy                           0.77       398
   macro avg       0.72      0.77      0.73       398
weighted avg       0.81      0.77      0.78       398



- Precision (Customers who didn't buy travel insurance): This model correctly identified 91% of the customers who actually didn't buy the travel insurance package. So, when the model predicted a customer wouldn't buy the package, it was usually correct.
- Precision (Customers who bought travel insurance): However, only 54% of the customers predicted to buy the package actually did. This means there were more cases where the model mistakenly predicted a customer would buy the package when they didn't.
- Recall (Customers who didn't buy travel insurance): The model captured 77% of the customers who actually didn't buy the package. So, it correctly identified a good portion of these customers.
- Recall (Customers who bought travel insurance): For customers who did buy the package, the model captured 76% of them. This indicates that the model was fairly good at capturing those who did purchase the package.
- F1-score: The F1-scores for both scenarios were moderate, resulting in a weighted average of 0.78. This suggests overall **DECENT** performance, but there's room for improvement, especially in predicting purchases accurately

In [19]:
# Evaluate Categorical Naive Bayes model performance
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
predicted_label = model.predict(X_test)
print(precision_score(predicted_label, y_test))
print(recall_score(predicted_label, y_test))
print(f1_score(predicted_label, y_test))
print(classification_report(predicted_label, y_test))

0.4827586206896552
0.8974358974358975
0.6278026905829597
              precision    recall  f1-score   support

           0       0.97      0.77      0.86       320
           1       0.48      0.90      0.63        78

    accuracy                           0.79       398
   macro avg       0.73      0.83      0.74       398
weighted avg       0.87      0.79      0.81       398



- Precision (Customers who didn't buy travel insurance): The model achieved a high precision of 97% in identifying customers who didn't buy the travel insurance package. This indicates a low rate of false positives.
- Precision (Customers who bought travel insurance): However, only 48% of the customers predicted to buy the package actually did. This implies that there were quite a few cases where the model mistakenly predicted a purchase.
- Recall (Customers who didn't buy travel insurance): The model captured 77% of the customers who actually didn't buy the package. This indicates it was effective at identifying those customers.
- Recall (Customers who bought travel insurance): For customers who did buy the package, the model captured a high 90% of them. This suggests that the model was particularly good at capturing those who did purchase the package.
- F1-score: The F1-scores for both scenarios were moderate, resulting in a weighted average of 0.81. This indicates overall **GOOD** performance, especially in correctly identifying customers who bought the package.

In [20]:
# Evaluate Mixed Naive Bayes model performance
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
predicted_label = clf.predict(X_test)
print(precision_score(predicted_label, y_test))
print(recall_score(predicted_label, y_test))
print(f1_score(predicted_label, y_test))
print(classification_report(predicted_label, y_test))

0.5310344827586206
0.7264150943396226
0.6135458167330676
              precision    recall  f1-score   support

           0       0.89      0.77      0.82       292
           1       0.53      0.73      0.61       106

    accuracy                           0.76       398
   macro avg       0.71      0.75      0.72       398
weighted avg       0.79      0.76      0.77       398



- Precision (Customers who didn't buy travel insurance): This model correctly identified 89% of the customers who actually didn't buy the travel insurance package, showing a relatively high precision.
- Precision (Customers who bought travel insurance): However, only 53% of the customers predicted to buy the package actually did. This suggests there were cases where the model incorrectly predicted a purchase.
- Recall (Customers who didn't buy travel insurance): The model captured 77% of the customers who actually didn't buy the package, indicating it was effective at identifying them.
- Recall (Customers who bought travel insurance): For customers who did buy the package, the model captured 73% of them. This implies that the model missed some customers who actually purchased the package.
- F1-score: The F1-scores for both scenarios were moderate, resulting in a weighted average of 0.77. This indicates overall **MODERATE** performance, with room for improvement in accurately predicting both scenarios regarding the purchase of the travel insurance package.