In [32]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score

In [33]:
data = pd.read_csv('https://raw.githubusercontent.com/amankharwal/Website-data/master/TravelInsurancePrediction.csv')
print(data.head())

   Unnamed: 0  Age               Employment Type GraduateOrNot  AnnualIncome  \
0           0   31             Government Sector           Yes        400000   
1           1   31  Private Sector/Self Employed           Yes       1250000   
2           2   34  Private Sector/Self Employed           Yes        500000   
3           3   28  Private Sector/Self Employed           Yes        700000   
4           4   28  Private Sector/Self Employed           Yes        700000   

   FamilyMembers  ChronicDiseases FrequentFlyer EverTravelledAbroad  \
0              6                1            No                  No   
1              7                0            No                  No   
2              4                1            No                  No   
3              3                1            No                  No   
4              8                1           Yes                  No   

   TravelInsurance  
0                0  
1                0  
2                1  
3       

In [34]:
data.drop(columns=['Unnamed: 0'], inplace=True)
data.isnull().sum()

Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           0
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
TravelInsurance        0
dtype: int64

In [35]:
data['TravelInsurance']=data['TravelInsurance'].map({0:'Not Purchased', 1:'Purchased'})

In [36]:
fig = px.histogram(data, x='Age', color='TravelInsurance', title='How Age affects purchase of Travel Insurance')
fig.show()

In [37]:
fig=px.histogram(data, x='Employment Type', color='TravelInsurance', title='How Employment Type affects purchase of Travel Insurance')
fig.show()

In [38]:
fig=px.histogram(data, x='AnnualIncome', color='TravelInsurance', title='How Annual Income affects purchase of Travel Insurance')
fig.show()

In [39]:
data['GraduateOrNot']=data['GraduateOrNot'].map({'No':0, 'Yes':1})
data['FrequentFlyer']=data['FrequentFlyer'].map({'No':0, 'Yes':1})
data['EverTravelledAbroad']=data['EverTravelledAbroad'].map({'No':0, 'Yes':1})


In [40]:
X=np.array(data[['Age', 'GraduateOrNot', 'FrequentFlyer',  'EverTravelledAbroad', 'AnnualIncome', 'ChronicDiseases', 'FamilyMembers']])
Y=np.array(data['TravelInsurance'])
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
model=DecisionTreeClassifier()
model.fit(Xtrain, Ytrain)
predictions=model.predict(Xtest)


In [41]:
print(accuracy_score(Ytest, predictions))
print(balanced_accuracy_score(Ytest, predictions))
print(f1_score(predictions, Ytest, pos_label='Purchased'))

0.8140703517587939
0.7760024284571019
0.7109375


In [42]:
from sklearn.model_selection import GridSearchCV
param_grid={'criterion':['gini', 'entropy', 'log_loss'], 'splitter':['best', 'random'],  'max_depth':  [None, 2, 4, 6, 8, 10],
    'max_features': [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8],}
grid=GridSearchCV(estimator=model, param_grid=param_grid, cv=10)
grid.fit(Xtrain, Ytrain)



840 fits failed out of a total of 2520.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
840 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\henry\miniconda3\envs\jupyter_env\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\henry\miniconda3\envs\jupyter_env\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\henry\miniconda3\envs\jupyter_env\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'



One or more of the test scores are non-finite: [0.76527745 0.75898416 0.7621248

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [None, 2, 4, 6, 8, 10],
                         'max_features': [None, 'sqrt', 'log2', 0.2, 0.4, 0.6,
                                          0.8],
                         'splitter': ['best', 'random']})

In [43]:
print('Cross Validation Score:', grid.best_score_)
print('Best Parameters:', grid.best_params_)

Cross Validation Score: 0.8319918796274182
Best Parameters: {'criterion': 'gini', 'max_depth': 4, 'max_features': None, 'splitter': 'best'}


In [45]:
predictions=grid.predict(Xtest)
print(accuracy_score(predictions, Ytest))
print(balanced_accuracy_score(predictions, Ytest))
print(f1_score(predictions, Ytest, pos_label='Purchased'))

0.8417085427135679
0.8962657232704403
0.7149321266968326
