In [1]:
%matplotlib notebook

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
import matplotlib.pyplot as plt

In [5]:
import seaborn as sns

In [6]:
from sklearn.preprocessing import OrdinalEncoder

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
from sklearn.linear_model import LinearRegression

In [9]:
import sklearn.metrics as metrics

In [10]:
from sklearn.neighbors import KNeighborsRegressor

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
from sklearn.metrics import accuracy_score

In [13]:
insurance = pd.read_csv('travel_insurance.csv')

In [14]:
insurance.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,,41


In [15]:
insurance['Agency'].unique()

array(['CBH', 'CWT', 'JZI', 'KML', 'EPX', 'C2B', 'JWT', 'RAB', 'SSI',
       'ART', 'CSR', 'CCR', 'ADM', 'LWC', 'TTW', 'TST'], dtype=object)

In [16]:
insurance['Agency Type'].unique()

array(['Travel Agency', 'Airlines'], dtype=object)

In [17]:
insurance['Distribution Channel'].unique()

array(['Offline', 'Online'], dtype=object)

In [18]:
insurance['Product Name'].unique()

array(['Comprehensive Plan', 'Rental Vehicle Excess Insurance',
       'Value Plan', 'Basic Plan', 'Premier Plan',
       '2 way Comprehensive Plan', 'Bronze Plan', 'Silver Plan',
       'Annual Silver Plan', 'Cancellation Plan',
       '1 way Comprehensive Plan', 'Ticket Protector', '24 Protect',
       'Gold Plan', 'Annual Gold Plan',
       'Single Trip Travel Protect Silver',
       'Individual Comprehensive Plan',
       'Spouse or Parents Comprehensive Plan',
       'Annual Travel Protect Silver',
       'Single Trip Travel Protect Platinum',
       'Annual Travel Protect Gold', 'Single Trip Travel Protect Gold',
       'Annual Travel Protect Platinum', 'Child Comprehensive Plan',
       'Travel Cruise Protect', 'Travel Cruise Protect Family'],
      dtype=object)

In [19]:
insurance['Claim'].unique()

array(['No', 'Yes'], dtype=object)

In [20]:
insurance['Duration'].unique()

array([ 186,   65,   60,   79,   66,   47,   63,   57,   33,    1,   53,
          5,   39,    6,   48,   11,    3,   14,  136,   12,    7,  190,
        364,   29,   28,  153,    4,   54,   24,    9,   45,   35,    8,
        183,   36,   38,   13,   27,   16,   19,   18,  189,  105,   23,
         15,  180,   90,   91,    2,   17,   10,  279,   92,   22,   64,
         37,   31,   41,  126,   50,   55,  181,   76,   43,   56,   20,
        164,   26,  152,   30,   32,  111,   34,  201,   62,   81,   42,
         49,  124,  118,   52,   59,   73,   21,   25,   94,   46,   82,
         40,  130,  388,  369,  368,  114,   85,  133,  103,  110,  147,
        306,   75,   83,   70,  104,  131,  202,  179,   61,  365,  374,
        386,   86,  100,  244,   99,  108,  277,  107,   87,  276,  123,
        122,  148,  278,  204,  112,   78,   97,  142,  351,   68,  163,
        197,   95,  125,   51,   69,   44,   71,   72,  178,   80,  150,
         74,  171,  127,  160,  168,  158,  325,  1

In [21]:
insurance['Net Sales'].unique()

array([-29.  , -49.5 , -39.6 , ...,   1.74, 388.8 ,  11.58])

In [22]:
insurance['Commision (in value)'].unique()

array([ 9.57, 29.7 , 23.76, ..., 21.63, 97.2 ,  3.25])

In [23]:
insurance['Gender'].unique()

array(['F', nan, 'M'], dtype=object)

In [24]:
insurance['Age'].unique()

array([ 81,  71,  32,  41,  44,  29,  37, 118,  47,  48,  64,  36,  53,
        43,  58,  25,  34,  26,  30,  33,  35,  31,  61,  20,  46,  49,
        50,  62,  65,  24,  40,  21,  66,  57,  45,  52,  60,  27,  23,
        39,  59,  28,  67,  38,  72,  51,  55,  54,  69,  22,  78,  42,
        70,  68,  77,  63,  56,  79,  76,  16,  14,  73,  18,  19,  74,
        85,  84,  13,  75,  87,  80,  83,  12,  10,   8,  17,  15,   9,
        11,  86,   3,  82,   1,   5,  88,   2,   4,   0,   7])

In [25]:
insurance.describe()

Unnamed: 0,Duration,Net Sales,Commision (in value),Age
count,63326.0,63326.0,63326.0,63326.0
mean,49.317074,40.702018,9.809992,39.969981
std,101.791566,48.845637,19.804388,14.01701
min,-2.0,-389.0,0.0,0.0
25%,9.0,18.0,0.0,35.0
50%,22.0,26.53,0.0,36.0
75%,53.0,48.0,11.55,43.0
max,4881.0,810.0,283.5,118.0


### Step 01 - Clean Up NaN Values 

In [26]:
insurance.loc[insurance['Gender'].isna()]

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.50,29.70,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.60,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.80,11.88,,41
6,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,47,UNITED STATES,-39.60,23.76,,32
7,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,63,AUSTRALIA,-108.90,65.34,,29
...,...,...,...,...,...,...,...,...,...,...,...
63309,SSI,Airlines,Online,Ticket Protector,No,41,SINGAPORE,11.58,3.25,,48
63310,SSI,Airlines,Online,Ticket Protector,No,69,SINGAPORE,12.87,3.61,,48
63311,SSI,Airlines,Online,Ticket Protector,No,96,SINGAPORE,5.92,1.66,,48
63312,SSI,Airlines,Online,Ticket Protector,No,61,SINGAPORE,6.56,1.84,,48


In [27]:
insurance.loc[insurance['Gender'] == 'M'].count()

Agency                  9347
Agency Type             9347
Distribution Channel    9347
Product Name            9347
Claim                   9347
Duration                9347
Destination             9347
Net Sales               9347
Commision (in value)    9347
Gender                  9347
Age                     9347
dtype: int64

In [28]:
insurance.loc[insurance['Gender'] == 'F'].count()

Agency                  8872
Agency Type             8872
Distribution Channel    8872
Product Name            8872
Claim                   8872
Duration                8872
Destination             8872
Net Sales               8872
Commision (in value)    8872
Gender                  8872
Age                     8872
dtype: int64

In [29]:
insurance.count()

Agency                  63326
Agency Type             63326
Distribution Channel    63326
Product Name            63326
Claim                   63326
Duration                63326
Destination             63326
Net Sales               63326
Commision (in value)    63326
Gender                  18219
Age                     63326
dtype: int64

In [30]:
insurance['Gender'] = insurance['Gender'].fillna('M')

In [31]:
insurance.loc[insurance['Gender'].isna()]

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age


In [32]:
insurance.loc[insurance['Gender'] == 'F']

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.00,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.00,9.57,F,71
5,JZI,Airlines,Online,Value Plan,No,66,UNITED STATES,-121.00,42.35,F,44
12,KML,Travel Agency,Online,Premier Plan,No,53,NORWAY,-130.00,49.40,F,48
21,C2B,Airlines,Online,Bronze Plan,No,12,SINGAPORE,46.15,11.54,F,44
...,...,...,...,...,...,...,...,...,...,...,...
63315,JZI,Airlines,Online,Basic Plan,No,42,AUSTRALIA,22.00,7.70,F,25
63316,JZI,Airlines,Online,Basic Plan,No,42,AUSTRALIA,22.00,7.70,F,25
63317,JZI,Airlines,Online,Basic Plan,No,42,AUSTRALIA,22.00,7.70,F,25
63322,JZI,Airlines,Online,Basic Plan,No,58,CHINA,40.00,14.00,F,40


In [33]:
insurance.loc[insurance['Gender'] == 'M']

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.70,M,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,M,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,M,41
6,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,47,UNITED STATES,-39.6,23.76,M,32
7,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,63,AUSTRALIA,-108.9,65.34,M,29
...,...,...,...,...,...,...,...,...,...,...,...
63319,JZI,Airlines,Online,Basic Plan,No,10,CHINA,35.0,12.25,M,51
63320,JZI,Airlines,Online,Basic Plan,No,5,BRUNEI DARUSSALAM,18.0,6.30,M,27
63321,JZI,Airlines,Online,Basic Plan,No,111,JAPAN,35.0,12.25,M,31
63323,JZI,Airlines,Online,Basic Plan,No,2,MALAYSIA,18.0,6.30,M,57


In [34]:
insurance.describe()

Unnamed: 0,Duration,Net Sales,Commision (in value),Age
count,63326.0,63326.0,63326.0,63326.0
mean,49.317074,40.702018,9.809992,39.969981
std,101.791566,48.845637,19.804388,14.01701
min,-2.0,-389.0,0.0,0.0
25%,9.0,18.0,0.0,35.0
50%,22.0,26.53,0.0,36.0
75%,53.0,48.0,11.55,43.0
max,4881.0,810.0,283.5,118.0


### Step 02 - Encode Categorical Values

In [35]:
ord_enc = OrdinalEncoder()

In [36]:
insurance['Agency_code'] = ord_enc.fit_transform(insurance[['Agency']])

In [37]:
insurance[['Agency_code', 'Agency']].head(5)

Unnamed: 0,Agency_code,Agency
0,3.0,CBH
1,3.0,CBH
2,6.0,CWT
3,6.0,CWT
4,6.0,CWT


In [38]:
insurance['Agency Type_code'] = ord_enc.fit_transform(insurance[['Agency Type']])

In [39]:
insurance[['Agency Type_code', 'Agency Type']].head(6)

Unnamed: 0,Agency Type_code,Agency Type
0,1.0,Travel Agency
1,1.0,Travel Agency
2,1.0,Travel Agency
3,1.0,Travel Agency
4,1.0,Travel Agency
5,0.0,Airlines


In [40]:
insurance['Distribution Channel_code'] = ord_enc.fit_transform(insurance[['Distribution Channel']])

In [41]:
insurance[['Distribution Channel_code', 'Distribution Channel']].head(5)

Unnamed: 0,Distribution Channel_code,Distribution Channel
0,0.0,Offline
1,0.0,Offline
2,1.0,Online
3,1.0,Online
4,1.0,Online


In [42]:
insurance['Product Name_code'] = ord_enc.fit_transform(insurance[['Product Name']])

In [43]:
insurance[['Product Name_code', 'Product Name']].head(5)

Unnamed: 0,Product Name_code,Product Name
0,12.0,Comprehensive Plan
1,12.0,Comprehensive Plan
2,16.0,Rental Vehicle Excess Insurance
3,16.0,Rental Vehicle Excess Insurance
4,16.0,Rental Vehicle Excess Insurance


In [44]:
insurance['Claim_code'] = ord_enc.fit_transform(insurance[['Claim']])

In [45]:
insurance[['Claim_code', 'Claim']].head(25)

Unnamed: 0,Claim_code,Claim
0,0.0,No
1,0.0,No
2,0.0,No
3,0.0,No
4,0.0,No
5,0.0,No
6,0.0,No
7,0.0,No
8,0.0,No
9,0.0,No


In [46]:
insurance['Destination_code'] = ord_enc.fit_transform(insurance[['Destination']])

In [47]:
insurance[['Destination_code', 'Destination']].head(5)

Unnamed: 0,Destination_code,Destination
0,79.0,MALAYSIA
1,79.0,MALAYSIA
2,4.0,AUSTRALIA
3,4.0,AUSTRALIA
4,61.0,ITALY


In [48]:
insurance['Gender_code'] = ord_enc.fit_transform(insurance[['Gender']])

In [49]:
insurance[['Gender_code', 'Gender']].head(5)

Unnamed: 0,Gender_code,Gender
0,0.0,F
1,0.0,F
2,1.0,M
3,1.0,M
4,1.0,M


In [50]:
insurance.describe()

Unnamed: 0,Duration,Net Sales,Commision (in value),Age,Agency_code,Agency Type_code,Distribution Channel_code,Product Name_code,Claim_code,Destination_code,Gender_code
count,63326.0,63326.0,63326.0,63326.0,63326.0,63326.0,63326.0,63326.0,63326.0,63326.0,63326.0
mean,49.317074,40.702018,9.809992,39.969981,6.654913,0.724331,0.982519,9.292029,0.014639,85.257682,0.8599
std,101.791566,48.845637,19.804388,14.01701,2.371884,0.446854,0.131056,6.642143,0.120102,42.031838,0.347094
min,-2.0,-389.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.0,18.0,0.0,35.0,6.0,0.0,1.0,1.0,0.0,56.0,1.0
50%,22.0,26.53,0.0,36.0,7.0,1.0,1.0,10.0,0.0,88.0,1.0
75%,53.0,48.0,11.55,43.0,7.0,1.0,1.0,14.0,0.0,118.0,1.0
max,4881.0,810.0,283.5,118.0,15.0,1.0,1.0,25.0,1.0,148.0,1.0


In [51]:
insurance = insurance.drop(insurance[insurance['Duration'] > insurance['Duration'].mean() + (insurance['Duration'].std()*3)].index)

In [52]:
insurance = insurance.drop(insurance[insurance['Net Sales'] > insurance['Net Sales'].mean() + (insurance['Net Sales'].std()*3)].index)

In [53]:
insurance = insurance.drop(insurance[insurance['Commision (in value)'] > insurance['Commision (in value)'].mean() + (insurance['Commision (in value)'].std()*3)].index)

In [54]:
insurance = insurance.drop(insurance[insurance['Age'] > insurance['Age'].mean() + (insurance['Age'].std()*3)].index)

In [55]:
insurance = insurance.drop(insurance[insurance['Destination_code'] > insurance['Destination_code'].mean() + (insurance['Destination_code'].std()*3)].index)

### Step 03 - Create and Train the Logistic Regression Model

In [56]:
X = insurance.drop(['Agency', 'Agency Type', 'Distribution Channel', 'Product Name', 'Claim', 'Destination', 'Gender'], axis=1)

In [57]:
y = insurance['Claim_code']

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [59]:
lr_model = LogisticRegression(C=1e6, max_iter=1000)

In [60]:
lr_model.fit(X_train, y_train)

In [61]:
y_pred_lg = lr_model.predict(X_train)

In [62]:
lr_model.score(X_test, y_test)

1.0

In [63]:
mse_lr = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_lg)

In [64]:
mse_lr

0.0

In [65]:
accuracy_lr = accuracy_score(y_true=y_train, y_pred=y_pred_lg)

In [66]:
accuracy_lr

1.0

In [67]:
selection = 'Net Sales'

In [68]:
plt.scatter(X_test[selection], y_test, color='b', label='Test Set')
m, b = np.polyfit(X_train[selection], y_pred_lg, 1)
plt.plot(X_train[selection], m*X_train[selection]+b, color='r')
plt.legend()
plt.show()

<IPython.core.display.Javascript object>