### Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
%matplotlib inline
plt.style.use('seaborn')
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import warnings

In [2]:
warnings.filterwarnings('ignore') 

### Get data

In [3]:
## define routes
route = '/home/dsc/Downloads/Car_Insurance/'

In [4]:
## We create 2 dataframes , 1 of train and 1 of test
Data_train=pd.read_csv(route+'carInsurance_train.xls')
Data_test=pd.read_csv(route+'carInsurance_test.xls')

### Explore data

In [5]:
Data_train.shape

(4000, 19)

In [6]:
Data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                4000 non-null   int64 
 1   Age               4000 non-null   int64 
 2   Job               3981 non-null   object
 3   Marital           4000 non-null   object
 4   Education         3831 non-null   object
 5   Default           4000 non-null   int64 
 6   Balance           4000 non-null   int64 
 7   HHInsurance       4000 non-null   int64 
 8   CarLoan           4000 non-null   int64 
 9   Communication     3098 non-null   object
 10  LastContactDay    4000 non-null   int64 
 11  LastContactMonth  4000 non-null   object
 12  NoOfContacts      4000 non-null   int64 
 13  DaysPassed        4000 non-null   int64 
 14  PrevAttempts      4000 non-null   int64 
 15  Outcome           958 non-null    object
 16  CallStart         4000 non-null   object
 17  CallEnd       

In [7]:
### we see the column Outcome has more nulls than the rest, so, as it is no important, we are going to delete it
del Data_train['Outcome']

In [8]:
#### drop the rows with at least 1 null
Data_train=Data_train.dropna()

In [9]:
Data_train.shape

(2961, 18)

In [10]:
Data_train.head()

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,CallStart,CallEnd,CarInsurance
0,1,32,management,single,tertiary,0,1218,1,0,telephone,28,jan,2,-1,0,13:45:20,13:46:30,0
2,3,29,management,single,tertiary,0,637,1,0,cellular,3,jun,1,119,1,16:30:24,16:36:04,1
3,4,25,student,single,primary,0,373,1,0,cellular,11,may,2,-1,0,12:06:43,12:20:22,1
4,5,30,management,married,tertiary,0,2694,0,0,cellular,3,jun,1,-1,0,14:35:44,14:38:56,0
5,6,32,technician,single,tertiary,0,1625,0,0,cellular,22,may,1,109,1,14:58:08,15:11:24,1


In [11]:
## We will create some columns that could be useful for the prediction model
Data_train['Start']=(Data_train['CallStart'].str[0:2]+Data_train['CallStart'].str[3:5]+Data_train['CallStart'].str[6:8]).apply(int)
Data_train['End']=(Data_train['CallEnd'].str[0:2]+Data_train['CallEnd'].str[3:5]+Data_train['CallEnd'].str[6:8]).apply(int)
Data_train['duration']=Data_train['End']-Data_train['Start']

In [12]:
Data_train[['Start','End','duration']].head()

Unnamed: 0,Start,End,duration
0,134520,134630,110
2,163024,163604,580
3,120643,122022,1379
4,143544,143856,312
5,145808,151124,5316


In [13]:
## We separate the features (independant variables) and the target (dependant variable)
X,y = Data_train[['Id', 'Age', 'Default', 'Balance',
       'HHInsurance', 'CarLoan', 'LastContactDay',
                  'NoOfContacts', 'DaysPassed', 'PrevAttempts','duration']],Data_train[['CarInsurance']]

In [14]:
### The first thing to do is to see if the target is balanced to train properlry
Data_train['CarInsurance'].value_counts()

0    1605
1    1356
Name: CarInsurance, dtype: int64

In [15]:
from sklearn.linear_model import LogisticRegression

## create an instance of the classificatiton model LogisticRegression
clf = LogisticRegression()
## Train (fit) the model with data
clf.fit(X,y)

LogisticRegression()

In [16]:
from sklearn.model_selection import cross_val_score

## Test if the model works well
print (f"The accuracy % for this model is {cross_val_score(clf,X,y,scoring='accuracy').mean()*100}")

The accuracy % for this model is 66.63227519256188


In [17]:
### now create the variables for test data

Data_test['Start']=(Data_test['CallStart'].str[0:2]+Data_test['CallStart'].str[3:5]+Data_test['CallStart'].str[6:8]).apply(int)
Data_test['End']=(Data_test['CallEnd'].str[0:2]+Data_test['CallEnd'].str[3:5]+Data_test['CallEnd'].str[6:8]).apply(int)
Data_test['duration']=Data_test['End']-Data_test['Start']

In [18]:
Data_test=Data_test[['Id', 'Age', 'Default', 'Balance',
       'HHInsurance', 'CarLoan', 'LastContactDay',
                  'NoOfContacts', 'DaysPassed', 'PrevAttempts','duration']]

In [22]:
#predictions
clf.predict(Data_test)

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,

### End