So the aim here is to create a machine learning algorithm that the company can use to determine the clients that it should target in order to enroll them as new customers. We are therefore trying to classify whether a customer is likely to buy car insurance from the company or not and hence this is a binary classification task.

In [157]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd 
import seaborn as sns


train_df = pd.read_csv('C:/Users/lhaye/Documents/Genesys_Task_Dataset/carInsurance_train.csv')
test_df = pd.read_csv('C:/Users/lhaye/Documents/Genesys_Task_Dataset/carInsurance_test.csv')

print(train_df['Id'])

#LETS PUT THE COLUMNS INTO A LIST SO WE CAN SEE WHICH ONES CORRESPOND TO FEATURES AND WHICH TO 
columns = train_df.columns.tolist()
feature_columns = columns[:-1]
output_column = columns[-1:]
num_train_samples = train_df.shape[0]

print(columns)

#Okay so lets do some analysis of the data 
#print(train_df.shape)

#We can see from this that we have many categorical features which we will have to transform into numeric features
#print(train_df.info)

#Lets also check the types and we can see that the object type represents a categorical feature which we will have to change
#print(train_df.dtypes)

#Any preprocessing we do should be done in a function because we will have to do the same for the test set 
#We need to check for and deal with the following if they are present:
#1 Missing data (na or nan)
#2 Duplicate data - samples that are present twice - we can just remove one if they are identical 
#3 Modify categorical features
#4 Outliers - values that are very high for example we can see that there has been 12 previous attempts to contact one customer
#5 Create some new features from the data
#6 Remove irrelevant features/columns 
 

#Lets get the preprocessed data and then make alterations to it based on the model score 
def preprocess_data(data):
    
    #1 Missing data 
    #print(data.isnull().sum())
    #We can see that the columns with missing data is present where we have categorical data so when we change these 
    #to numeric data we can simply add in a value to represent NA - This should sort this issue 
    
    #2 Duplicate data
    duplicate_rows_df = data[data.duplicated()]
    #print('number of duplicate rows: ', duplicate_rows_df.shape)
    #We can see we have zero duplicated rows of data 
    
    
    #3 Modify all data that is Na with a word to avoid difficulties 
    data['Education'] = data['Education'].fillna("NoVal")
    data['Job'] = data['Job'].fillna("NoVal")
    data['Communication'] = data['Communication'].fillna("NoVal")
    data['Outcome'] = data['Outcome'].fillna("NoVal")

    #We tranlsate the categorical labels into numeric labels 
    ord_enc = OrdinalEncoder()
    #print(data)
    data["Education"] = ord_enc.fit_transform(data[["Education"]])
    data["Marital"] = ord_enc.fit_transform(data[["Marital"]])
    data["Job"] = ord_enc.fit_transform(data[["Job"]])
    data["Outcome"] = ord_enc.fit_transform(data[["Outcome"]])
    data["Communication"] = ord_enc.fit_transform(data[["Communication"]])

    #Create a feature for the duration of the call 
    data['CallStart'] = pd.to_datetime(data["CallStart"])
    data['CallEnd'] = pd.to_datetime(data["CallEnd"])

    data['CallStart'] = (data['CallStart'].dt.hour*60+data['CallStart'].dt.minute)*60 + data['CallStart'].dt.second
    data['CallEnd'] = (data['CallEnd'].dt.hour*60+data['CallEnd'].dt.minute)*60 + data['CallEnd'].dt.second
    
    data['CallDuration'] = data['CallEnd'] - data['CallStart']
    #Now we have the feature and some of the values are quite large so perhaps they should be normalised 
    
    #Now we should drop some features that we dont need 
    data = data.drop(['Id', 'LastContactDay', 'LastContactMonth', 'CallStart', 'CallEnd'], axis=1)
    
    #Now we need to normalise/put into smaller values the following columns
    # Balance
    # DaysPassed
    # NoOfContacts
    
    #NORMALIZATION WE CAN TAKE THIS OUT DEPENDENT ON WHAT ALGORITHM WE USE 
    # Maybe because the max is so infrequent we should do a different form of normalization but we can see later
    data['Balance'] = (data['Balance'] - data['Balance'].min()) / (data['Balance'].max() - data['Balance'].min())
    data['NoOfContacts'] = (data['NoOfContacts'] - data['NoOfContacts'].min()) / (data['NoOfContacts'].max() - data['NoOfContacts'].min())
    data['Age'] = (data['Age'] - data['Age'].min()) / (data['Age'].max() - data['Age'].min())
    data['DaysPassed'] = (data['DaysPassed'] - data['DaysPassed'].min()) / (data['DaysPassed'].max() - data['DaysPassed'].min())
    data['PrevAttempts'] = (data['PrevAttempts'] - data['PrevAttempts'].min()) / (data['PrevAttempts'].max() - data['PrevAttempts'].min())
    data['CallDuration'] = (data['CallDuration'] - data['CallDuration'].min()) / (data['CallDuration'].max() - data['CallDuration'].min())
    data['Job'] = (data['Job'] - data['Job'].min()) / (data['Job'].max() - data['Job'].min())
    data['Education'] = (data['Education'] - data['Education'].min()) / (data['Education'].max() - data['Education'].min())
    data['Marital'] = (data['Marital'] - data['Marital'].min()) / (data['Marital'].max() - data['Marital'].min())
    data['Communication'] = (data['Communication'] - data['Communication'].min()) / (data['Communication'].max() - data['Communication'].min())

    print(len(data))
    
    return data
    #Okay so now we have the features we need so we can return them and see how it would go on a dataset
    
    


0          1
1          2
2          3
3          4
4          5
        ... 
3995    3996
3996    3997
3997    3998
3998    3999
3999    4000
Name: Id, Length: 4000, dtype: int64
['Id', 'Age', 'Job', 'Marital', 'Education', 'Default', 'Balance', 'HHInsurance', 'CarLoan', 'Communication', 'LastContactDay', 'LastContactMonth', 'NoOfContacts', 'DaysPassed', 'PrevAttempts', 'Outcome', 'CallStart', 'CallEnd', 'CarInsurance']


In [158]:
x_train_processed = preprocess_data(train_df)
print(len(output_column))
y_train_vals = train_df[output_column]
x_train_processed = x_train_processed.drop(labels='CarInsurance', axis=1)
print(y_train)


print(x_train_processed)

4000
1
      CarInsurance
3215             0
3126             0
697              0
3613             0
2374             1
...            ...
1130             0
1294             0
860              1
3507             1
3174             0

[2800 rows x 1 columns]
           Age       Job  Marital  Education  Default   Balance  HHInsurance  \
0     0.181818  0.454545      1.0   1.000000        0  0.042138            1   
1     0.181818  0.181818      0.5   0.333333        0  0.041527            1   
2     0.142857  0.454545      1.0   1.000000        0  0.036413            1   
3     0.090909  0.818182      1.0   0.333333        0  0.033811            1   
4     0.155844  0.454545      0.5   1.000000        0  0.056684            0   
...        ...       ...      ...        ...      ...       ...          ...   
3995  0.129870  0.909091      1.0   1.000000        0  0.030136            1   
3996  0.402597  0.090909      0.0   0.666667        0  0.031357            1   
3997  0.116883  0.09

In [159]:
#create a dataframe with the train and test 
#train_df = pd.concat([x_train, y_train], axis=1)

print(x_train_processed)
print(len(y_train))

#Now we should create a train and validation split of the data and do some predictions
X_train, X_test, y_train, y_test = train_test_split(x_train_processed, y_train_vals, test_size=0.3, random_state=42)

print(X_train)
print(y_train)

           Age       Job  Marital  Education  Default   Balance  HHInsurance  \
0     0.181818  0.454545      1.0   1.000000        0  0.042138            1   
1     0.181818  0.181818      0.5   0.333333        0  0.041527            1   
2     0.142857  0.454545      1.0   1.000000        0  0.036413            1   
3     0.090909  0.818182      1.0   0.333333        0  0.033811            1   
4     0.155844  0.454545      0.5   1.000000        0  0.056684            0   
...        ...       ...      ...        ...      ...       ...          ...   
3995  0.129870  0.909091      1.0   1.000000        0  0.030136            1   
3996  0.402597  0.090909      0.0   0.666667        0  0.031357            1   
3997  0.116883  0.090909      1.0   0.666667        0  0.026194            0   
3998  0.233766  0.272727      1.0   1.000000        0  0.036620            1   
3999  0.350649  0.727273      0.5   0.333333        0  0.031486            1   

      CarLoan  Communication  NoOfConta

In [255]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

#Now we can run an algorithm and see 
#start with Logistic regression

LRmodel = LogisticRegression()
LRmodel = LRmodel.fit(X_train, y_train)

SVMmodelLinear = svm.SVC(kernel='linear') # Linear Kernel
SVMmodelLinear.fit(X_train, y_train)

SVMmodelRbf= svm.SVC(kernel='rbf') # Linear Kernel
SVMmodelRbf.fit(X_train, y_train)

SVMmodelNuSVC = svm.NuSVC(gamma='auto')
SVMmodelNuSVC.fit(X_train, y_train)

RFModel = RandomForestClassifier(n_estimators = 150, random_state=1)
RFModel.fit(X_train, y_train)

DTModel = DecisionTreeClassifier(max_depth=1)
DTModel.fit(X_train, y_train)

KNNModel = KNeighborsClassifier(n_neighbors=5)
KNNModel.fit(X_train, y_train)

#Implement a boosting algorithm
boosting = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=1, random_state=1)
boosting.fit(X_train, y_train)

#Ensemble with 'hard' vote i.e. majority rules
ensemble = VotingClassifier( estimators=[('lr', SVMmodelLinear), ('boost', boosting), ('rf', RFModel)], voting = 'hard')
ensemble.fit(X_train, y_train)

#Implement a stacking algorithm 


#Need to have very good understanding of trees, random forest and SVM 




  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  RFModel.fit(X_train, y_train)
  KNNModel.fit(X_train, y_train)
  return f(**kwargs)
  return f(**kwargs)


VotingClassifier(estimators=[('lr', SVC(kernel='linear')),
                             ('boost',
                              GradientBoostingClassifier(learning_rate=1,
                                                         max_depth=1,
                                                         random_state=1)),
                             ('rf',
                              RandomForestClassifier(n_estimators=150,
                                                     random_state=1))])

In [256]:
#Lets use the logistic regression algorithm and get a result
LRpredictions = LRmodel.predict(X_test)
LRscore = accuracy_score(y_test, LRpredictions)
LRPrecision = metrics.precision_score(y_test, LRpredictions)

#Now lets use SVM and do the same 
SVMpredictionsLinear = SVMmodelLinear.predict(X_test)
SVMscoreLinear = accuracy_score(y_test, SVMpredictionsLinear)
SVMPrecisionLinear = metrics.precision_score(y_test, SVMpredictionsLinear)

SVMpredictionsRbf = SVMmodelRbf.predict(X_test)
SVMscoreRbf = accuracy_score(y_test, SVMpredictionsRbf)
SVMPrecisionRbf = metrics.precision_score(y_test, SVMpredictionsRbf)

SVMpredictionsNuSVC = SVMmodelNuSVC.predict(X_test)
SVMscoreNuSVC = accuracy_score(y_test, SVMpredictionsNuSVC)
SVMPrecisionNuSVC = metrics.precision_score(y_test, SVMpredictionsNuSVC)

RFpredictions = RFModel.predict(X_test)
RFscore = accuracy_score(y_test, RFpredictions)
RFPrecision = metrics.precision_score(y_test, RFpredictions)

DTpredictions = DTModel.predict(X_test)
DTscore = accuracy_score(y_test, DTpredictions)
DTPrecision = metrics.precision_score(y_test, DTpredictions)

KNNpredictions = KNNModel.predict(X_test)
KNNscore = accuracy_score(y_test, KNNpredictions)
KNNPrecision = metrics.precision_score(y_test, KNNpredictions)

Ensemblepredictions = ensemble.predict(X_test)
Ensemblescore = accuracy_score(y_test, Ensemblepredictions)
EnsemblePrecision = metrics.precision_score(y_test, Ensemblepredictions)

Boostingpredictions = boosting.predict(X_test)
Boostingscore = accuracy_score(y_test, Boostingpredictions)
BoostingPrecision = metrics.precision_score(y_test, Boostingpredictions)

print("Logistic Regression Accuracy:              ", LRscore)
print("SVM Linear Accuracy:                       ", SVMscoreLinear)
print("SVM SVC Accuracy:                          ", SVMscoreRbf)
print("SVM NuSVC Accuracy:                        ", SVMscoreNuSVC)
print("Random Forrest Accuracy:                   ", RFscore)
print("Decision Tree Accuracy:                    ", DTscore)
print("K-Nearest Neighbours Accuracy:             ", KNNscore)
print("Ensemble Accuracy:                         ", Ensemblescore)
print("Boosting Accuracy:                         ", Boostingscore)

print("")
print("Logistic Regression Precision:            ", LRPrecision)
print("SVM Precision Linear:                     ", SVMPrecisionLinear)
print("SVM Precision SVC:                        ", SVMPrecisionRbf)
print("SVM Precision NuSVC:                      ", SVMPrecisionNuSVC)
print("Random Forest Precision:                  ", RFPrecision)
print("Decision Tree Precision:                  ", DTPrecision)
print("K-Nearest Neighbours Precision:           ", KNNPrecision)
print("Ensemble Precision:                       ", EnsemblePrecision)
print("Boosting Precision:                       ", BoostingPrecision)


Logistic Regression Accuracy:               0.7933333333333333
SVM Linear Accuracy:                        0.8
SVM SVC Accuracy:                           0.8016666666666666
SVM NuSVC Accuracy:                         0.8075
Random Forrest Accuracy:                    0.8333333333333334
Decision Tree Accuracy:                     0.7583333333333333
K-Nearest Neighbours Accuracy:              0.7433333333333333
Ensemble Accuracy:                          0.8233333333333334
Boosting Accuracy:                          0.8158333333333333

Logistic Regression Precision:             0.7927461139896373
SVM Precision Linear:                      0.798469387755102
SVM Precision SVC:                         0.797979797979798
SVM Precision NuSVC:                       0.8091603053435115
Random Forest Precision:                   0.7854166666666667
Decision Tree Precision:                   0.7459893048128342
K-Nearest Neighbours Precision:            0.7034313725490197
Ensemble Precision:        