So the aim here is to create a machine learning algorithm that the company can use to determine the clients that it should target in order to enroll them as new customers. We are therefore trying to classify whether a customer is likely to buy car insurance from the company or not and hence this is a binary classification task.

In [157]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd 
import seaborn as sns
import numpy as np

train_df = pd.read_csv('C:/Users/lhaye/Documents/Genesys_Task_Dataset/carInsurance_train.csv')
test_df = pd.read_csv('C:/Users/lhaye/Documents/Genesys_Task_Dataset/carInsurance_test.csv')

print(train_df['Id'])

#LETS PUT THE COLUMNS INTO A LIST SO WE CAN SEE WHICH ONES CORRESPOND TO FEATURES AND WHICH TO 
columns = train_df.columns.tolist()
feature_columns = columns[:-1]
output_column = columns[-1:]
num_train_samples = train_df.shape[0]

print(columns)

#Okay so lets do some analysis of the data 
#print(train_df.shape)

#We can see from this that we have many categorical features which we will have to transform into numeric features
#print(train_df.info)

#Lets also check the types and we can see that the object type represents a categorical feature which we will have to change
#print(train_df.dtypes)

#Any preprocessing we do should be done in a function because we will have to do the same for the test set 
#We need to check for and deal with the following if they are present:
#1 Missing data (na or nan)
#2 Duplicate data - samples that are present twice - we can just remove one if they are identical 
#3 Modify categorical features
#4 Outliers - values that are very high for example we can see that there has been 12 previous attempts to contact one customer
#5 Create some new features from the data
#6 Remove irrelevant features/columns 
 

#Lets get the preprocessed data and then make alterations to it based on the model score 
def preprocess_data(data):
    
    #1 Missing data 
    #print(data.isnull().sum())
    #We can see that the columns with missing data is present where we have categorical data so when we change these 
    #to numeric data we can simply add in a value to represent NA - This should sort this issue 
    
    #2 Duplicate data
    duplicate_rows_df = data[data.duplicated()]
    #print('number of duplicate rows: ', duplicate_rows_df.shape)
    #We can see we have zero duplicated rows of data 
    
    
    #3 Modify all data that is Na with a word to avoid difficulties 
    data['Education'] = data['Education'].fillna("NoVal")
    data['Job'] = data['Job'].fillna("NoVal")
    data['Communication'] = data['Communication'].fillna("NoVal")
    data['Outcome'] = data['Outcome'].fillna("NoVal")

    #We tranlsate the categorical labels into numeric labels 
    ord_enc = OrdinalEncoder()
    #print(data)
    data["Education"] = ord_enc.fit_transform(data[["Education"]])
    data["Marital"] = ord_enc.fit_transform(data[["Marital"]])
    data["Job"] = ord_enc.fit_transform(data[["Job"]])
    data["Outcome"] = ord_enc.fit_transform(data[["Outcome"]])
    data["Communication"] = ord_enc.fit_transform(data[["Communication"]])

    #Create a feature for the duration of the call 
    data['CallStart'] = pd.to_datetime(data["CallStart"])
    data['CallEnd'] = pd.to_datetime(data["CallEnd"])

    data['CallStart'] = (data['CallStart'].dt.hour*60+data['CallStart'].dt.minute)*60 + data['CallStart'].dt.second
    data['CallEnd'] = (data['CallEnd'].dt.hour*60+data['CallEnd'].dt.minute)*60 + data['CallEnd'].dt.second
    
    data['CallDuration'] = data['CallEnd'] - data['CallStart']
    #Now we have the feature and some of the values are quite large so perhaps they should be normalised 
    
    #Now we should drop some features that we dont need 
    data = data.drop(['Id', 'LastContactDay', 'LastContactMonth', 'CallStart', 'CallEnd'], axis=1)
    
    #Now we need to normalise/put into smaller values the following columns
    # Balance
    # DaysPassed
    # NoOfContacts
    
    #NORMALIZATION WE CAN TAKE THIS OUT DEPENDENT ON WHAT ALGORITHM WE USE 
    # Maybe because the max is so infrequent we should do a different form of normalization but we can see later
    data['Balance'] = (data['Balance'] - data['Balance'].min()) / (data['Balance'].max() - data['Balance'].min())
    data['NoOfContacts'] = (data['NoOfContacts'] - data['NoOfContacts'].min()) / (data['NoOfContacts'].max() - data['NoOfContacts'].min())
    data['Age'] = (data['Age'] - data['Age'].min()) / (data['Age'].max() - data['Age'].min())
    data['DaysPassed'] = (data['DaysPassed'] - data['DaysPassed'].min()) / (data['DaysPassed'].max() - data['DaysPassed'].min())
    data['PrevAttempts'] = (data['PrevAttempts'] - data['PrevAttempts'].min()) / (data['PrevAttempts'].max() - data['PrevAttempts'].min())
    data['CallDuration'] = (data['CallDuration'] - data['CallDuration'].min()) / (data['CallDuration'].max() - data['CallDuration'].min())
    data['Job'] = (data['Job'] - data['Job'].min()) / (data['Job'].max() - data['Job'].min())
    data['Education'] = (data['Education'] - data['Education'].min()) / (data['Education'].max() - data['Education'].min())
    data['Marital'] = (data['Marital'] - data['Marital'].min()) / (data['Marital'].max() - data['Marital'].min())
    data['Communication'] = (data['Communication'] - data['Communication'].min()) / (data['Communication'].max() - data['Communication'].min())

    print(len(data))
    
    return data
    #Okay so now we have the features we need so we can return them and see how it would go on a dataset
    
    


0          1
1          2
2          3
3          4
4          5
        ... 
3995    3996
3996    3997
3997    3998
3998    3999
3999    4000
Name: Id, Length: 4000, dtype: int64
['Id', 'Age', 'Job', 'Marital', 'Education', 'Default', 'Balance', 'HHInsurance', 'CarLoan', 'Communication', 'LastContactDay', 'LastContactMonth', 'NoOfContacts', 'DaysPassed', 'PrevAttempts', 'Outcome', 'CallStart', 'CallEnd', 'CarInsurance']


In [158]:
x_train_processed = preprocess_data(train_df)
print(len(output_column))
y_train_vals = train_df[output_column]
x_train_processed = x_train_processed.drop(labels='CarInsurance', axis=1)
print(y_train)


print(x_train_processed)

4000
1
      CarInsurance
3215             0
3126             0
697              0
3613             0
2374             1
...            ...
1130             0
1294             0
860              1
3507             1
3174             0

[2800 rows x 1 columns]
           Age       Job  Marital  Education  Default   Balance  HHInsurance  \
0     0.181818  0.454545      1.0   1.000000        0  0.042138            1   
1     0.181818  0.181818      0.5   0.333333        0  0.041527            1   
2     0.142857  0.454545      1.0   1.000000        0  0.036413            1   
3     0.090909  0.818182      1.0   0.333333        0  0.033811            1   
4     0.155844  0.454545      0.5   1.000000        0  0.056684            0   
...        ...       ...      ...        ...      ...       ...          ...   
3995  0.129870  0.909091      1.0   1.000000        0  0.030136            1   
3996  0.402597  0.090909      0.0   0.666667        0  0.031357            1   
3997  0.116883  0.09

In [285]:
#create a dataframe with the train and test 
#train_df = pd.concat([x_train, y_train], axis=1)

#print(x_train_processed)
#print(len(y_train))

#Now we should create a train and validation split of the data and do some predictions
X_train, X_test, y_train, y_test = train_test_split(x_train_processed, y_train_vals, test_size=0.2, random_state=42)

#print(X_train)
#print(y_train)

In [329]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, GridSearchCV
from sklearn import model_selection
import numpy as np


#Now lets run using k-fold cross validation to get the best 
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scoring = ['accuracy']

models = [
          ('Logistic Regression', LogisticRegression()), 
          ('Support Vector Machine Linear', svm.SVC(kernel='linear')),
          ('Support Vector Machine Rbf', svm.SVC(kernel='rbf')),
          ('Support Vector Machine NuSVC', svm.NuSVC(gamma='auto')), 
          ('Random Forest', RandomForestClassifier(n_estimators = 100, random_state=1)),
          ('Decision Tree', DecisionTreeClassifier(max_depth=1)),
          ('K-Nearest Neighbours', KNeighborsClassifier(n_neighbors=5)), 
          ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
          ('Naive Bayes', MultinomialNB()),
          ('Boosting', GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=1, random_state=1)), 
          ('An Ensemble', VotingClassifier(estimators=[('boost', boosting), ('SVM', SVMmodelNuSVC) , ('rf', RFModel)], voting = 'hard'))
        ]

results = {}

for name, model in models:

    kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=1)
    cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
    avg_accuracy = np.mean(cv_results['test_accuracy']) * 100 
    print(f"The {name} achieves a mean accuracy result of {avg_accuracy} over a 10-fold validation set")
    results[name] = avg_accuracy
    
    
top_3_models = sorted(results, key=results.get, reverse=True)[:3]

for i in range(len(top_3_models)):
    if(i == 0):
        print("The models with the top 3 results are: ")
        print(top_3_models[i])
    else:
        print(top_3_models[i])
        

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


The Logistic Regression achieves a mean accuracy result of 79.40625 over a 10-fold validation set


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


The Support Vector Machine Linear achieves a mean accuracy result of 79.8125 over a 10-fold validation set


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


The Support Vector Machine Rbf achieves a mean accuracy result of 79.18749999999999 over a 10-fold validation set


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


The Support Vector Machine NuSVC achieves a mean accuracy result of 80.3125 over a 10-fold validation set


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


The Random Forest achieves a mean accuracy result of 81.71875 over a 10-fold validation set
The Decision Tree achieves a mean accuracy result of 73.625 over a 10-fold validation set


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


The K-Nearest Neighbours achieves a mean accuracy result of 74.65625000000001 over a 10-fold validation set
The Linear Discriminant Analysis achieves a mean accuracy result of 78.90625 over a 10-fold validation set
The Naive Bayes achieves a mean accuracy result of 65.28124999999999 over a 10-fold validation set


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


The Boosting achieves a mean accuracy result of 82.09375 over a 10-fold validation set


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


The An Ensemble achieves a mean accuracy result of 79.875 over a 10-fold validation set
The models with the top 3 results are: 
Boosting
Random Forest
Support Vector Machine NuSVC


In [None]:
#Now lets take the top 3 and get the best parameters using grid search and then do some analysis of the scores of them
#('Boosting', GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=1, random_state=1)), 
#('Random Forest', RandomForestClassifier(n_estimators = 100, random_state=1)),
#('Support Vector Machine NuSVC', svm.NuSVC(gamma='auto')), 

#RFModel = RandomForestClassifier(random_state=1)

#param_grid  = { 
    #'n_estimators': [40, 60, 80, 100, 200],
    #'max_features': ['auto', 'sqrt', 'log2'],
    #'max_depth' : [1,2,3,4,5,6,7,8],
    #'criterion' :['gini', 'entropy']
#}

#CV_rfc = GridSearchCV(estimator=RFModel, param_grid=param_grid, cv= 5)
#CV_rfc.fit(X_train, y_train)
#print(CV_rfc.best_params_)


#Tomorrow 
#Wrap up model analysis and select the best 
#Then move on to the next part and get that finished tomorrow leaving 
#analysis of features and then some tidying up 

In [331]:
#{'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 60}
RFModel = RandomForestClassifier(criterion= 'entropy', max_depth= 8, max_features= 'auto', n_estimators= 60, random_state=1)

kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
cv_results = model_selection.cross_validate(RFModel, X_train, y_train, cv=kfold, scoring=scoring)

avg_accuracy = np.mean(cv_results['test_accuracy']) * 100 
print(avg_accuracy)


RFModel = RandomForestClassifier(n_estimators= 100, random_state=1)

kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
cv_results = model_selection.cross_validate(RFModel, X_train, y_train, cv=kfold, scoring=scoring)

avg_accuracy = np.mean(cv_results['test_accuracy']) * 100 
print(avg_accuracy)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


82.40624999999999


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


81.9375


In [291]:
    
        
#Now we can run an algorithm and see 
#start with Logistic regression

#Logistic Regression Model
#LRmodel = LRmodel.fit(X_train, y_train)
LRscores = cross_val_score(LRmodel, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

#Support Vector Model with Linear Kernel 
SVMmodelLinear.fit(X_train, y_train)
LRscores = cross_val_score(SVMmodelLinear, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

#Support Vector Model with RBF Kernel 
SVMmodelRbf.fit(X_train, y_train)

#Support Vector Model with SVC Kernel 
SVMmodelNuSVC.fit(X_train, y_train)

#Random Forest Model
RFModel.fit(X_train, y_train)

#Decision Tree Model 
DTModel.fit(X_train, y_train)

#K-nearest neighbours model
KNNModel.fit(X_train, y_train)

#Linear Discriminant Analysis
LDAModel.fit(X_train, y_train)

#Niave Bayes Model
NaiveBayesModel.fit(X_train, y_train)

#Implement a boosting algorithm
boosting.fit(X_train, y_train)

#Adaboost
Adaboost.fit(X_train, y_train)

#Ensemble with 'hard' vote i.e. majority rules
ensemble.fit(X_train, y_train)

#Implement a stacking algorithm 

#Lets use the logistic regression algorithm and get a result
LRpredictions = LRmodel.predict(X_test)
LRscore = accuracy_score(y_test, LRpredictions)
LRPrecision = metrics.precision_score(y_test, LRpredictions)

#Now lets use SVM and do the same 
SVMpredictionsLinear = SVMmodelLinear.predict(X_test)
SVMscoreLinear = accuracy_score(y_test, SVMpredictionsLinear)
SVMPrecisionLinear = metrics.precision_score(y_test, SVMpredictionsLinear)

SVMpredictionsRbf = SVMmodelRbf.predict(X_test)
SVMscoreRbf = accuracy_score(y_test, SVMpredictionsRbf)
SVMPrecisionRbf = metrics.precision_score(y_test, SVMpredictionsRbf)

SVMpredictionsNuSVC = SVMmodelNuSVC.predict(X_test)
SVMscoreNuSVC = accuracy_score(y_test, SVMpredictionsNuSVC)
SVMPrecisionNuSVC = metrics.precision_score(y_test, SVMpredictionsNuSVC)

RFpredictions = RFModel.predict(X_test)
RFscore = accuracy_score(y_test, RFpredictions)
RFPrecision = metrics.precision_score(y_test, RFpredictions)

DTpredictions = DTModel.predict(X_test)
DTscore = accuracy_score(y_test, DTpredictions)
DTPrecision = metrics.precision_score(y_test, DTpredictions)

KNNpredictions = KNNModel.predict(X_test)
KNNscore = accuracy_score(y_test, KNNpredictions)
KNNPrecision = metrics.precision_score(y_test, KNNpredictions)

LDApredictions = LDAModel.predict(X_test)
LDAscore = accuracy_score(y_test, LDApredictions)
LDAPrecision = metrics.precision_score(y_test, LDApredictions)

NBpredictions = NaiveBayesModel.predict(X_test)
NBscore = accuracy_score(y_test, NBpredictions)
NBPrecision = metrics.precision_score(y_test, NBpredictions)

Boostingpredictions = boosting.predict(X_test)
Boostingscore = accuracy_score(y_test, Boostingpredictions)
BoostingPrecision = metrics.precision_score(y_test, Boostingpredictions)

Adaboostpredictions = Adaboost.predict(X_test)
Adaboostscore = accuracy_score(y_test, Adaboostpredictions)
AdaboostPrecision = metrics.precision_score(y_test, Adaboostpredictions)

Ensemblepredictions = ensemble.predict(X_test)
Ensemblescore = accuracy_score(y_test, Ensemblepredictions)
EnsemblePrecision = metrics.precision_score(y_test, Ensemblepredictions)


#Tasks this evening 
#2) Grid Search for each with their important hyperparameters 
#4) Add proper means of algorithmic analysis such as 
# Area Under the Curve 
# Confusion Matrix
# F1 Score, Precision and Recall 


print("Logistic Regression Accuracy:              ", LRscore)
print("SVM Linear Accuracy:                       ", SVMscoreLinear)
print("SVM SVC Accuracy:                          ", SVMscoreRbf)
print("SVM NuSVC Accuracy:                        ", SVMscoreNuSVC)
print("Random Forrest Accuracy:                   ", RFscore)
print("Decision Tree Accuracy:                    ", DTscore)
print("K-Nearest Neighbours Accuracy:             ", KNNscore)
print("Naive Bayes Accuracy:                      ", NBscore)
print("Linear Discriminant Analysis Accuracy:     ", LDAscore)
print("Boosting Accuracy:                         ", Boostingscore)
print("Adaboost Accuracy:                         ", Adaboostscore)
print("Ensemble Accuracy:                         ", Ensemblescore)

print("")
print("Logistic Regression Precision:            ", LRPrecision)
print("SVM Precision Linear:                     ", SVMPrecisionLinear)
print("SVM Precision SVC:                        ", SVMPrecisionRbf)
print("SVM Precision NuSVC:                      ", SVMPrecisionNuSVC)
print("Random Forest Precision:                  ", RFPrecision)
print("Decision Tree Precision:                  ", DTPrecision)
print("Naive Bayes Precision:                    ", NBPrecision)
print("K-Nearest Neighbours Precision:           ", KNNPrecision)
print("Linear Discriminant Analysis Accuracy:    ", LDAPrecision)
print("Boosting Precision:                       ", BoostingPrecision)
print("Adaboost Precision:                       ", AdaboostPrecision)
print("Ensemble Precision:                       ", EnsemblePrecision)


#Take the best 1/2/3 algorithms and run to see check the ROC curves, f1 scores, precision and recall 
#I will choose the best algorithm then 


Logistic Regression Accuracy:               0.78125
SVM Linear Accuracy:                        0.795
SVM SVC Accuracy:                           0.8075
SVM NuSVC Accuracy:                         0.79625
Random Forrest Accuracy:                    0.82375
Decision Tree Accuracy:                     0.74875
K-Nearest Neighbours Accuracy:              0.73
Naive Bayes Accuracy:                       0.655
Linear Discriminant Analysis Accuracy:      0.77875
Boosting Accuracy:                          0.82125
Adaboost Accuracy:                          0.81125
Ensemble Accuracy:                          0.81875

Logistic Regression Precision:             0.7786561264822134
SVM Precision Linear:                      0.796875
SVM Precision SVC:                         0.8068181818181818
SVM Precision NuSVC:                       0.7976653696498055
Random Forest Precision:                   0.7760252365930599
Decision Tree Precision:                   0.7137546468401487
Naive Bayes Precision