In [48]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV,StratifiedKFold, train_test_split
from collections import Counter
from sklearn.impute import KNNImputer


#Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay,accuracy_score,cohen_kappa_score,log_loss,roc_auc_score,roc_curve
#Pipeline to assemble several steps that can be cross-validated together while setting different parameters
from sklearn.pipeline import Pipeline

In [49]:
# Load the vehicle coupon dataset from the Kaggle input directory
df = pd.read_csv("../input/vdataset/vehiclecoupondataset.csv")

# Summarise the dataset dimensions; one column (presumably the target) is
# left out of the feature count
row_count, column_count = df.shape
print('Number of observed instances : ', row_count)
print('Number of Features :', column_count - 1)
print('Dataset Info : ')
df.info()

In [50]:
#List of all the available features
print("Features and Target :\n", df.columns.values)
df.head()

In [51]:
#statistical summary of numerical data
df.describe().T

# Exploratory Data Analysis

In [52]:
#Bar plot to check target balance
target_share = df['Y'].value_counts(normalize=True)
df['Y'].value_counts().plot(kind='bar')
# Look both shares up by class label instead of by position: value_counts()
# is sorted by frequency, so position 0 is simply the most frequent class.
# Assumes Y is coded 1 = accepted, 0 = declined -- TODO confirm.
print(' Acceptance percentage : ', target_share.get(1, 0.0)*100,'%','\n','Denial percentage     : ',target_share.get(0, 0.0)*100,'%')


In [53]:
#histograms of the numeric columns to check for skewness in the data
df.hist(bins=25,figsize=(12,10))

In [54]:
# distribution of coupons by coupon type
# (no print(): wrapping the call only echoes the Axes repr as plain text)
sns.countplot(y='coupon',data=df, palette = "Set3")


In [55]:
sns.countplot(y='coupon',hue='Y',data=df, palette = "Set3")


Although Coffee House issued the most coupons, it attained only about a 50\% conversion rate. By comparison, restaurants with an average expense per person of less than \$20 every month issued fewer coupons than Coffee House, and carry-out \& take-away issued about 2500 coupons; these have about a 90\% acceptance rate.


In [56]:
#acceptance of coupon between male and female
pd.crosstab(df['gender'], df['Y']).plot(kind = 'bar', title='Coupon Acceptance by gender', color=['#96CAC1', '#F6F6BC'])

In [57]:
# Number of records per destination. The counts are computed once, and the
# `palette` argument is dropped because seaborn ignores it when no `hue` is set.
destination_counts = df['destination'].value_counts()
sns.lineplot(x=destination_counts.index, y=destination_counts.values)


From the above line graph, it can be seen that people who have no urgent place to go have a high probability of accepting the coupon, whereas people who are on their way home or to work have an acceptance rate of about 50\%.

In [58]:
#coupon acceptance against destination
sns.countplot(x='destination',hue='Y',data=df,palette = "Set3")


In [59]:
#number of records per age group (this plots counts, not acceptance)
# `palette` is dropped because seaborn ignores it when no `hue` is set.
age_counts = df['age'].value_counts()
sns.lineplot(x=age_counts.values, y=age_counts.index)


In [60]:
# line graph of the number of records per occupation (counts, not acceptance)
# `palette` is dropped because seaborn ignores it when no `hue` is set.
occupation_counts = df['occupation'].value_counts()
sns.lineplot(x=occupation_counts.values, y=occupation_counts.index)


The two line graphs above illustrate that people in their 20s and people who are students or unemployed have accepted the most coupons among all age groups and professions.
Most customers of all ages accept carry-out, take-away, and restaurant (20 or under) coupons. While customers under the age of 21 are more likely to accept Coffee House coupons, customers in their 20s are more likely to accept bar coupons, and older customers are less likely to accept bar coupons.

Bar chart of coupon acceptance versus how often the customer visits a bar

In [61]:
ax = sns.catplot(x='Y',col='Bar',col_wrap=2,kind='count',data=df,height=2.8,aspect=1.3, palette = 'Set3')


In [62]:
# Left: record counts per age group split by coupon type.
# Right: mean of Y (acceptance ratio) per age group split by coupon type.
figure,axes = plt.subplots(1, 2, figsize=(16, 6))
# Columns are passed by keyword: newer seaborn releases no longer accept the
# data vector as the first positional argument.
sns.countplot(x='age', hue='coupon', data=df, ax=axes[0], palette = 'Set3')
sns.barplot(ax=axes[1], x="age", y="Y", hue='coupon', ci=None, data=df, palette = 'Set3')
axes[0].set_title('age vs coupon (count)')
axes[1].set_title('age vs coupon (Acceptance ratio)')

In [63]:
def df_group(col, data=None):
    """Count rows per (``col``, ``Y``) pair and express them as a share of ``col``'s total.

    Parameters
    ----------
    col : str
        Name of the column to group by.
    data : pandas.DataFrame, optional
        Frame containing ``col`` and ``Y``. Defaults to the notebook-level
        ``df`` so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per (``col``, ``Y``) combination with the columns ``col``,
        ``Y``, ``customers`` (rows in that combination), ``total_customer``
        (rows for that ``col`` value) and ``percentage`` (share of
        ``customers`` in ``total_customer``, rounded to 2 decimals).
    """
    frame = df if data is None else data
    # Count rows directly rather than counting an unrelated helper column.
    dfg = frame.groupby([col, 'Y']).size().reset_index(name='customers')
    dfg['total_customer'] = dfg[col].map(frame.groupby(col).size())
    dfg['percentage'] = (dfg['customers'] / dfg['total_customer'] * 100).round(2)
    return dfg

def df_insights(param):
    """Draw, save and show a grouped bar chart of customer counts split by ``Y``.

    Parameters
    ----------
    param : sequence
        ``param[0]``: two lists of text labels drawn in the centre of the
        bars of the first and second hue group;
        ``param[1]``: ``[x_column, y_column]`` of the ``df_group`` result;
        ``param[2]``: ``[x_axis_label, y_axis_label]``;
        ``param[3]``: figure title;
        ``param[4]``: column handed to ``df_group`` for aggregation.

    The figure is written to ``<param[4]>.png`` so each chart gets its own
    file instead of all of them overwriting ``destination.png``.
    """
    grouped = df_group(param[4])

    fig, ax = plt.subplots(figsize=(10, 6))
    plt.title(param[3], fontsize=10, color='black', weight='bold')
    sns.barplot(x=param[1][0], y=param[1][1], data=grouped, hue='Y', edgecolor='black', palette='Set3')

    plt.xlabel(param[2][0], fontsize=11)
    plt.ylabel(param[2][1], fontsize=11)

    # One bar container per hue level: the bar height is drawn above each
    # bar and the supplied percentage text in its centre.
    for container, centre_labels in zip(ax.containers[:2], param[0][:2]):
        plt.bar_label(container, padding=2)
        plt.bar_label(container, centre_labels, label_type='center', color='white', weight='bold')

    sns.despine()

    plt.savefig(f'{param[4]}.png')
    plt.show()

In [64]:
# One entry per chart, in the order df_insights reads it:
# [centre labels per hue group, [x column, y column], [x label, y label], title, grouping column]
params = [
    [
        [['49.37%', '36.62%', '49.78%'], ['50.63%', '63.38%', '50.22%']],
        ['destination', 'customers'],
        ['Destination', 'Total Customer'],
        'Drivers Tend to Receive Coupons\nWhen Driving Has No Urgent Purpose',
        'destination',
    ],
    [
        [['49.78%', '39.16%', '33.85%', '41.55%', '49.15%'], ['50.22%', '60.84%', '66.15%', '58.45%', '50.85%']],
        ['time', 'customers'],
        ['Time', 'Total Customer'],
        '10 AM, 2 PM and 6 PM\nThe Time with the Highest Coupon Acceptance Percentage',
        'time',
    ],
    [
        [['37.44%', '50.40%'], ['62.56%', '49.60%']],
        ['expiration', 'customers'],
        ['expiration', 'Total Customer'],
        'Drivers Tend to Receive Coupons\nWhen Validity Period Within 1 Day',
        'expiration',
    ],
]
for i in params:
    df_insights(i)

In [65]:
# Correlation heatmap of the numeric columns. They are selected explicitly
# because DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
plt.figure(figsize = (22,12))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True,cmap="crest")
plt.savefig('ma.png')
plt.show()

In [66]:
#check number of null values in each columns
df.isnull().sum()

In [67]:
# Percentage of missing values, restricted to columns that have any
missing_per_column = df.isna().sum()
nan_count = missing_per_column[missing_per_column > 0].sort_values(ascending=False)
nan_count_perc = (nan_count * 100. / df.shape[0]).round(2)

# Horizontal bar chart of those percentages, each bar annotated with its value
nan_perc_barplot = sns.barplot(x=nan_count_perc, y=nan_count_perc.index)
plt.bar_label(nan_perc_barplot.containers[0], fmt='%.1f%%', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(0, 115)
plt.xlabel('Percentage (%)', fontsize=14)
plt.title('Percentage of missing values per column', fontsize=14)

In [68]:
# The 'car' column has 99.2% missing values, so it is excluded from modelling.
# NOTE: this call only displays the frame without 'car'. Its result is not
# assigned, so df still contains the column afterwards; 'car' is actually
# removed later, when the encoded frame is built.
df.drop(['car'], axis=1)


In [69]:
# Print the number of duplicated rows, drop them, then print the count again
# to confirm that none remain.
print(df.duplicated().sum())
df = df.drop_duplicates()
print(df.duplicated().sum())


In [70]:
#check variance of the column toCoupon_GEQ5min
# Computed on the single column: DataFrame.var() over the whole frame raises
# on the non-numeric columns in pandas >= 2.0.
print('toCoupon_GEQ5min column variance : ',df['toCoupon_GEQ5min'].var())


In [71]:
# Bin the raw age categories into wider groups (mapped to integers later,
# in the encoding cell).
age_bins = {'below21': '<21',
            '21': '21-30', '26': '21-30',
            '31': '31-40', '36': '31-40',
            '41': '41-50', '46': '41-50'}
binned_labels = set(age_bins.values()) | {'>50'}
# Labels that are already binned map to themselves, so re-running this cell
# no longer collapses every row into '>50'. Any other unrecognised value
# still falls into '>50', as before.
df['age'] = [a if a in binned_labels else age_bins.get(a, '>50') for a in df['age']]

In [72]:
# Encoding: ordered categories are mapped to integers, unordered ones are one-hot encoded
df_dummy = df.copy()

# The five visit-frequency columns share the same ordered scale
visit_frequency = {'never': 0, 'less1': 1, '1~3': 2, '4~8': 3, 'gt8': 4}
ordinal_maps = {
    'expiration': {'2h': 0, '1d' : 1},
    'gender': {'Male': 0, 'Female' : 1},
    'age': {'<21': 0, '21-30': 1, '31-40': 2, '41-50': 3, '>50': 4},
    'education': {'Some High School': 0, 'High School Graduate': 1, 'Some college - no degree': 2,
                  'Associates degree': 3, 'Bachelors degree': 4,
                  'Graduate degree (Masters or Doctorate)': 5},
    'Bar': visit_frequency,
    'CoffeeHouse': visit_frequency,
    'CarryAway': visit_frequency,
    'RestaurantLessThan20': visit_frequency,
    'Restaurant20To50': visit_frequency,
    'temperature': {30: 0, 55: 1, 80: 2},
    'income': {'Less than $12500':0, '$12500 - $24999':1, '$25000 - $37499':2, '$37500 - $49999':3,
               '$50000 - $62499':4, '$62500 - $74999':5, '$75000 - $87499':6, '$87500 - $99999':7,
               '$100000 or More':8},
    'time': {'7AM':0, '10AM':1, '2PM':2, '6PM':3, '10PM':4},
}
df_le = df.copy().replace(ordinal_maps)

# One-hot encode the unordered categorical columns
ohe_cat = ['destination', 'passanger', 'maritalStatus', 'occupation', 'coupon', 'weather']
df_ohe_cat = pd.get_dummies(df[ohe_cat], columns=ohe_cat)

# Join both encodings side by side, then remove the raw categorical columns
# (now replaced by their dummies) together with 'car' and 'direction_same'
df_encode = pd.concat([df_ohe_cat, df_le], axis = 1)
df_encode = df_encode.drop(columns=['car'] + ohe_cat + ['direction_same'])

In [73]:
#selection of imputation method: 'knn', 'mean', 'median' or 'mode'
imp = 'knn'

# Separate the target first so that it is never used to impute the features.
y = df_encode['Y']
x = df_encode.drop('Y', axis=1)

#Imputation
if imp == 'knn':
    # Keep the original index so x and y stay aligned row by row.
    x = pd.DataFrame(KNNImputer(n_neighbors=5).fit_transform(x), columns=x.columns, index=x.index)
elif imp == 'mean':
    x = x.fillna(x.mean())
elif imp == 'median':
    x = x.fillna(x.median())
elif imp == 'mode':
    # mode() returns a frame; filling with it aligns on the row index and
    # would only touch row 0. Its first row gives one fill value per column.
    x = x.fillna(x.mode().iloc[0])
else:
    raise ValueError(f"Unknown imputation method: {imp!r}")

# Keep df_encode in sync with the imputed features for any later use.
df_encode = x.assign(Y=y)

In [74]:
#Train/test split with 80% for training and 20% for testing (fixed random_state for a repeatable split).
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [75]:
# results prints a summary of classification metrics (precision, recall, log loss, accuracy) and plots the confusion matrix
def results(y_pred, y_proba=None):
    """Print classification metrics for ``y_pred`` against the notebook-level ``y_test``.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels for ``X_test``.
    y_proba : array-like, optional
        Predicted probabilities for ``X_test``. When given, the log loss is
        computed from them, which is what the metric is defined on. When
        omitted, the log loss is computed from the hard labels as before.

    Prints the classification report, log loss and accuracy, then draws the
    confusion matrix as a heatmap on the current axes.
    """
    print(classification_report(y_test, y_pred))
    # The explicit eps argument is gone: recent scikit-learn releases removed
    # it, and the value passed here was the historical default anyway.
    print('log loss : ', log_loss(y_test, y_pred if y_proba is None else y_proba))
    print('Model Accuracy : ',accuracy_score(y_test, y_pred))
    cf_matrix = confusion_matrix(y_test, y_pred)
    # fmt='d' prints the counts as integers instead of scientific notation.
    sns.heatmap(cf_matrix, annot=True, fmt='d')


In [76]:
# Logistic regression baseline. max_iter is raised above the default of 100
# because the features are not scaled and the solver may otherwise stop early.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
xgb_cl = log_reg  # previous (misleading) name, kept as an alias
results(log_reg.predict(X_test))
print('roc_auc_score :',roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1]))

Using a Random Forest classifier without hyperparameter tuning

In [77]:
# Random forest with default hyperparameters. RandomForestClassifier is
# already imported in the first cell; a fixed seed makes the run repeatable.
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train, y_train)
results( rf_clf.predict(X_test))
print('roc_auc_score :',roc_auc_score(y_test, rf_clf.predict_proba(X_test)[:, 1]))

In [78]:
# Feature importances of the untuned random forest, largest first
importances = rf_clf.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
bar_positions = range(X_train.shape[1])

f, ax = plt.subplots(figsize=(12,5))
ax.set_title('Feature Importance')
ax.bar(bar_positions, importances[sorted_indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(X_train.columns[sorted_indices], rotation=90)
f.tight_layout()
plt.show()

In [79]:
# random forest with hyperparameter tuning (10-fold stratified grid search)
param_grid = {
    'n_estimators': [100,200, 500],
    # 'auto' is left out: for classifiers it was an alias of 'sqrt' (so it
    # only duplicated work) and recent scikit-learn releases reject it.
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,None],
    'criterion' :['gini', 'entropy']
}
rfgs = GridSearchCV(estimator=RandomForestClassifier(random_state=0), param_grid=param_grid,
                    cv= StratifiedKFold(n_splits=10), n_jobs=-1)
rfgs.fit(X_train, y_train)

results(rfgs.predict(X_test))

 

In [80]:

rfgs.best_params_

In [81]:
#Random forest using the best parameters found by the grid search
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by recent scikit-learn releases.
rf_best_pram = RandomForestClassifier(criterion= 'gini',
                                     max_depth= None,
                                     max_features = 'sqrt',
                                     n_estimators = 100)
rf_best_pram.fit(X_train, y_train)

results(rf_best_pram.predict(X_test))

In [82]:
# Feature importances of the tuned random forest, largest first
importances = rf_best_pram.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
bar_positions = range(X_train.shape[1])

f, ax = plt.subplots(figsize=(12,5))
ax.set_title('Feature Importance')
ax.bar(bar_positions, importances[sorted_indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(X_train.columns[sorted_indices], rotation=90)
f.tight_layout()
plt.show()

In [83]:
# Log loss, AUC and ROC curve of the tuned random forest on the test set.
# Log loss is computed from the predicted probabilities rather than from the
# hard 0/1 labels, and without the eps argument that recent scikit-learn
# releases no longer accept.
y_pred_proba = rf_best_pram.predict_proba(X_test)[::,1]
print('log loss : ', log_loss(y_test, y_pred_proba))
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
print('AUC Score :', auc)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

In [85]:
    
def visualizeResults(best_model):
    """Plot the mean cross-validation score of every grid-search candidate by classifier.

    ``best_model`` is a fitted search object exposing ``cv_results_`` with a
    ``param_classifier`` column. Candidates are ordered by ``rank_test_score``;
    classifiers whose class name is not in the lookup below are skipped.
    """
    display_names = {
        'CatBoostClassifier': 'CatBoost',
        'RandomForestClassifier': 'RandomForest',
        'LogisticRegression': 'LogisticRegression',
    }

    # Sort the candidates by their rank after cross-validation
    result = pd.DataFrame(best_model.cv_results_).sort_values('rank_test_score')

    clfs = [
        display_names[type(estimator).__name__]
        for estimator in result['param_classifier']
        if type(estimator).__name__ in display_names
    ]

    # Comparison line graph of the classification models
    print(clfs)
    print('results',result['mean_test_score'])
    data_plot = pd.DataFrame({"Classifier": clfs,"Results":result['mean_test_score']})
    sns.lineplot(x = "Classifier", y = "Results", data=data_plot)
    plt.title('Classifiers comparative analysis')
    plt.show()
        
def visualizeResult(result, best_model):
    """Plot and print how many test rows were predicted for each class.

    Parameters
    ----------
    result : array-like
        Predicted class labels.
    best_model : object
        Unused; kept so existing calls keep working.

    The counts are looked up by class label (assumes 1 = accepted and
    0 = rejected -- TODO confirm) instead of by first-seen order, so the two
    printed lines cannot be swapped, and a class that was never predicted is
    reported as 0 instead of raising IndexError.
    """
    counts = Counter(pd.Series(result))
    # Sorted by label so the bar order does not depend on prediction order.
    summary = pd.DataFrame(sorted(counts.items()), columns=['Target', 'Count'])
    # A list (not a set) keeps the colour order deterministic.
    summary.plot.bar(x='Target', y='Count', rot=0, color=['#C3553A','#76A3B1'])
    print('\n')
    print('Number of accepted coupons : ', counts.get(1, 0))
    print('Number of Rejected coupons : ', counts.get(0, 0))
    print('\n')
        
def predict(best_model):
    """Predict on the notebook-level ``X_test`` and pass the predictions to ``visualizeResult``."""
    visualizeResult(best_model.predict(X_test), best_model)
        
def modelFinder(clfs,imp):
    """Grid-search the requested classifiers and report the best one on the test set.

    Parameters
    ----------
    clfs : list of str
        Any of ``'catboost'``, ``'randomforest'``, ``'logisticregression'``.
    imp : str
        Currently unused; kept so existing calls keep working.

    Fits a 10-fold stratified grid search on the notebook-level
    ``X_train``/``y_train``, then prints the test accuracy, classification
    report, log loss and ROC AUC, plots the confusion matrix, and calls
    ``visualizeResults`` and ``predict`` with the fitted search object.
    """
    # Candidate classifiers and their hyperparameter grids, keyed by name
    params = {
        'catboost': {
            "classifier": [CatBoostClassifier(verbose=False)],
            'classifier__learning_rate': [0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.3],
            'classifier__iterations': [10, 100, 500, 750, 1000],
            # Depths 20 and 25 were removed: CatBoost supports depths up to
            # 16, so those candidates could only fail.
            "classifier__max_depth": [3, 5, 7, 9, 10, 15],
        },
        'randomforest': {
            "classifier": [RandomForestClassifier()],
            'classifier__max_depth': (3, 5, 7, 9, 10, 15, 20, 25),
            'classifier__criterion': ('gini', 'entropy'),
            'classifier__min_samples_split': (2, 4, 6),
        },
        'logisticregression': {
            'classifier': [LogisticRegression()],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': np.logspace(-4, 4, 20),
            # The former un-prefixed 'solver' entry was removed: it is not a
            # parameter of the pipeline, and the solver is already set here.
            'classifier__solver': ['liblinear'],
        },
    }

    # The pipeline's single step is swapped for each candidate via the 'classifier' key
    pipe = Pipeline([("classifier", RandomForestClassifier())])
    grid_param = [params[i] for i in clfs]
    gridsearch = GridSearchCV(pipe, grid_param, cv= StratifiedKFold(n_splits=10), verbose=0,n_jobs=-1)
    best_model = gridsearch.fit(X_train,y_train)
    print("The mean accuracy of the model is:",best_model.score(X_test,y_test))
    print('\n')

    # Evaluate on the held-out test data
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]
    cm = confusion_matrix(y_test,y_pred)
    # fmt='d' prints the counts as integers instead of scientific notation
    ax = sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')

    ax.set_title('Confusion Matrix \n\n')
    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')

    ## Tick labels - list must be in alphabetical order
    ax.xaxis.set_ticklabels(['False','True'])
    ax.yaxis.set_ticklabels(['False','True'])

    ## Display the visualization of the Confusion Matrix.
    print('\n')
    plt.show()
    print(classification_report(y_test,y_pred))
    # Log loss is computed from probabilities, without the eps argument that
    # recent scikit-learn releases no longer accept.
    print('log loss : ', log_loss(y_test, y_proba))
    print('\n')
    print('roc_auc_score :',roc_auc_score(y_test, y_proba))
    print('\n')

    visualizeResults(best_model)
    predict(best_model)

In [None]:
# Compare the three candidate classifiers; the second argument names the imputation method.
models = ['catboost','logisticregression','randomforest']
modelFinder(models,'knn')

In [86]:
# CatBoost with default hyperparameters. verbose=False suppresses the
# per-iteration training log, matching the CatBoost setup in modelFinder.
cat_clf = CatBoostClassifier(verbose=False)
cat_clf.fit(X_train, y_train)
results(cat_clf.predict(X_test))

In [87]:
# Feature importance of the default CatBoost classifier
ff_feature_importance = cat_clf.get_feature_importance(prettified=True)

# Horizontal bar chart of the importances
importance_fig = plt.figure(figsize=(12, 6))
feature_plot = sns.barplot(x="Importances", y="Feature Id", data=ff_feature_importance, palette="cool")
importance_fig.set_size_inches(10, 10)

feature_plot.set_title('Feature importance');

In [88]:
# Log loss, AUC and ROC curve of the default CatBoost model on the test set.
# Log loss is computed from the predicted probabilities rather than from the
# hard labels, and without the eps argument that recent scikit-learn
# releases no longer accept.
y_pred_proba = cat_clf.predict_proba(X_test)[::,1]
print('log loss : ', log_loss(y_test, y_pred_proba))
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
print('AUC Score :', auc)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

In [None]:

# CatBoost hyperparameter tuning with a 10-fold stratified grid search
parameters = {
                 'depth'         : [4,5,6,7,8,9, 10],
                 'learning_rate' : [0.01,0.02,0.03,0.04],
                  'iterations'    : [10, 20,30,40,50,60,70,80,90, 100],
                 }

# verbose=False keeps the training log of every candidate fit out of the output.
cat_grid = GridSearchCV(estimator=CatBoostClassifier(verbose=False), param_grid=parameters, cv= StratifiedKFold(n_splits=10))
cat_grid = cat_grid.fit(X_train, y_train)
results(cat_grid.predict(X_test))

In [89]:
cat_grid.best_params_

In [90]:
# Log loss, AUC and ROC curve of the grid-searched CatBoost model on the test set.
# Log loss is computed from the predicted probabilities rather than from the
# hard labels, and without the eps argument that recent scikit-learn
# releases no longer accept.
y_pred_proba = cat_grid.predict_proba(X_test)[::,1]
print('log loss : ', log_loss(y_test, y_pred_proba))
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
print('AUC Score :', auc)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

In [92]:
# CatBoost refitted with fixed hyperparameters. verbose=False suppresses the
# per-iteration training log, matching the CatBoost setup in modelFinder.
cat_clf_best_param = CatBoostClassifier(depth= 10, iterations =  100, learning_rate= 0.04, verbose=False)
cat_clf_best_param.fit(X_train, y_train)
results(cat_clf_best_param.predict(X_test))

In [93]:
# Feature importance of the CatBoost classifier fitted with fixed hyperparameters
cat_clf_best_param_feature_importance = cat_clf_best_param.get_feature_importance(prettified=True)

# Horizontal bar chart of the importances
importance_fig = plt.figure(figsize=(12, 6))
feature_plot = sns.barplot(x="Importances", y="Feature Id", data=cat_clf_best_param_feature_importance, palette="cool")
importance_fig.set_size_inches(10, 10)

feature_plot.set_title('Feature importance');

In [94]:
# Log loss, AUC and ROC curve of the fixed-hyperparameter CatBoost model on the test set.
# Log loss is computed from the predicted probabilities rather than from the
# hard labels, and without the eps argument that recent scikit-learn
# releases no longer accept.
y_pred_proba = cat_clf_best_param.predict_proba(X_test)[::,1]
print('log loss : ', log_loss(y_test, y_pred_proba))
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
print('AUC Score :', auc)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()