In [3]:
import numpy as np #linear algebra
import pandas as pd #data processing
import matplotlib.pyplot as plt #data viz
import seaborn as sns #data viz
from sklearn.impute import SimpleImputer #imputes missing vals
from datetime import datetime 
from sklearn.preprocessing import StandardScaler, OneHotEncoder #preprocessing
from sklearn.compose import ColumnTransformer #preprocessing
from sklearn.decomposition import PCA #dimensionality reduction
from sklearn.cluster import DBSCAN #clustering
from sklearn.model_selection import train_test_split, GridSearchCV #data split, grid search
from imblearn.over_sampling import SMOTE #balance classes
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.svm import SVC #support vector machine
from sklearn.neighbors import KNeighborsClassifier #knn
from sklearn.naive_bayes import GaussianNB #bayes
from xgboost import XGBClassifier #gradient boosting tree
from sklearn.metrics import accuracy_score, recall_score #calculates accuracy, recall
from sklearn.ensemble import VotingClassifier#ensemble

In [4]:
#Read In Dataset
pd.set_option('display.max_columns', None)
customer_data = pd.read_csv('marketing_campaign.csv',
                           delimiter='\t', index_col='ID')
customer_data.head()

Unnamed: 0_level_0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0


In [24]:
#Feature Engineering
#Create 'Age' feature from customer's birth year
customer_data['Age'] = customer_data.Year_Birth.apply(lambda x: 2021 - int(x))

#Create 'Days_Since_Customer' feature from time the customer enrolled
customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
now = datetime.now()
customer_data['Days_Since_Customer'] = customer_data.Dt_Customer.apply(lambda x: (now - x).total_seconds()/ (60 * 60 * 24))

#Create 'Fam_Size' feature from the marriage status, number of kids/teens
marital_map = {'Absurd': 1, 'Alone': 1, 'YOLO': 1, 'Single': 1,
              'Married': 2, 'Together': 2, 'Widow': 1, 'Divorced': 1}
customer_data['Marital_Status'] = customer_data.Marital_Status.map(marital_map) #Maps all singles as 1, couples as 2
customer_data['Num_Kids'] = customer_data.Kidhome.values + customer_data.Teenhome.values
customer_data['Fam_Size'] = customer_data.Marital_Status.values + customer_data.Num_Kids.values

#Create 'Num_Accepted' feature from the sum of previous marketting campaigns that were accepted by the customer
customer_data['Num_Accepted'] = customer_data.AcceptedCmp1.values + customer_data.AcceptedCmp2.values + \
                                customer_data.AcceptedCmp3.values + customer_data.AcceptedCmp4.values + \
                                customer_data.AcceptedCmp5.values

#Create 'MntTotal' for total amount spent on all items
customer_data['MntTotal'] = customer_data['MntWines'].values + customer_data['MntFruits'].values + \
                            customer_data['MntMeatProducts'].values + customer_data['MntFishProducts'].values + \
                            customer_data['MntWines'].values + customer_data['MntSweetProducts'].values + \
                            customer_data['MntGoldProds'].values

#Drops the unnecessary features from the original dataset
customer_data.drop(['Dt_Customer', 'Year_Birth', 'AcceptedCmp1', 'AcceptedCmp2',
                    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Kidhome', 'Teenhome',
                   'Z_CostContact', 'Z_Revenue', 'Num_Kids', 'Marital_Status'],
                   axis=1, inplace=True)
customer_data.head() 

  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Customer'] = pd.to_datetime(customer_data.Dt_Customer)
  customer_data['Dt_Custo

Unnamed: 0_level_0,Education,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response,Age,Days_Since_Customer,Fam_Size,Num_Accepted,MntTotal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5524,Graduation,58138.0,58,635,88,546,172,88,88,3,8,10,4,7,0,1,64,3923.365452,1,0,2252
2174,Graduation,46344.0,38,11,1,6,2,1,6,2,1,1,2,5,0,0,67,3077.365452,3,0,38
4141,Graduation,71613.0,26,426,49,127,111,21,42,1,8,2,10,4,0,0,56,3424.365452,2,0,1202
6182,Graduation,26646.0,26,11,4,20,10,3,5,2,2,0,4,6,0,0,37,3017.365452,3,0,64
5324,PhD,58293.0,94,173,43,118,46,27,15,5,5,3,6,5,0,0,40,3273.365452,3,0,595


In [25]:
#Prints the size of the dataset
print('Dataset Shape:', customer_data.shape)
print('-------------------------------')
#Check if any columns contain null
print('Total Nulls Per Column:')
print(customer_data.isnull().sum())

Dataset Shape: (2240, 21)
-------------------------------
Total Nulls Per Column:
Education               0
Income                 24
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
Complain                0
Response                0
Age                     0
Days_Since_Customer     0
Fam_Size                0
Num_Accepted            0
MntTotal                0
dtype: int64


In [26]:
#Imputes the mean
imputer = SimpleImputer(strategy='mean')
imputer.fit(customer_data.Income.values.reshape(-1,1))
customer_data['Income'] = imputer.transform(customer_data.Income.values.reshape(-1,1))

In [40]:
#Calculate percent of responses
pos_resp = customer_data.Response.sum()
total = customer_data.shape[0]
percent = round((pos_resp / total)*100, 2)

print(pos_resp, 'khách hàng đã phản hồi chiến dịch tiếp thị trong tổng số', total, 'cutomers.')
print('Percent Responded: ' +  str(percent) + '%')

334 khách hàng đã phản hồi chiến dịch tiếp thị trong tổng số 2240 cutomers.
Percent Responded: 14.91%


In [28]:
#View feature correlations with the 'Response' column
#Note: 'Response' will be the target for predictive modeling
response_corr_abs = np.abs(customer_data.corr()['Response']).sort_values(ascending=False)[1:]
response_corr = customer_data.corr()['Response'].sort_values(ascending=False)[1:]
print("Correlation Coefficients for 'Response'")
print('--------------------------------------------------------')
print(response_corr)

Correlation Coefficients for 'Response'
--------------------------------------------------------
Num_Accepted           0.426035
MntTotal               0.265518
MntWines               0.247254
MntMeatProducts        0.236335
NumCatalogPurchases    0.220810
Days_Since_Customer    0.174061
NumWebPurchases        0.148730
MntGoldProds           0.139850
Income                 0.132756
MntFruits              0.125289
MntSweetProducts       0.117372
MntFishProducts        0.111331
NumStorePurchases      0.039363
NumDealsPurchases      0.002238
Complain              -0.001707
NumWebVisitsMonth     -0.003987
Age                   -0.021325
Recency               -0.198437
Fam_Size              -0.219440
Name: Response, dtype: float64


In [29]:
#Remove the 'Response' column because it is the target of future predictive model
X, y = customer_data.drop('Response', axis=1).values, customer_data['Response'].values

#Creates a column transformer that sends 'Education' to be encoded and rest scaled
ct = ColumnTransformer([
    ('catagoric', OneHotEncoder(), [0]),
    ('numeric', StandardScaler(), list(range(1, len(X.T))))
])

#Sends the data through the column transformer
X_transformed = ct.fit_transform(X)
print('Preprocessed Data:')
print(X_transformed[0])

Preprocessed Data:
[ 0.          0.          1.          0.          0.          0.23532677
  0.30703926  0.98378127  1.55157698  1.67970233  2.46214705  1.4765001
  0.84320691  0.34941394  1.40930394  2.51089024 -0.55078479  0.69390374
 -0.09728167  0.98534473  1.97674456 -1.75911463 -0.43903713  1.4669731 ]


### Split Dataset and Balance Classes

In [30]:
#Split into train (70%) and test (30%)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=8)

#Split the test set into 2 sets; 1 for test, 1 for validation
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=8)

#Display length of each set
print('Length of Each Dataset:')
print('Training Set:', len(X_train))
print('Validation Set:', len(X_val))
print('Test Set:', len(X_test))

Length of Each Dataset:
Training Set: 1568
Validation Set: 336
Test Set: 336


In [31]:
#Balance the training data set using SMOTE
#create the SMOTE object
sm = SMOTE(random_state=8)

#create new training set with SMOTE object
X_bal, y_bal = sm.fit_resample(X_train, y_train)

#Displays perccent of each class
print('Initial Training Set')
print('Percent "Responded":', y_train.sum()/len(y_train))
print('Balanced Training Set')
print('Percent "Responded":', y_bal.sum()/len(y_bal))

Initial Training Set
Percent "Responded": 0.14732142857142858
Balanced Training Set
Percent "Responded": 0.5


### Create Models

#### Logistic Regression

In [32]:
#Create a Logistic Regression Model
#Params to test in grid search
lr_params = {'solver': ['liblinear'], 'penalty': ['l1'], 'C': [1.0, 0.5, 0.25]}

#grid search
lr_grid = GridSearchCV(LogisticRegression(), lr_params, cv=3, scoring='recall')

#fit the grid to the training set
lr_grid.fit(X_bal, y_bal)

#ID the best model
lr = lr_grid.best_estimator_

#Display Best Parameters
print('Best Parameters:', lr_grid.best_params_)

#Display the metrics for the validation set
lr_preds = lr.predict(X_val)
lr_val_acc = accuracy_score(y_val, lr_preds)
lr_val_rec = recall_score(y_val, lr_preds)
print('Logistic Regression Model Accuracy:', lr_val_acc)
print('Logistic Regression Model Recall:', lr_val_rec)

Best Parameters: {'C': 0.25, 'penalty': 'l1', 'solver': 'liblinear'}
Logistic Regression Model Accuracy: 0.7886904761904762
Logistic Regression Model Recall: 0.8518518518518519


#### Support Vector Machine

In [33]:
#Create a Support Vector machine
#Params to test in grid search
svm_params = {'kernel': ['poly', 'rbf'], 'C': [1.0, 0.5, 0.25], 'gamma': ['scale', 'auto']}

#grid search
svm_grid = GridSearchCV(SVC(), svm_params, cv=3, scoring='recall')

#fit the grid to the training set
svm_grid.fit(X_bal, y_bal)

#ID the best model
svm = svm_grid.best_estimator_

#Display Best Parameters
print('Best Parameters:', svm_grid.best_params_)

#Display the metrics for the validation set
svm_preds = svm.predict(X_val)
svm_val_acc = accuracy_score(y_val, svm_preds)
svm_val_rec = recall_score(y_val, svm_preds)
print('Support Vector Machine Accuracy:', svm_val_acc)
print('Support Vector Machine Recall:', svm_val_rec)

Best Parameters: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
Support Vector Machine Accuracy: 0.8273809523809523
Support Vector Machine Recall: 0.7407407407407407


#### K-Nearest Neighbors

In [35]:
#Create a knn model
#Params to test in grid search
knn_params = {'n_neighbors': [7, 9, 11], 'algorithm': ['ball_tree', 'kd_tree', 'brute'],
             'weights': ['uniform', 'distance']}

#grid search
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=3, scoring='recall')

#fit the grid to the training set
knn_grid.fit(X_bal, y_bal)

#ID the best model
knn = knn_grid.best_estimator_

#Display Best Parameters
print('Best Parameters:', knn_grid.best_params_)

#Display the metrics for the validation set
knn_preds = knn.predict(X_val)
knn_val_acc = accuracy_score(y_val, knn_preds)
knn_val_rec = recall_score(y_val, knn_preds)
print('K-Nearest Neighbors Accuracy:', knn_val_acc)
print('K-Nearest Neighbors Recall:', knn_val_rec)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Best Parameters: {'algorithm': 'ball_tree', 'n_neighbors': 7, 'weights': 'distance'}
K-Nearest Neighbors Accuracy: 0.7529761904761905
K-Nearest Neighbors Recall: 0.8333333333333334


#### Naive Bayes

In [36]:
#Create a naive bayes model
nb = GaussianNB()

#fit the model to the training set
nb.fit(X_bal, y_bal)

#Display the metrics for the validation set
nb_preds = nb.predict(X_val)
nb_val_acc = accuracy_score(y_val, nb_preds)
nb_val_rec = recall_score(y_val, nb_preds)
print('Naive Bayes Accuracy:', nb_val_acc)
print('Naive Bayes Machine Recall:', nb_val_rec)

Naive Bayes Accuracy: 0.6934523809523809
Naive Bayes Machine Recall: 0.6481481481481481


#### Gradient Boosting Tree

In [37]:
#Create a xgboost model
#Params to test in grid search
xgb_params = {'n_estimators': [240, 250, 260], 'max_depth': [15, 16, 17],
             'colsample_bytree': [0.6, 0.7, 0.8, 1.0]}

#grid search
xgb_grid = GridSearchCV(XGBClassifier(use_label_encoder=False, verbosity=0), xgb_params, cv=3, 
                        scoring='recall')

#fit the grid to the training set
xgb_grid.fit(X_bal, y_bal)

#ID the best model
xgb = xgb_grid.best_estimator_

#Display Best Parameters
print('Best Parameters:', xgb_grid.best_params_)

#Display the metrics for the validation set
xgb_preds = xgb.predict(X_val)
xgb_val_acc = accuracy_score(y_val, xgb_preds)
xgb_val_rec = recall_score(y_val, xgb_preds)
print('Gradient Boosting Tree Accuracy:', xgb_val_acc)
print('Gradient Boosting Tree Recall:', xgb_val_rec)







Best Parameters: {'colsample_bytree': 0.8, 'max_depth': 15, 'n_estimators': 250}
Gradient Boosting Tree Accuracy: 0.8988095238095238
Gradient Boosting Tree Recall: 0.6111111111111112


### Ensemble the Models

In [38]:
#Create ensemble model of all the other models
#list of models
models = [('logistic_regression', lr), ('support vector machine', svm), 
        ('knn', knn), ('naive_bayes', nb), ('gradient_boost', xgb)]

#Combine models
ensemble_model = VotingClassifier(estimators=models)

#fit the model on the training set
ensemble_model.fit(X_bal, y_bal)

#Display the metrics for the validation set
ensemble_preds = ensemble_model.predict(X_val)
ensemble_val_acc = accuracy_score(y_val, ensemble_preds)
ensemble_val_rec = recall_score(y_val, ensemble_preds)
print('Ensemble Model Accuracy:', ensemble_val_acc)
print('Ensemble Model Recall:', ensemble_val_rec)



Ensemble Model Accuracy: 0.8571428571428571
Ensemble Model Recall: 0.8518518518518519


In [39]:
#Display the metrics of the Ensemble model on the test set
test_preds = ensemble_model.predict(X_test)
test_acc = accuracy_score(y_test, test_preds)
test_rec = recall_score(y_test, test_preds)
print('Test Set Metrics')
print('Ensemble Model Accuracy:', test_acc)
print('Ensemble Model Recall:', test_rec)

Test Set Metrics
Ensemble Model Accuracy: 0.8511904761904762
Ensemble Model Recall: 0.7551020408163265
