# 5 models same Features

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

In [26]:
df = pd.read_excel('Threshold_Data.xlsx')
df

Unnamed: 0.1,Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,...,Total_Children,Total_Spent,BuyingPower,TotalPurchases,Dt_CustomerConverted,SinceEnrolled,Interest,CLV,AOV,AppliedPromo
0,0,1826,1970,Graduation,Single,84835.0,0,0,2014-06-16,0,...,0,1190,84240.0,15,2014-06-16,3543,1,5775.575342,79.333333,887.8
1,1,1,1961,Graduation,Single,57091.0,0,0,2014-06-15,0,...,0,577,56802.5,18,2014-06-15,3544,5,2801.216438,32.055556,715.0
2,2,10476,1958,Graduation,Married,67267.0,0,1,2014-05-13,0,...,1,251,67141.5,11,2014-05-13,3577,2,1229.900000,22.818182,587.4
3,3,1386,1967,Graduation,Together,32474.0,1,1,2014-11-05,0,...,2,11,32468.5,4,2014-11-05,3401,7,51.247945,2.750000,503.4
4,4,5371,1989,Graduation,Single,21474.0,1,0,2014-08-04,0,...,1,91,21428.5,8,2014-08-04,3494,7,435.553425,11.375000,521.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2214,2235,10142,1976,PhD,Single,66476.0,0,1,2013-07-03,99,...,1,689,66131.5,20,2013-07-03,3891,103,3672.464384,34.450000,743.4
2215,2236,5263,1977,n_Cycle,Married,31056.0,1,0,2013-01-22,99,...,1,55,31028.5,5,2013-01-22,4053,107,305.363014,11.000000,514.6
2216,2237,22,1976,Graduation,Single,46310.0,1,0,2012-03-12,99,...,1,309,46155.5,14,2012-03-12,4369,107,1849.343836,22.071429,617.0
2217,2238,528,1978,Graduation,Married,65819.0,0,0,2012-11-29,99,...,0,1383,65127.5,20,2012-11-29,4107,102,7780.795890,69.150000,1027.0


In [27]:
selected_columns = [
    'Age',
    'Marital_Status',
    'Education',
    'Income', 
    'Kidhome', 
    'Teenhome',
    'Recency',
    'MntFishProducts', 
    'MntMeatProducts', 
    'MntWines', 
    'MntGoldProds',
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
    'Total_Children',
    'HouseHoldMembers',
    'Total_Spent',
    'SinceEnrolled',
    'Response',
    'AOV',
    'CLV',
]
df = df[selected_columns]

In [28]:
def one_hot_encode(data, column, suffix=''):
    encoded = pd.get_dummies(data[column], drop_first= True)
    data = data.drop(column, axis = 1)
    data = data.join(encoded, lsuffix='_data', rsuffix='_encoded')
    # data = data.join(encoded)
    return data
df = one_hot_encode(df, 'Marital_Status')
df = one_hot_encode(df, 'Education')


In [29]:
X = df.drop('Response', axis=1)
y = df['Response']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

oversampler = RandomOverSampler(random_state=0)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)


# Logistic Regression

In [31]:

clf = LogisticRegression(random_state=0, C=10, penalty='l2')
clf.fit(X_train, y_train)
train_preds = clf.predict(X_train)
preds = clf.predict(scaler.transform(X_test))

acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
auc = roc_auc_score(y_test, preds)

print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.7943
Precision: 0.4021
Recall: 0.7879
F1: 0.5324
AUC: 0.7916


# Naive Bayes

In [32]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)
preds = classifier.predict(scaler.transform(X_test))

acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

auc = roc_auc_score(y_test, preds)

print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)


Accuracy: 0.7207
Precision: 0.2938
Recall: 0.6263
F1: 0.4000
AUC: 0.6817


# K-Nearest Neighbor

In [33]:
classifier = KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
preds = classifier.predict(scaler.transform(X_test))

acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

auc = roc_auc_score(y_test, preds)

print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.8213
Precision: 0.4242
Recall: 0.5657
F1: 0.4848
AUC: 0.7158


# Linear Classification

In [34]:
classifier = SGDClassifier()
classifier.fit(X_train, y_train)
preds = classifier.predict(scaler.transform(X_test))

acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
auc = roc_auc_score(y_test, preds)

print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.7417
Precision: 0.3447
Recall: 0.8182
F1: 0.4850
AUC: 0.7733


# Support Vector Machine

In [35]:
classifier = svm.SVC()
classifier.fit(X_train, y_train)
preds = classifier.predict(scaler.transform(X_test))

acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
auc = roc_auc_score(y_test, preds)

print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.8408
Precision: 0.4759
Recall: 0.6970
F1: 0.5656
AUC: 0.7815


# Decission Trees

In [36]:
clf = DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=10,
  max_leaf_nodes=30,
  min_samples_split=7, 
  min_samples_leaf=2)
clf.fit(X_train, y_train)
train_preds = clf.predict(X_train)
preds = clf.predict(scaler.transform(X_test))

acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
auc = roc_auc_score(y_test, preds)

print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.7072
Precision: 0.3248
Recall: 0.8990
F1: 0.4772
AUC: 0.7864
