In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

## 2.1 - KNN par defaut

In [2]:
data = pd.read_csv("data_cleaned.csv")
data.head()

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,0.818182,0.163647,1.0,0.0,0.5,0.0,0.5,0.0,0
1,0.454545,0.435262,1.0,1.0,0.0,1.0,1.0,1.0,0
2,0.545455,0.219253,0.0,1.0,0.5,1.0,1.0,0.0,0
3,0.727273,0.939882,1.0,0.0,0.0,1.0,1.0,0.5,0
4,0.090909,0.022339,1.0,0.0,1.0,0.0,1.0,0.0,0


In [3]:
X = data.drop("conversion", axis=1)
y = data["conversion"]

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score , confusion_matrix, precision_score, recall_score, classification_report

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

model = KNeighborsClassifier()
acc = cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()
f1 = cross_val_score(model, X, y, cv=5, scoring='f1').mean()
precision = cross_val_score(model, X, y, cv=5, scoring='precision').mean()
recall = cross_val_score(model, X, y, cv=5, scoring='recall').mean()

print("Accuracy : {}".format(acc))
print("Precision : {}".format(precision))
print("recall : {}".format(recall))
print("f1 : {}".format(f1))

model = KNeighborsClassifier()
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
print("----- TEST ------")
report = classification_report(y_test,y_pred_test, output_dict=True)
test_report = pd.DataFrame(report).transpose()
display(test_report)

Accuracy : 0.8024312945011323
Precision : 0.8803511426229814
recall : 0.6961761105752559
f1 : 0.7533905465642826
----- TEST ------


Unnamed: 0,precision,recall,f1-score,support
0,0.748495,0.890186,0.813215,15499.0
1,0.864985,0.701673,0.774817,15540.0
accuracy,0.795805,0.795805,0.795805,0.795805
macro avg,0.80674,0.79593,0.794016,31039.0
weighted avg,0.806817,0.795805,0.79399,31039.0


## 2.2 Grid Search

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
grid_param={'n_neighbors': list(range(2,10)),
 'weights' : ['uniform', 'distance'],
 'metric' : ['euclidean', 'manhattan', 'minkowski']} 

model = KNeighborsClassifier()
grid = GridSearchCV(model, grid_param, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search=grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [9]:
print(grid_search.best_params_)

{'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'distance'}


In [10]:
grid_search.best_estimator_

KNeighborsClassifier(metric='euclidean', n_neighbors=2, weights='distance')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [12]:
model = grid.best_estimator_
    
acc = cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()
f1 = cross_val_score(model, X, y, cv=5, scoring='f1').mean()
precision = cross_val_score(model, X, y, cv=5, scoring='precision').mean()
recall = cross_val_score(model, X, y, cv=5, scoring='recall').mean()

print("Accuracy : {}".format(acc))
print("Precision : {}".format(precision))
print("recall : {}".format(recall))
print("f1 : {}".format(f1))

model = grid.best_estimator_
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
print("----- TEST ------")
report = classification_report(y_test,y_pred_test, output_dict=True)
test_report = pd.DataFrame(report).transpose()
display(test_report)

Accuracy : 0.8475493472649591
Precision : 0.8706868265770872
recall : 0.820627378992083
f1 : 0.8178173278152585
----- TEST ------


Unnamed: 0,precision,recall,f1-score,support
0,0.81973,0.863152,0.840881,15499.0
1,0.855901,0.810682,0.832678,15540.0
accuracy,0.836883,0.836883,0.836883,0.836883
macro avg,0.837815,0.836917,0.83678,31039.0
weighted avg,0.837839,0.836883,0.836774,31039.0
