# K Nearest Neighbours  Model

In [1]:
# Custom Functions
from Credit_Func import *

# Import the Basic Libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# Other
import pickle

# Model
from sklearn.neighbors import KNeighborsClassifier

# Sklearn
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.preprocessing import StandardScaler

In [2]:
def plot_confusion_matix(y_val, y_pred, classes, model_name=None):
    import itertools
    
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    print('\n clasification report:\n', classification_report(y_val,y_pred))
    print("-----------------------------------------------")
    print(f"Accuracy: {round(accuracy*100,3)}%")
    print(f"Precision: {round(precision*100,3)}%")
    print(f"Recall: {round(recall*100,3)}%")
    print(f"f1_score: {round(f1*100,3)}%")

    cnf_matrix = confusion_matrix(y_val, y_pred)
    print(f"Frauds: {y_val.sum()} | Missed: {round((cnf_matrix[1][0]/y_val.sum())*100,1)}%")
    
    # Create the basic matrix
    plt.imshow(cnf_matrix,  cmap=plt.cm.Blues) 

    # Add title and axis labels
    plt.title(f'{model_name} Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    # Add appropriate axis scales
    class_names = ['',classes[0],'',classes[1],'']# set(y) # Get class labels to add to matrix
    tick_marks = [-0.5,0,0.5,1,1.5]
    
    # Add appropriate axis scales
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    # Add labels to each cell
    thresh = cnf_matrix.max() / 2. # Used for text coloring below
    # Here we iterate through the confusion matrix and append labels to our visualization 
    for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
            plt.text(j, i, cnf_matrix[i, j],
                     horizontalalignment='center',
                     color='white' if cnf_matrix[i, j] > thresh else 'black')

In [3]:
df = pd.read_csv('creditcard.csv')

In [4]:
time = df['Time']
new_time = []
for d in time:
    if d <= 86400:     # There are 86400 seconds in a day
        new_time.append(d)
    else:
        new_time.append(d - 86400)
        
df['Time'] = new_time      

In [5]:
y = df.Class

X = df.drop(columns=['Class'], axis=1)

# TTS

In [6]:
X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Data Shapes:")
print(f"X_tv: {X_tv.shape} | X_test: {X_test.shape} | y_tv {y_tv.shape} | y_test {y_test.shape}")
print(f"Number of Frauds in TV: {y_tv.sum()} | {round((y_tv.sum() / y_tv.shape[0])*100, 5)}%")
print(f"Number of Frauds in Test: {y_test.sum()} | {round((y_test.sum() / y_test.shape[0])*100, 5)}%")

Data Shapes:
X_tv: (199364, 30) | X_test: (85443, 30) | y_tv (199364,) | y_test (85443,)
Number of Frauds in TV: 356 | 0.17857%
Number of Frauds in Test: 136 | 0.15917%


In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size=0.3, random_state=42)

print(f"X_train: {X_train.shape} | X_val: {X_val.shape} | y_train {y_train.shape} | y_val {y_val.shape}")
print(f"Number of Frauds in Train: {y_train.sum()} | {round((y_train.sum() / y_train.shape[0])*100, 5)}%")
print(f"Number of Frauds in Validation: {y_val.sum()} | {round((y_val.sum() / y_val.shape[0])*100, 5)}%")

X_train: (139554, 30) | X_val: (59810, 30) | y_train (139554,) | y_val (59810,)
Number of Frauds in Train: 258 | 0.18487%
Number of Frauds in Validation: 98 | 0.16385%


# Scaling

In [8]:
scaler = StandardScaler()
X_train_transformed = scaler.fit_transform(X_train)
X_Val_transformed = scaler.transform(X_val)
X_test_transformed = scaler.transform(X_test)
trans = [X_train_transformed, X_Val_transformed, X_test_transformed]

for X in trans:
    print(("mean: "), np.round(X.mean(), 2))
    print(("standard dev: "), np.round(X.std(), 2))

mean:  0.0
standard dev:  1.0
mean:  -0.0
standard dev:  1.0
mean:  0.0
standard dev:  0.99


In [9]:
# Reset data
X_train = X_train_transformed
X_val = X_Val_transformed
X_test = X_test_transformed

# Sampling

In [10]:
from imblearn.over_sampling import SMOTE, ADASYN

## SMOTE

In [11]:
# Previous original class distribution
print("original class distribution:")
print(y_train.value_counts())

# Fit SMOTE to training data
smote = SMOTE(sampling_strategy='auto') # resample all classes but the majority class
X_train_smote, y_train_smote = smote.fit_sample(X_train, y_train) 

# Preview synthetic sample class distribution
print('\n')
print("SMOTE_even class distribution")
print(pd.Series(y_train_smote).value_counts()) 

original class distribution:
0    139296
1       258
Name: Class, dtype: int64


SMOTE_even class distribution
1    139296
0    139296
Name: Class, dtype: int64


## ADASYN

In [12]:
# Previous original class distribution
print("original class distribution:")
print(y_train.value_counts())

# Fit SMOTE to training data
adasyn = ADASYN(sampling_strategy='auto') # resample all classes but the majority class
X_train_adasyn, y_train_adasyn = adasyn.fit_sample(X_train, y_train) 

# Preview synthetic sample class distribution
print('\n')
print("ADASYN class distribution")
print(pd.Series(y_train_adasyn).value_counts()) 

original class distribution:
0    139296
1       258
Name: Class, dtype: int64


ADASYN class distribution
1    139323
0    139296
Name: Class, dtype: int64


# KNN Model

In [13]:
# Construct pipeline
pipe_knn = Pipeline([('clf', KNeighborsClassifier())])

In [14]:
# Set grid search params
param_grid_knn = [{'n_neighbors': [3, 5, 11, 19], 
                   #'weight': ['uniform', 'distance'],
                   'metric': ['manhattan', 'euclidean'],
                   #'p':[1, 2] 
                  }]

# Construct grid search
gs_knn = GridSearchCV(estimator=KNeighborsClassifier(),
            param_grid=param_grid_knn,
           # scoring='roc_auc',
            cv=3, verbose=1, return_train_score = True, n_jobs=1)

# Fit the model
gs_knn.fit(X_train_smote, y_train_smote)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 372.0min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=1,
             param_grid=[{'metric': ['manhattan', 'euclidean'],
                          'n_neighbors': [3, 5, 11, 19]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=1)

In [None]:
from joblib import dump, load
dump(gs_knn, 'knn_model_final.joblib') 

In [None]:
opt_knn = GS_Output_DataFrame(gs_knn)
opt_knn.sort_values(by='mean_test_score', ascending=False)

In [None]:
best_model = gs_knn.best_estimator_
print(gs_xg.best_params_)

In [None]:
y_pred = best_model.predict(X_val)
y_pred

In [None]:
y_prob = best_model.predict_proba(X_val)
y_prob

In [None]:
plot_confusion_matix(y_val, y_pred, ['Normal', "Fraud"], "KNN")

In [None]:
def roc_plot(model,X_train_smote,y_train_smote,X_val,y_val):
    train_prob = model.predict_proba(X_train)[:,1]
    val_prob = model.predict_proba(X_val)[:,1]
    plt.figure(figsize=(7,7))
    for data in [[y_train, train_prob],[y_val, val_prob]]: # ,[y_test, test_prob]
        fpr, tpr, threshold = roc_curve(data[0], data[1])
        plt.plot(fpr, tpr)
    annot(fpr, tpr, threshold)
    plt.plot([0, 1], [0, 1], color='black', linestyle='--')
    plt.ylabel('TPR (power)')
    plt.xlabel('FPR (alpha)')
    plt.legend(['train','val'])
    plt.show()

In [None]:
roc_plot(best_model, X_train,y_train,X_val,y_val)

## Feature Importance

In [None]:
features = pd.DataFrame(best_model.feature_importances_, index=[X_tv.columns], columns=['Importance'])
features = features.loc[features['Importance'] != 0]
# features

In [None]:
def plot_feature_importances(model, X):
    n_features = X.shape[1]
    plt.figure(figsize=(8,8))
    plt.barh(range(n_features), model.feature_importances_, align='center') 
    plt.yticks(np.arange(n_features), X.columns.values) 
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
    plt.title('Feature Importance XGBoost', fontsize=18)

plot_feature_importances(best_model, X_tv)

## Threshold Selection

In [None]:
fpr, tpr, threshold = roc_curve(y_val, y_pred)
print(fpr, tpr, threshold)

In [None]:
cnf_matrix = confusion_matrix(y_val, y_pred)
cnf_matrix

In [None]:
prevalence = df['Class'].sum()/df.shape[0]
fraud_cost_coef = 122.21132113821139
normal_cost_coef = 88.29102242231328

In [None]:
fpr, tpr, threshold = roc_curve(y_val, y_prob[:,1])
fpr, tpr, threshold 

In [None]:
def treshold_selection(prevalence, mean_cost_fraud, mean_cost_Nfraud, y_val, y_pred):
    fpr, tpr, threshold = roc_curve(y_val, y_pred)
    cnf_matrix = confusion_matrix(y_val, y_pred)
    TP, FP, FN, TN = cnf_matrix[0][0], cnf_matrix[0][1], cnf_matrix[1][0], cnf_matrix[1][1]
    
    m = ((1-prevalence)/prevalence) * ((mean_cost_Nfraud*(FP - TN))/(mean_cost_fraud*(FN - TP)))
    
    fm = tpr - (m * fpr)
    
    opt = pd.DataFrame(data=fm)
    
    return opt

In [None]:
treshold_selection(prevalence, fraud_cost_coef, normal_cost_coef, y_val, y_pred)