## Baselin model (KNN)

In [1]:
# importing useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# loading in the dataset
df = pd.read_csv('creditcard.csv')
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# selecting the 19 most important features according to our EDA 
X = df[['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', \
        'V16', 'V17', 'V21', 'V23', 'V27']]

# scaling the features
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

y = df['Class'] # selecting the target variable

In [4]:
val_count = df['Class'].value_counts()
val_count

0    284315
1       492
Name: Class, dtype: int64

In [5]:
weights = dict(1 / val_count) # to be used as class weights
weights

{0: 3.51722561243691e-06, 1: 0.0020325203252032522}

## <center>Cost matrix for fraud detection</center>

||Actual Positive|Actual Negative|
|:-:|:-:|:-:|
|**Predicted Positive**|administrative cost|administrative cost|
|**Predicted Negative**|transactional cost|0|

### *see [this article](https://towardsdatascience.com/fraud-detection-with-cost-sensitive-machine-learning-24b8760d35d9) for more about cost sensitive machine learning for fraud detection*

In [6]:
admin_cost = 2.5

### our choice of using 2.5euros as the administartive cost because it seems to be the best for our problem as suggested by [this paper](https://www.researchgate.net/publication/262390835_Cost_Sensitive_Credit_Card_Fraud_Detection_Using_Bayes_Minimum_Risk)

## Also, implementing cost sensitive models for sklearn's classifier is challenging. It is as good as creating a new model from the scratch. For simplicity, we will not penalize our model of True Positives (although, the cost matrix suggests we should penalize it for true positives). This will mean that we are only penalizing MISCLASSIFICATIONS (trying to minimize costs due to misclassification and not cost in general).

In [7]:
# defining a function to calculate cost savings
def cost_saving(ytrue, ypred, amount):
    fp = np.sum((ytrue == 0) & (ypred == 1))
    cost = np.sum(fp*admin_cost) + np.sum((amount[(ytrue == 1) & (ypred == 0)]))
    max_cost = np.sum((amount[(ytrue == 1)]))
    savings = 1 - (cost/max_cost)
    
    return savings

In [8]:
cv = StratifiedKFold(n_splits=4, random_state=1)

In [9]:
# defining a function to calculate cost saving per fold (splits) of our cv
def cost_saving_per_split(scores, x, y, cv_object):
    results = []
    for i, (_, test_ind) in zip(range(cv_object.n_splits), cv_object.split(x, y)):
        ypred = scores['estimator'][i].predict(x[test_ind])
        ytrue = y[test_ind]
        amount = df['Amount'].values[test_ind]
        results.append(cost_saving(ytrue, ypred, amount))
        
    return results

In [10]:
# defining a function to return a dataframe of metrics results for each fold in our cv
def get_metric_scores(scores, x, y=y, cv_object=cv):
    ind = ['split_'+str(n) for n in range(1, cv_object.n_splits+1)]
    
    scores_df = pd.DataFrame(index=ind)
    
    scores_df['f1_score'] = scores['test_f1']
    scores_df['auc_pr'] = scores['test_average_precision']
    scores_df['cost_savings'] = cost_saving_per_split(scores, x, y, cv_object)

    return scores_df

## model 

In [13]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [11]:
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y,test_size=0.25, random_state=0, stratify=df['Class'])

In [38]:
!pip install termcolor

Collecting termcolor
  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
Building wheels for collected packages: termcolor
  Building wheel for termcolor (setup.py): started
  Building wheel for termcolor (setup.py): finished with status 'done'
  Created wheel for termcolor: filename=termcolor-1.1.0-py3-none-any.whl size=4835 sha256=6367f44671822ca9259f3470d4a8dc7601e280aeaa97517837cf68e05eed2175
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\3f\e3\ec\8a8336ff196023622fbcb36de0c5a5c218cbb24111d1d4c7f2
Successfully built termcolor
Installing collected packages: termcolor
Successfully installed termcolor-1.1.0


In [40]:
#knn
from termcolor import colored as cl

In [34]:
n = 5

knn = KNeighborsClassifier(n_neighbors = n)
knn.fit(x_train, y_train)
knn_yhat = knn.predict(x_test)

In [42]:
print(cl('F1 score of the KNN model is {}'.format(f1_score(y_test, knn_yhat)), attrs = ['bold'], color = 'green'))

[1m[32mF1 score of the KNN model is 0.8272727272727273[0m


In [43]:
def plot_confusion_matrix(cm, classes, title, normalize = False, cmap = plt.cm.Blues):
    title = 'Confusion Matrix of {}'.format(title)
    if normalize:
        cm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [46]:
knn_matrix = confusion_matrix(y_test, knn_yhat, labels = [0, 1])
knn_matrix

array([[71073,     6],
       [   32,    91]], dtype=int64)

In [73]:
#Model two

In [13]:
myList = list(range(0,50))
neighbors = list(filter(lambda x: x%2!=0, myList))  #This will give a list of odd numbers only ranging from 0 to 50

CV_Scores = []

for k in neighbors:
    KNN = KNeighborsClassifier(n_neighbors = k, algorithm = 'kd_tree')
    scores = cross_val_score(KNN, x_train, y_train, cv = 4, scoring='recall')
    CV_Scores.append(scores.mean())

In [14]:
CV_Scores

[0.7940334268349696,
 0.7940042075736324,
 0.7805049088359046,
 0.761512388966807,
 0.761541608228144,
 0.7669763908368397,
 0.7724111734455352,
 0.7723819541841983,
 0.7723819541841983,
 0.775099345488546,
 0.7723819541841983,
 0.7696645628798504,
 0.7696645628798504,
 0.7642297802711546,
 0.7615416082281439,
 0.7642589995324918,
 0.7669763908368397,
 0.7642589995324918,
 0.7615416082281439,
 0.7615416082281439,
 0.7615416082281439,
 0.7615416082281439,
 0.7615416082281439,
 0.7615416082281439,
 0.7615416082281439]

In [15]:
best_k = neighbors[CV_Scores.index(max(CV_Scores))]
best_k

1

In [20]:
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [32]:
KNN_best = KNeighborsClassifier(n_neighbors = best_k, algorithm = 'kd_tree')

KNN_best.fit(x_train, y_train)

prediction = KNN_best.predict(x_test)

recallTest = recall_score(y_test, prediction)

F1test = f1_score(y_test, prediction)

print("F1 score Score of the knn classifier for best k values of "+str(best_k)+" is: "+str(F1test))
      
print("Recall Score of the knn classifier for best k values of "+str(best_k)+" is: "+str(recallTest))

#recall = recall_score(y_true, y_pred, average='binary')
#print('Recall: %.3f' % recall)

cm = confusion_matrix(y_test, prediction)

print(cm)

tn, fp, fn, tp = cm.ravel()

(tn, fp, fn, tp)

F1 score Score of the knn classifier for best k values of 1 is: 0.8157894736842105
Recall Score of the knn classifier for best k values of 1 is: 0.7560975609756098
[[71067    12]
 [   30    93]]


(71067, 12, 30, 93)

## Fraud Sensitive model (Not cost sensitive)

In [44]:
fraud_sensitive_model = KNeighborsClassifier()

In [47]:
fraud_sensitive_model =  KNeighborsClassifier(n_neighbors = k, algorithm = 'kd_tree')
fraud_sensitive_scores = cross_validate(fraud_sensitive_model, scaled_X, y, \
                            scoring=['f1', 'average_precision'], cv=cv, n_jobs=4, return_estimator=True)

### Note: we used cross_validate because it can take more than one scoring metrics and it can also return the fitting model for each fold.

In [48]:
fraud_sensitive_results = get_metric_scores(fraud_sensitive_scores, scaled_X)
fraud_sensitive_results

Unnamed: 0,f1_score,auc_pr,cost_savings
split_1,0.765343,0.663164,0.847352
split_2,0.809302,0.735185,0.835785
split_3,0.798283,0.677292,0.561406
split_4,0.777251,0.780702,0.673366


In [49]:
fraud_sensitive_results.mean()

f1_score        0.787545
auc_pr          0.714086
cost_savings    0.729477
dtype: float64

## Cost sensitive model

### we will use sample weight to penalize our model accordingly. admin cost for FP and transactional cost (amount) for FN.

In [52]:
sample_weights = np.array([df['Amount'][ind] if fraud else admin_cost for ind, fraud in enumerate(y)])

In [None]:
cost_sensitive_model = KNeighborsClassifier()

cost_sensitive_model = KNeighborsClassifier(n_neighbors = k, algorithm = 'kd_tree')
cost_sensitive_scores = cross_validate(cost_sensitive_model, scaled_X, y, \
                        scoring=['f1', 'average_precision'], cv=cv, n_jobs=4, return_estimator=True, \
                          fit_params={'sample_weight': sample_weights})

In [71]:
cost_sensitive_model = cost_sensitive_model.fit(x_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [213605, 213606]

In [69]:
cost_sensitive_results = get_metric_scores(cost_sensitive_scores, scaled_X)

NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
cost_sensitive_results.mean()

In [58]:
from sklearn.metrics import roc_auc_score

In [70]:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score

smote = SMOTE()

for train_idx, test_idx, in cv.split(scaled_X, y):
    X_train, y_train = scaled_X[train_idx], y[train_idx]
    X_test, y_test = scaled_X[test_idx], y[test_idx]
    X_train_oversampled, y_train_oversampled = smote.fit_sample(X_train, y_train)
    smote_gradient_model = KNeighborsClassifier()
    smote_gradient_model.fit(X_train_oversampled, y_train_oversampled )  
    pred = smote_gradient_model.predict(X_test)
    print(f'roc_auc_score: {roc_auc_score(y_test, pred)}')
    print(f'f-score: {f1_score(y_test, pred)}')

roc_auc_score: 0.9493483547505884
f-score: 0.444
roc_auc_score: 0.8573015116467797
f-score: 0.6494464944649446
roc_auc_score: 0.9126293347937487
f-score: 0.4000000000000001
roc_auc_score: 0.8936336286461432
f-score: 0.6139240506329113


In [72]:
print(f'roc_auc_score: {roc_auc_score(y_test, pred)}')
print(f'f-score: {f1_score(y_test, pred)}')

roc_auc_score: 0.8936336286461432
f-score: 0.6139240506329113
