![credit card banner](images/banner-credit-card-fraud.jpg "Credit card fraud detection banner")

## Credict Card Fraud Detection using `LOGISTIC REGRESSION`

<hr style="border:2px solid gray">

### Import libraries

In [2]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_score, accuracy_score, f1_score, classification_report
from utils import extract_files
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import itertools
import pandas as pd
import numpy as np



### Download dataset

Option 1: using the API's Kaggle <br><br>
Note: the Kaggle library doesn't work with Python 3.12

In [None]:
# import kaggle
# kaggle datasets download -d mlg-ulb/creditcardfraud
# extract_files('creditcardfraud.zip')

Option 2: Download from Kaggle website

In [56]:
# https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/data

### Load data into dataframe

In [32]:
df_credit_card = pd.read_csv('creditcard.csv') 

##### To see info about dataset, go to the other file named: fastEda.ipynb

Let's check the type of target data, it must be an integer. If it isn't an integer, we should change its type. 

In [33]:
df_credit_card.Class.dtype

dtype('int64')

In [34]:
fraud = df_credit_card[df_credit_card.Class == 1]
no_fraud = df_credit_card[df_credit_card.Class == 0]
print('No Fraud - Rows x Columns:', no_fraud.shape)
print('Fraud - Rows x Columns:', fraud.shape)


No Fraud - Rows x Columns: (284315, 31)
Fraud - Rows x Columns: (492, 31)


<hr style="border:1px solid green">

## Unbalanced dataset

In [35]:
df_unbalanced_credit_card = df_credit_card.copy()
df_unbalanced_credit_card.shape

(284807, 31)

#### Define `X` (independent variables)


In [36]:
X_unb = df_unbalanced_credit_card.drop('Class', axis=1)
display(X_unb.shape)
X_unb = np.asarray(X_unb)
X_unb[0:5]

(284807, 30)

array([[ 0.00000000e+00, -1.35980713e+00, -7.27811733e-02,
         2.53634674e+00,  1.37815522e+00, -3.38320770e-01,
         4.62387778e-01,  2.39598554e-01,  9.86979013e-02,
         3.63786970e-01,  9.07941720e-02, -5.51599533e-01,
        -6.17800856e-01, -9.91389847e-01, -3.11169354e-01,
         1.46817697e+00, -4.70400525e-01,  2.07971242e-01,
         2.57905802e-02,  4.03992960e-01,  2.51412098e-01,
        -1.83067779e-02,  2.77837576e-01, -1.10473910e-01,
         6.69280749e-02,  1.28539358e-01, -1.89114844e-01,
         1.33558377e-01, -2.10530535e-02,  1.49620000e+02],
       [ 0.00000000e+00,  1.19185711e+00,  2.66150712e-01,
         1.66480113e-01,  4.48154078e-01,  6.00176493e-02,
        -8.23608088e-02, -7.88029833e-02,  8.51016549e-02,
        -2.55425128e-01, -1.66974414e-01,  1.61272666e+00,
         1.06523531e+00,  4.89095016e-01, -1.43772296e-01,
         6.35558093e-01,  4.63917041e-01, -1.14804663e-01,
        -1.83361270e-01, -1.45783041e-01, -6.90831352e-

#### Define `y` target


In [37]:
y_unb = df_unbalanced_credit_card.Class
# y_unb = np.asarray(df_unbalanced_credit_card.Class) # sss doesn't work with 
y_unb.shape
y_unb[0:5]

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

#### Normalize X set

In [38]:
X_unb=preprocessing.StandardScaler().fit(X_unb).transform(X_unb)
X_unb[0:5]

array([[-1.99658302, -0.69424232, -0.04407492,  1.6727735 ,  0.97336551,
        -0.24511658,  0.34706795,  0.19367894,  0.08263728,  0.33112778,
         0.08338555, -0.54040704, -0.61829572, -0.99609892, -0.32461019,
         1.60401384, -0.53683287,  0.24486345,  0.03076993,  0.49628203,
         0.32611802, -0.02492336,  0.38285444, -0.17691133,  0.11050692,
         0.24658544, -0.39217043,  0.33089162, -0.06378115,  0.24496426],
       [-1.99658302,  0.60849633,  0.16117592,  0.1097971 ,  0.31652293,
         0.04348335, -0.06181997, -0.06370021,  0.07125348, -0.23249419,
        -0.15334963,  1.58000285,  1.06608857,  0.4914182 , -0.14998248,
         0.69436042,  0.52943375, -0.13516997, -0.21876258, -0.17908605,
        -0.08961086, -0.3073768 , -0.88007675,  0.16220118, -0.56113055,
         0.3206939 ,  0.26106948, -0.02225568,  0.04460752, -0.34247454],
       [-1.99656197, -0.69350046, -0.81157783,  1.16946849,  0.26823129,
        -0.36457179,  1.35145359,  0.63977564,  0

### Define train/test set

In [39]:
# evaluate to change parameters to enhanced the predictions

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
sss.get_n_splits(X_unb, y_unb)
for i, (train_index, test_index) in enumerate(sss.split(X_unb, y_unb)):
    X_train, X_test = X_unb[train_index], X_unb[test_index]
    y_train, y_test = y_unb[train_index], y_unb[test_index]
    
print ("X Train set: ", X_train.shape)
print ("Y Train set: ", y_train.shape)
print ('==============')
print ("X Test set: ", X_test.shape)
print ("Y Test set: ", y_test.shape)



X Train set:  (227845, 30)
Y Train set:  (227845,)
X Test set:  (56962, 30)
Y Test set:  (56962,)


### Modeling: LogisticRegression

In [259]:
#we use several solvers to evaluate the accuracy for each prediction
log_reg_lib = LogisticRegression(C=0.01, solver='liblinear').fit(X_train, y_train)
log_reg_nwt = LogisticRegression(C=0.01, solver='newton-cg').fit(X_train, y_train)
log_reg_lb = LogisticRegression(C=0.01, solver='lbfgs').fit(X_train, y_train)
log_reg_sag = LogisticRegression(C=0.01, solver='sag').fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Predictions

In [260]:
yhat_lib = log_reg_lib.predict(X_test)
yhat_nwt = log_reg_nwt.predict(X_test)
yhat_lb = log_reg_lb.predict(X_test)
yhat_sag = log_reg_sag.predict(X_test)

#### Metrics

Classification report

In [263]:
print ("Classification Report on testing data \n ============")
print("Libnear:\n ", classification_report(y_test, yhat_lib, target_names=['Not Fraud', 'Fraud']), '\n ============')
print("Newton-cg: \n ", classification_report(y_test, yhat_nwt, target_names=['Not Fraud', 'Fraud']), '\n ============')
print("Lbfgs:\n ", classification_report(y_test, yhat_lb, target_names=['Not Fraud', 'Fraud']), '\n ============')
print("Sag:\n ", classification_report(y_test, yhat_sag, target_names=['Not Fraud', 'Fraud']), '\n ============')



Classification Report on testing data 
Libnear:
                precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00     56864
       Fraud       0.80      0.48      0.60        98

    accuracy                           1.00     56962
   macro avg       0.90      0.74      0.80     56962
weighted avg       1.00      1.00      1.00     56962
 
Newton-cg: 
                precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00     56864
       Fraud       0.87      0.56      0.68        98

    accuracy                           1.00     56962
   macro avg       0.94      0.78      0.84     56962
weighted avg       1.00      1.00      1.00     56962



Jaccard Score

In [None]:
print ('Jaccard score \n ============')
print("liblinear: ", jaccard_score(y_test, yhat_lib, pos_label=0))
print("newton-cg: ", jaccard_score(y_test, yhat_nwt, pos_label=0))
print("lbfgs: ", jaccard_score(y_test, yhat_lb, pos_label=0))
print("sag: ", jaccard_score(y_test, yhat_sag,pos_label=0))

Accuracy

In [None]:
print ("Accuracy score on testing data \n ============")
print ("libnear: ", accuracy_score(y_test, yhat_lib ))
print ("newton-cg: ", accuracy_score(y_test, yhat_nwt))
print("lbfgs: ", accuracy_score(y_test, yhat_lb))
print("sag: ", accuracy_score(y_test, yhat_sag))

F1-Score

In [None]:
print ("F1 score on testing data \n ============")
print ("libnear: ", f1_score(y_test, yhat_lib))
print ("newton-cg: ", f1_score(y_test, yhat_nwt) )
print("lb: ", f1_score(y_test, yhat_lb))
print ("sag: ", f1_score(y_test, yhat_sag) )

<hr style="border:1px solid green">

## Balanced dataset

In [29]:
# To reminder: the original dataset is unbalanced
print ('No fraudulent transactions: ', no_fraud.shape[0])
print ('Fraudulent transactions: ', fraud.shape[0])

No fraudulent transactions:  284315
Fraudulent transactions:  492


### Define `X`

In [None]:
# define X
X = df_credit_card.drop('Class', axis=1)
display(X.shape)
X = np.asarray(X)
X[0:5]

### Define `y` target

In [41]:
# define y
y = np.asarray(df_credit_card['Class'])
y.shape


(284807,)

### Normalize X

In [48]:
X=preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[-1.99658302, -0.69424232, -0.04407492,  1.6727735 ,  0.97336551,
        -0.24511658,  0.34706795,  0.19367894,  0.08263728,  0.33112778,
         0.08338555, -0.54040704, -0.61829572, -0.99609892, -0.32461019,
         1.60401384, -0.53683287,  0.24486345,  0.03076993,  0.49628203,
         0.32611802, -0.02492336,  0.38285444, -0.17691133,  0.11050692,
         0.24658544, -0.39217043,  0.33089162, -0.06378115,  0.24496426],
       [-1.99658302,  0.60849633,  0.16117592,  0.1097971 ,  0.31652293,
         0.04348335, -0.06181997, -0.06370021,  0.07125348, -0.23249419,
        -0.15334963,  1.58000285,  1.06608857,  0.4914182 , -0.14998248,
         0.69436042,  0.52943375, -0.13516997, -0.21876258, -0.17908605,
        -0.08961086, -0.3073768 , -0.88007675,  0.16220118, -0.56113055,
         0.3206939 ,  0.26106948, -0.02225568,  0.04460752, -0.34247454],
       [-1.99656197, -0.69350046, -0.81157783,  1.16946849,  0.26823129,
        -0.36457179,  1.35145359,  0.63977564,  0

### Random Under sampler

Due to the dataset is unbalanced, there are only 492 fraudulent transactions, the training dataset must be reduced in order to balance the positive and negative cases.

In [49]:
rus = RandomUnderSampler(random_state=42)
X_bal, y_bal = rus.fit_resample(X, y)
print('X data set:', X_bal.shape)
print('y target: ', y_bal.shape)
values, count = np.unique(y_bal, return_counts=True, equal_nan=False)
print ('not fraud (',str(values[0]),')', ':', count[0])
print ('fraud:(',str(values[1]),')' ':', count[1])

X data set: (984, 30)
y target:  (984,)
not fraud ( 0 ) : 492
fraud:( 1 ): 492


### Define train/test dataset

In [50]:
#sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
sss.get_n_splits(X, y)
for i, (train_index, test_index) in enumerate(sss.split(X_bal, y_bal)):
    X_train_, X_test_ = X_bal[train_index], X_bal[test_index]
    y_train_, y_test_ = y_bal[train_index], y_bal[test_index]
    
print ("X Train set: ", X_train_.shape)
print ("Y Train set: ", y_train_.shape)
print ('==============')
print ("X Test set: ", X_test_.shape)
print ("Y Test set: ", y_test_.shape)


X Train set:  (787, 30)
Y Train set:  (787,)
X Test set:  (197, 30)
Y Test set:  (197,)


### Modeling: Logistic Regression

In [51]:
#we use several solvers to evaluate the accuracy for each prediction
log_reg_lib = LogisticRegression(C=0.01, solver='liblinear').fit(X_train_, y_train_)
log_reg_nwt = LogisticRegression(C=0.01, solver='newton-cg').fit(X_train_, y_train_)
log_reg_lb = LogisticRegression(C=0.01, solver='lbfgs').fit(X_train_, y_train_)
log_reg_sag = LogisticRegression(C=0.01, solver='sag').fit(X_train_, y_train_)



In [None]:
log_reg_lib.predict_proba(X_test_)

### Predictions

In [52]:
y_pred_lib = log_reg_lib.predict(X_test_)
y_pred_nwt = log_reg_nwt.predict(X_test_)
y_pred_lb = log_reg_lb.predict(X_test_)
y_pred_sag = log_reg_sag.predict(X_test_)

### Metrics

In [54]:
print ("Classification Report on testing data \n ============")
print("Libnear:\n ", classification_report(y_test_, y_pred_lib, target_names=['Not Fraud', 'Fraud']), '\n ============')
print("Newton-cg: \n ", classification_report(y_test_, y_pred_nwt, target_names=['Not Fraud', 'Fraud']), '\n ============')
print("lbfgs: \n ", classification_report(y_test_, y_pred_lb, target_names=['Not Fraud', 'Fraud']), '\n ============')

Classification Report on testing data 
Libnear:
                precision    recall  f1-score   support

   Not Fraud       0.93      0.97      0.95        98
       Fraud       0.97      0.93      0.95        99

    accuracy                           0.95       197
   macro avg       0.95      0.95      0.95       197
weighted avg       0.95      0.95      0.95       197
 
Newton-cg: 
                precision    recall  f1-score   support

   Not Fraud       0.87      0.99      0.93        98
       Fraud       0.99      0.86      0.92        99

    accuracy                           0.92       197
   macro avg       0.93      0.92      0.92       197
weighted avg       0.93      0.92      0.92       197
 
lbfgs: 
                precision    recall  f1-score   support

   Not Fraud       0.87      0.99      0.93        98
       Fraud       0.99      0.86      0.92        99

    accuracy                           0.92       197
   macro avg       0.93      0.92      0.92       19

Jaccard Score

In [None]:
print ('Jaccard score \n ============')
print("liblinear: ", jaccard_score(y_test_, y_pred_lib, pos_label=0))
print("newton-cg: ", jaccard_score(y_test_, y_pred_nwt, pos_label=0))
print("lbfgs: ", jaccard_score(y_test_, y_pred_lb, pos_label=0))
print("sag: ", jaccard_score(y_test_, y_pred_sag,pos_label=0))

Accuracy

In [None]:
print ("Accuracy score on testing data \n ============")
print ("libnear: ", accuracy_score(y_test_, y_pred_lib ))
print ("newton-cg: ", accuracy_score(y_test, y_pred_nwt))
print("lbfgs: ", accuracy_score(y_test_, y_pred_lb))
print("sag: ", accuracy_score(y_test_, y_pred_sag))

F1-Score

In [None]:
print ("F1 score on testing data \n ============")
print ("libnear: ", f1_score(y_test_, y_pred_lib))
print ("newton-cg: ", f1_score( y_test_, y_pred_nwt) )
print("lb: ", f1_score(y_test_, y_pred_lb))
print ("sag: ", f1_score(y_test_, y_pred_sag) )