# Extremely Imbalanced data — Fraud detection

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

Using TensorFlow backend.


In [2]:
cols = ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
        'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']
df = pd.read_csv('PS_20174392719_1491204439457_log.csv', header = 0, names = cols)
print('df.shape:', df.shape)
df.head()

df.shape: (6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
pd.value_counts(df.isFraud, normalize = True)

0    0.998709
1    0.001291
Name: isFraud, dtype: float64

In [4]:
majority_class = df.isFraud.mode()[0]
y_pred = np.full(shape = df.isFraud.shape, fill_value = majority_class)
accuracy_score(df.isFraud, y_pred)

0.9987091795518198

In [5]:
print(classification_report(df.isFraud, y_pred))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   6354407
           1       0.00      0.00      0.00      8213

    accuracy                           1.00   6362620
   macro avg       0.50      0.50      0.50   6362620
weighted avg       1.00      1.00      1.00   6362620



In [6]:
roc_auc_score(df.isFraud, y_pred)

0.5

In [7]:
X = df.drop(['isFraud', 'type', 'nameOrig', 'nameDest'], axis = 1)
y = df.isFraud
rus = RandomUnderSampler(sampling_strategy=0.8)
X_res, y_res = rus.fit_resample(X, y)
print(X_res.shape, y_res.shape)
print(pd.value_counts(y_res))

(18479, 7) (18479,)
0    10266
1     8213
dtype: int64


In [8]:
cols_numeric = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
                'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']
df_rus = pd.DataFrame(X_res, columns = cols_numeric)
df_rus.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
0,380.0,1326.06,0.0,0.0,0.0,0.0,0.0
1,44.0,255761.85,13358.0,0.0,1261595.08,1517356.92,0.0
2,394.0,156845.49,1033.0,0.0,463946.66,620792.14,0.0
3,303.0,220862.1,0.0,0.0,601005.42,821867.52,0.0
4,308.0,283196.36,30890.0,314086.36,0.0,0.0,0.0


In [9]:
def train_validation_test_split(
    X, y, train_size=0.8, val_size=0.1, test_size=0.1, 
    random_state=None, shuffle=True):
    assert int(train_size + val_size + test_size + 1e-7) == 1
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,    test_size=val_size/(train_size+val_size), 
        random_state=random_state, shuffle=shuffle)
    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = train_validation_test_split(
    X_res, y_res, train_size=0.8, val_size=0.1, test_size=0.1, random_state=1)
class_weight = {0: 4, 1: 5}
model = LogisticRegression(class_weight=class_weight)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print('accuracy', accuracy_score(y_val, y_pred))
roc_auc_score(y_val, y_pred)

              precision    recall  f1-score   support

           0       0.94      0.89      0.91      1035
           1       0.86      0.92      0.89       813

    accuracy                           0.90      1848
   macro avg       0.90      0.91      0.90      1848
weighted avg       0.91      0.90      0.90      1848

accuracy 0.9031385281385281




0.9054797939283741

In [10]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print('ROC AUC score:', roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90      1019
           1       0.86      0.91      0.88       829

    accuracy                           0.89      1848
   macro avg       0.89      0.89      0.89      1848
weighted avg       0.89      0.89      0.89      1848

Accuracy 0.8928571428571429
ROC AUC score: 0.8945239484771252


In [11]:
y_pred = model.predict(X)
print(classification_report(y, y_pred))
print('Accuracy:', accuracy_score(y, y_pred))
print('ROC AUC score:', roc_auc_score(y, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.88      0.94   6354407
           1       0.01      0.92      0.02      8213

    accuracy                           0.88   6362620
   macro avg       0.51      0.90      0.48   6362620
weighted avg       1.00      0.88      0.94   6362620

Accuracy: 0.8835512729033008
ROC AUC score: 0.9039433283904124
