# Credit card fraud detection model

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
import mlflow
#from hpsklearn import HyperoptEstimator
#from hpsklearn import any_classifier
#from hpsklearn import any_preprocessing
from hyperopt import tpe
#import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import warnings
warnings.filterwarnings('ignore')

In [2]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Credit Card Fraud Detection")
mlflow.autolog()

2024/08/09 23:06:57 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [6]:
#df= pd.read_csv('/content/my_paypal_creditcard (1).csv')
df= pd.read_csv('data/my_paypal_creditcard (1).csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

# Data Preprocessing

In [8]:
def preprocessing(df_new):

    # downsampling the majority class to have same rows as the minority class
    downsampled_df=resample(df_new[df_new['Class']==0],
                            n_samples=len(df_new[df_new['Class']==1]),
                            random_state=42)

    # merging minority and new downsampled majority
    df_balanced= pd.concat([df_new[df_new['Class']==1], downsampled_df])

    #data splitting into test and train sets
    X= df_balanced.drop('Class', axis=1)
    Y= df_balanced['Class']
    x_train, x_test,y_train, y_test= train_test_split(X, Y, random_state=42, test_size=0.3)

    # Feature Scaling of features
    sc= StandardScaler()
    x_train= sc.fit_transform(x_train)
    x_test= sc.transform(x_test)

    return x_train, x_test,y_train, y_test

In [9]:
x_train, x_test,y_train, y_test = preprocessing(df)
y_trainn=y_train.to_numpy().flatten()
y_testt=y_test.to_numpy().flatten()


In [10]:

search_space={
    'C': hp.uniform('C', 0.05, 3),
    'fit_intercept': hp.choice('fit_intercept', [True, False]),
    'solver':hp.choice('solver',['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']),
    'warm_start': hp.choice('warm_start',[True, False]),
    'max_iter': hp.choice('max_iter', range(1,500, 4))
}


# defining objective function
def objective(params):
    mlflow.sklearn.autolog()
    with mlflow.start_run(nested= True):
        model= LogisticRegression(**params,
                                  random_state=42)
        score= cross_val_score(model, x_train, y_trainn, cv=5, scoring='f1_macro')
        best_score= max(score)
        return {'loss': -best_score, 'status': STATUS_OK}




In [11]:
with mlflow.start_run(run_name="logistic-hyper-opt") as run:
    best_result=fmin(
          fn=objective,
          space=search_space,
          algo=tpe.suggest,
          max_evals=50,
          trials=Trials()
    )

100%|██████████| 50/50 [33:05<00:00, 39.72s/trial, best loss: -0.9782505910165484]


In [15]:
# Experiment with best f1 score in mlflow ui
best_params={
            "C":2.6925812675005436,
            'max_iter':365,
            'random_state':42,
            'fit_intercept':True,
            'solver':'liblinear',
            'warm_start':False
            }

In [21]:
#Model training and testing
lr= LogisticRegression(**best_params)
lr.fit(x_train, y_trainn)
y_pred= lr.predict(x_test)


import pickle
filename = 'models/logistic-fraud-model.bin'
pickle.dump(lr, open(filename, 'wb'))


mlflow.log_params(best_params)
mlflow.log_artifact(local_path="models/logistic-fraud-model.bin", artifact_path="models_pickle")

In [23]:
#Model Evaluation using Area under precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
AUPRC = auc(recall, precision)
print("model accuracy: ", AUPRC)


model accuracy:  0.9714318815218096


In [3]:
# staging models & mlflow client
from mlflow.tracking import MlflowClient

client = MlflowClient()

client.transition_model_version_stage(
    name="best_f1_lr_model", version=2, stage="Staging"
)

client.transition_model_version_stage(
    name="best_f1_lr_model", version=1, stage="Production"
)

<ModelVersion: aliases=[], creation_timestamp=1723223494299, current_stage='Production', description='predicting if credit card transaction is fraudulent or not', last_updated_timestamp=1723243998633, name='best_f1_lr_model', run_id='8e8795f5caba43999c939cfeebd59152', run_link='', source='file:///c:/Users/Jane/credit-card-fraud-detection/mlruns/1/8e8795f5caba43999c939cfeebd59152/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>