In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



### Credit Card Fraud Detection Project

In [25]:
def inspect_dataset(data:pd.DataFrame)->None:
    """Print an exploratory summary of a transactions DataFrame to stdout.

    The summary covers, in order: the first rows, the shape, the
    ``DataFrame.info()`` report, per-column null counts, descriptive
    statistics, the value counts of the ``Class`` column, and the per-class
    column means.

    Args:
        data: DataFrame that must contain a ``Class`` column (it is indexed
            with ``data['Class']`` and used as the ``groupby`` key).

    Returns:
        None. Everything is written to stdout.

    Raises:
        KeyError: if ``data`` has no ``Class`` column.
    """
    # Inspect what the dataset looks like
    print("Data head:\n",data.head(),'\n')
    print("Data shape:\n",data.shape,"\n\nData info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() would emit a stray
    # "None" line after the report.
    data.info()
    print()

    # Check for missing values
    print("Null values:\n",data.isnull().sum(),'\n')

    # Statistical measures about the data
    print("Statistics:\n",data.describe(),'\n')

    # Distribution of the target variable
    print("Class distribution:\n",data['Class'].value_counts(),'\n')
    print("0: Normal transaction\n1: Fraudulent transaction")

    # Compare the column means of the two transaction classes
    print("Grouped data:\n",data.groupby('Class').mean())
    print("===========================================================================================")
    return


In [26]:
def split_features_targets(data:pd.DataFrame, test_size:float=0.2, random_state:int=2)->tuple[pd.DataFrame,pd.DataFrame,pd.Series,pd.Series]:
    """Undersample the legitimate class, then split into train/test sets.

    Rows with ``Class == 0`` (legitimate) are randomly undersampled to the
    number of ``Class == 1`` (fraud) rows, the two groups are concatenated,
    and the result is split with a stratified ``train_test_split``.

    Args:
        data: DataFrame containing a ``Class`` column with values 0 and 1.
        test_size: Fraction of the balanced dataset held out for testing.
        random_state: Seed used both for the undersampling and for the
            train/test split, so repeated calls give the same result.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)`` where the ``X`` items are
        DataFrames of every column except ``Class`` and the ``Y`` items are
        the corresponding ``Class`` Series.
    """
    # Separate the two classes
    legit = data[data.Class == 0]
    fraud = data[data.Class == 1]
    print(legit.shape) # (284315, 31) per the original notebook's dataset
    print(fraud.shape) # (492, 31) per the original notebook's dataset

    # The classes are heavily imbalanced, so undersample the legitimate
    # transactions down to the number of fraudulent ones. The size is derived
    # from the data instead of being hard-coded; min() keeps sample() from
    # raising if there happen to be fewer legitimate rows than fraud rows
    # (in that case all legitimate rows are kept and fraud is left as is).
    n_per_class = min(len(legit), len(fraud))
    legit_sample = legit.sample(n=n_per_class, random_state=random_state)
    new_dataset = pd.concat([legit_sample, fraud], axis=0)

    X = new_dataset.drop(columns='Class')
    Y = new_dataset['Class']

    # stratify=Y keeps the class ratio the same in the train and test parts
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, stratify=Y, random_state=random_state)
    return X_train, X_test, Y_train, Y_test

In [27]:
def develop_model(X_train:pd.DataFrame, X_test:pd.DataFrame, Y_train:pd.DataFrame, Y_test:pd.DataFrame)->None:
    """Fit a scaled logistic-regression pipeline and print its test score.

    A ``StandardScaler`` followed by ``LogisticRegression(max_iter=500)`` is
    fitted on the training data, then scored on the test data; the score is
    printed with the label "Accuracy on test data".

    Args:
        X_train: Training features.
        X_test: Test features.
        Y_train: Training targets. (Annotated as DataFrame; the caller in this
            file passes the ``Class`` column, i.e. a Series.)
        Y_test: Test targets (same remark as ``Y_train``).

    Returns:
        None. The score is only printed, and the fitted pipeline is discarded.
    """
    scaled_classifier = make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=500),
    )

    # fit() is only ever given the training split, so the scaler's statistics
    # come from the training data alone.
    scaled_classifier.fit(X_train, Y_train)

    # score() pushes the test split through the already-fitted scaler before
    # evaluating the classifier.
    test_score = scaled_classifier.score(X_test, Y_test)
    print("Accuracy on test data: ",test_score)

In [30]:
def main(data_path:str='creditcard.csv')->None:
    """Run the full workflow: load, inspect, split, and train/evaluate.

    Args:
        data_path: Path of the CSV file to load. Defaults to
            ``'creditcard.csv'`` in the current working directory, which is
            what the notebook originally hard-coded.

    Returns:
        None. All results are printed by the called functions.
    """
    # Load dataset
    cc_data = pd.read_csv(data_path)
    inspect_dataset(cc_data)
    X_train, X_test, Y_train, Y_test = split_features_targets(cc_data)
    develop_model(X_train, X_test, Y_train, Y_test)
    return

In [31]:
# Run the workflow only when this code is executed directly (a notebook cell or
# a script), not when it is imported as a module.
if __name__ == "__main__":
    main()

Data head:
    Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V2