In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import pickle
import os


In [11]:
import zipfile

zip_path = "/content/dataset.zip"
extract_path = "/content/data"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extracted files:")
os.listdir("/content/data")


Extracted files:


['data']

In [12]:
data_path = "/content/data/data"

pkl_files = [f for f in os.listdir(data_path) if f.endswith(".pkl")]

len(pkl_files), pkl_files[:5]


(183,
 ['2018-05-01.pkl',
  '2018-06-04.pkl',
  '2018-08-23.pkl',
  '2018-05-18.pkl',
  '2018-07-23.pkl'])

In [13]:
dfs = []
for file in pkl_files:
    path = f"{data_path}/{file}"
    dfs.append(pd.read_pickle(path))

df = pd.concat(dfs, ignore_index=True)

In [14]:
X = df.drop(
    ["TRANSACTION_ID", "TX_DATETIME", "TX_FRAUD"],
    axis=1
)
y = df["TX_FRAUD"]


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [16]:
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced"
)

model.fit(X_train, y_train)


In [17]:
y_pred = model.predict(X_test)

In [18]:
accuracy_score(y_test, y_pred)

1.0

In [19]:
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    347895
           1       1.00      1.00      1.00      2936

    accuracy                           1.00    350831
   macro avg       1.00      1.00      1.00    350831
weighted avg       1.00      1.00      1.00    350831



array([[347895,      0],
       [     0,   2936]])

In [20]:
with open("fraud_model.pkl", "wb") as f:
    pickle.dump(model, f)