In [1]:
import json
import joblib
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score


In [2]:
BASE_DIR = Path(".")
DATA_PATH = BASE_DIR / "data" / "fraud_transactions.csv"

CLEANED_CSV = BASE_DIR / "cleaned_data.csv"
READY_CSV = BASE_DIR / "ready_to_train_data.csv"
MODEL_PATH = BASE_DIR / "fraud_model.joblib"
EVAL_REPORT_PATH = BASE_DIR / "eval_report.json"
REVIEW_QUEUE_PATH = BASE_DIR / "review_queue.csv"

TARGET_COL = "Fraud_Label"
SEED = 42


In [3]:
df = pd.read_csv(DATA_PATH)

assert len(df) > 100, "Dataset must contain more than 100 rows"

print("Loaded dataset shape:", df.shape)


Loaded dataset shape: (50000, 21)


In [5]:
# Remove duplicate rows
df_cleaned = df.drop_duplicates().reset_index(drop=True)

# Parse Timestamp column
df_cleaned["Timestamp"] = pd.to_datetime(df_cleaned["Timestamp"], errors="coerce")

# Handle missing values (simple forward fill + backward fill)
df_cleaned = df_cleaned.ffill().bfill()

# Final check
assert df_cleaned.isnull().sum().sum() == 0, "Missing values still exist"

# Save cleaned data
df_cleaned.to_csv(CLEANED_CSV, index=False)

print("Saved:", CLEANED_CSV)
print("Shape:", df_cleaned.shape)


Saved: cleaned_data.csv
Shape: (50000, 21)


In [6]:
df_feat = df_cleaned.copy()

# Example derived features
df_feat["Hour"] = df_feat["Timestamp"].dt.hour
df_feat["Is_Weekend"] = df_feat["Timestamp"].dt.dayofweek.isin([5, 6]).astype(int)

# Encode categorical columns
label_encoders = {}
for col in df_feat.select_dtypes(include="object").columns:
    if col not in [TARGET_COL]:
        le = LabelEncoder()
        df_feat[col] = le.fit_transform(df_feat[col])
        label_encoders[col] = le

# Drop raw timestamp
df_feat = df_feat.drop(columns=["Timestamp"])

# Save ready-to-train data
df_feat.to_csv(READY_CSV, index=False)

print("Saved:", READY_CSV)
print("Shape:", df_feat.shape)


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  for col in df_feat.select_dtypes(include="object").columns:


Saved: ready_to_train_data.csv
Shape: (50000, 21)


In [7]:
X = df_feat.drop(columns=[TARGET_COL])
y = df_feat[TARGET_COL]

X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print("Train size:", X_train.shape)
print("Eval size:", X_eval.shape)


Train size: (40000, 20)
Eval size: (10000, 20)


In [8]:
model = LogisticRegression(
    max_iter=1000,
    random_state=SEED,
    solver="liblinear"
)

model.fit(X_train, y_train)

joblib.dump(model, MODEL_PATH)

print("Model saved at:", MODEL_PATH)


Model saved at: fraud_model.joblib


In [9]:
y_pred = model.predict(X_eval)

precision = precision_score(y_eval, y_pred)
recall = recall_score(y_eval, y_pred)
f1 = f1_score(y_eval, y_pred)

eval_report = {
    "precision": float(precision),
    "recall": float(recall),
    "f1_score": float(f1)
}

with open(EVAL_REPORT_PATH, "w") as f:
    json.dump(eval_report, f, indent=2)

print("Saved:", EVAL_REPORT_PATH)
print(eval_report)


Saved: eval_report.json
{'precision': 0.7258064516129032, 'recall': 0.6302521008403361, 'f1_score': 0.6746626686656672}


In [10]:
# Predicted probabilities
y_proba = model.predict_proba(X_eval)[:, 1]

review_df = X_eval.copy()
review_df["predicted_proba"] = y_proba
review_df["predicted_label"] = (y_proba >= 0.5).astype(int)
review_df["true_label"] = y_eval.values

# Bring back Transaction_ID if present
if "Transaction_ID" in df_cleaned.columns:
    review_df["Transaction_ID"] = df_cleaned.loc[X_eval.index, "Transaction_ID"].values

# Sort by highest risk
review_df = review_df.sort_values("predicted_proba", ascending=False).head(200)

review_df.to_csv(REVIEW_QUEUE_PATH, index=False)

print("Saved:", REVIEW_QUEUE_PATH)
print("Rows:", len(review_df))


Saved: review_queue.csv
Rows: 200
