In [6]:
# import pandas as pd
# import numpy as np

# df = pd.read_csv("../data/raw/bank_transaction_Fraud_Detection.csv")

# # 1. Transaction hour from date
# df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'], dayfirst=True)
# df['txn_hour'] = df['Transaction_Date'].dt.hour
# df['txn_weekday'] = df['Transaction_Date'].dt.weekday

# # 2. Customer behavior features
# df['customer_avg_balance'] = df.groupby('Customer_ID')['Account_Balance'].transform('mean')
# df['balance_dev'] = df['Account_Balance'] - df['customer_avg_balance']

# # 3. Device / Location anomaly
# df['new_device'] = df.groupby('Customer_ID')['Transaction_Device'].transform(lambda x: x != x.shift(1)).astype(int)
# df['new_location'] = df.groupby('Customer_ID')['Transaction_Location'].transform(lambda x: x != x.shift(1)).astype(int)

# # 4. High-risk merchant category
# high_risk_merchants = ['Cryptocurrency', 'Bitcoin', 'Jewelry', 'Luxury Goods']  # example
# df['high_risk_merchant'] = df['Merchant_Category'].isin(high_risk_merchants).astype(int)

# # 5. Encode categorical features
# df = pd.get_dummies(df, columns=['Transaction_Device', 'Device_Type', 'Account_Type'], drop_first=True)

# # Save processed dataset
# df.to_csv("../data/processed/bank_transactions_features.csv", index=False)


In [7]:
# import pandas as pd
# from sklearn.metrics import classification_report

# df = pd.read_csv("../data/processed/bank_transactions_features.csv")

# df['rule_fraud'] = 0
# df.loc[
#     (df['Transaction_Amount'] > 2000) &
#     (df['Transaction_Hour'].isin([0,1,2,3,4])) &
#     (df['Transaction_Device_Voice Assistant'] == 1),  # example USSD column
#     'rule_fraud'
# ] = 1

# print(classification_report(df['Is_Fraud'], df['rule_fraud']))


In [8]:
import pandas as pd
from sklearn.metrics import classification_report

# -------------------------------------------------
# Load features
# -------------------------------------------------
df = pd.read_csv("../data/processed/bank_transactions_features.csv")

# -------------------------------------------------
# Initialize rule score
# -------------------------------------------------
df["rule_score"] = 0

# -------------------------------------------------
# R1: Very high transaction amount
# -------------------------------------------------
df.loc[df["Transaction_Amount"] > 3000, "rule_score"] += 3
df.loc[df["Transaction_Amount"].between(1500, 3000), "rule_score"] += 2

# -------------------------------------------------
# R2: Night transactions (12AM â€“ 5AM)
# -------------------------------------------------
df.loc[df["Transaction_Hour"].isin([0,1,2,3,4,5]), "rule_score"] += 2

# -------------------------------------------------
# R3: Risky transaction devices
# -------------------------------------------------
risky_devices = [
    "Transaction_Device_USSD",
    "Transaction_Device_Voice Assistant",
    "Transaction_Device_Unknown"
]

for col in risky_devices:
    if col in df.columns:
        df.loc[df[col] == 1, "rule_score"] += 2

# -------------------------------------------------
# R4: Risky account types
# -------------------------------------------------
risky_accounts = [
    "Account_Type_Wallet",
    "Account_Type_Agent"
]

for col in risky_accounts:
    if col in df.columns:
        df.loc[df[col] == 1, "rule_score"] += 2

# -------------------------------------------------
# R5: Velocity proxy (multiple tx same hour)
# -------------------------------------------------
hour_tx_count = df.groupby("Transaction_Hour")["Transaction_Amount"].transform("count")
df.loc[hour_tx_count > hour_tx_count.quantile(0.95), "rule_score"] += 2

# -------------------------------------------------
# R6: High amount + night combo (strong signal)
# -------------------------------------------------
df.loc[
    (df["Transaction_Amount"] > 2000) &
    (df["Transaction_Hour"].isin([0,1,2,3,4])),
    "rule_score"
] += 3

# -------------------------------------------------
# Final rule-based decision
# -------------------------------------------------
FRAUD_THRESHOLD = 5
df["rule_fraud"] = (df["rule_score"] >= FRAUD_THRESHOLD).astype(int)

# -------------------------------------------------
# Evaluation
# -------------------------------------------------
print("Rule Score Distribution:")
print(df["rule_score"].value_counts().sort_index())

print("\nRule-Based Classification Report:")
print(classification_report(df["Is_Fraud"], df["rule_fraud"]))


Rule Score Distribution:
rule_score
0       2017
2       2998
3     131743
4        554
5      21270
6         21
7       1034
8      38754
9         10
10      1599
Name: count, dtype: int64

Rule-Based Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.69      0.80    189912
           1       0.05      0.31      0.09     10088

    accuracy                           0.67    200000
   macro avg       0.50      0.50      0.44    200000
weighted avg       0.90      0.67      0.76    200000

