In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ==========================================
# 0. DATA GENERATION (Simulating Source Files)
# ==========================================
def generate_source_files():
    """Write the two demo CSV inputs used by the pipeline to the working directory.

    Creates ``store_sales_header.csv`` (one row per sale) and
    ``refund_transactions.csv`` (one row per refund request). The refund rows
    include mismatched customers, mismatched payment modes, oversized amounts,
    a late refund and repeated refunds against the same sale.
    """
    print("[SOURCE] Generating dummy data files...")

    # SALES HEADER (The Truth) -- one tuple per sale
    sales_columns = ['transaction_id', 'customer_id', 'date', 'payment_mode', 'total_amount']
    sales_rows = [
        ('T1001', 'C001', '2023-10-01', 'Credit', 100.0),
        ('T1002', 'C002', '2023-10-02', 'UPI', 500.0),
        ('T1003', 'C003', '2023-10-05', 'Cash', 50.0),
        ('T1004', 'C001', '2023-10-10', 'Credit', 200.0),
        ('T1005', 'C004', '2023-10-15', 'UPI', 1000.0),
        ('T1006', 'C005', '2023-10-20', 'Credit', 120.0),
    ]

    # REFUND TRANSACTIONS (The Target) -- one tuple per refund request.
    # 'refund_mode' is present so the payment-mode mismatch rule can be tested
    # (R002 was bought with UPI but refunded as Cash).
    refund_columns = [
        'refund_id', 'original_transaction_id', 'customer_id', 'refund_amount',
        'refund_date', 'refund_mode', 'reason',
    ]
    refund_rows = [
        ('R001', 'T1001', 'C001', 100.0, '2023-10-03', 'Credit', 'Damaged'),
        ('R002', 'T1002', 'C002', 600.0, '2023-10-04', 'Cash', 'Size'),
        ('R003', 'T1003', 'C999', 50.0, '2023-10-06', 'Cash', 'Mind Change'),
        ('R004', 'T1001', 'C001', 100.0, '2023-10-03', 'Credit', 'Duplicate'),
        ('R005', 'T1005', 'C004', 5000.0, '2023-10-16', 'UPI', 'Defect'),
        ('R006', 'T1006', 'C005', 120.0, '2023-12-25', 'Credit', 'Late'),
        ('R007', 'T1001', 'C001', 100.0, '2023-10-04', 'Credit', 'Duplicate'),
    ]

    sales_frame = pd.DataFrame(sales_rows, columns=sales_columns)
    refund_frame = pd.DataFrame(refund_rows, columns=refund_columns)

    sales_frame.to_csv('store_sales_header.csv', index=False)
    refund_frame.to_csv('refund_transactions.csv', index=False)
    print("Files Ready.")

# ==========================================
# 1. BRONZE LAYER (Raw Ingestion)
# ==========================================
def bronze_layer():
    """Load the raw sales and refund CSVs from the working directory.

    Returns:
        A ``(sales, refunds)`` tuple of DataFrames read from
        ``store_sales_header.csv`` and ``refund_transactions.csv``, with no
        cleaning applied.
    """
    print("\n [BRONZE] Ingesting Raw Data...")
    source_paths = ('store_sales_header.csv', 'refund_transactions.csv')
    sales_frame, refunds_frame = (pd.read_csv(path) for path in source_paths)
    return sales_frame, refunds_frame

# ==========================================
# 2. SILVER LAYER (Cleaning & Enrichment)
# ==========================================
def silver_layer(sales_df, refunds_df):
    """Parse dates and left-join each refund to the sale it refers to.

    The input frames are not modified: date parsing is done on copies so the
    raw (bronze) data stays exactly as it was read.

    Args:
        sales_df: Sales frame with at least ``transaction_id``, ``date`` and
            ``total_amount`` columns.
        refunds_df: Refund frame with at least ``original_transaction_id``
            and ``refund_date`` columns.

    Returns:
        One row per refund, carrying the matched sale's columns. Columns
        present in both inputs get ``_refund`` / ``_sales`` suffixes. Refunds
        with no matching sale keep NaN/NaT in the sale columns, except
        ``total_amount`` which is filled with 0.
    """
    print("[SILVER] Cleaning and Joining Data...")

    # 1. Type conversion on copies -- ``assign`` returns a new frame, so the
    #    caller's DataFrames keep their original string dates.
    sales_clean = sales_df.assign(date=pd.to_datetime(sales_df['date']))
    refunds_clean = refunds_df.assign(
        refund_date=pd.to_datetime(refunds_df['refund_date'])
    )

    # 2. Enrichment (the join): keep every refund, attach sale context
    #    (original amount, customer, date) where a sale exists.
    silver_df = pd.merge(
        refunds_clean,
        sales_clean,
        left_on='original_transaction_id',
        right_on='transaction_id',
        how='left',
        suffixes=('_refund', '_sales'),
    )

    # 3. Basic integrity: a refund without a matching sale gets an original
    #    amount of 0 rather than NaN.
    silver_df['total_amount'] = silver_df['total_amount'].fillna(0)

    return silver_df

# ==========================================
# 3. GOLD LAYER (Business Logic & Fraud Flags)
# ==========================================
def _append_fraud_reason(frame, mask, reason):
    """Mark the rows selected by ``mask`` as fraud and append ``reason`` to their reason text."""
    frame.loc[mask, 'fraud_reasons'] += reason
    frame.loc[mask, 'is_fraud'] = True


def gold_layer(df, return_window_days=30, z_threshold=1.96):
    """Apply the fraud rules to the joined refund/sale frame.

    Works on a copy of ``df`` and adds four columns:

    * ``fraud_reasons`` -- concatenated ``"<reason>; "`` strings, in rule order.
    * ``is_fraud`` -- True when at least one rule fired.
    * ``days_diff`` -- whole days between ``date`` and ``refund_date``.
    * ``z_score`` -- z-score of ``refund_amount`` over all rows (float; 0.0 for
      every row when the standard deviation is zero or undefined).

    Args:
        df: Output of the silver layer. Must contain ``refund_id``,
            ``original_transaction_id``, ``refund_amount``, ``total_amount``,
            ``customer_id_refund``, ``customer_id_sales``, ``refund_mode``,
            ``payment_mode``, ``refund_date`` and ``date``.
        return_window_days: Refunds made more than this many days after the
            sale are flagged as outside the return window.
        z_threshold: Rows whose ``|z_score|`` exceeds this value are flagged
            as statistical outliers. The default 1.96 is the two-sided 95%
            critical value of a normal distribution.

    Returns:
        The annotated copy of ``df``.
    """
    print("[GOLD] Applying Fraud Logic & Anomaly Detection...")

    # Work on a copy so the silver frame stays untouched.
    gold_df = df.copy()
    gold_df['fraud_reasons'] = ""
    gold_df['is_fraud'] = False

    # --- RULE 1: Refund Amount > Original Amount ---
    _append_fraud_reason(
        gold_df,
        gold_df['refund_amount'] > gold_df['total_amount'],
        "Value > Original; ",
    )

    # --- RULE 2: Customer Identity Mismatch ---
    # A missing sale (NaN customer) also compares unequal, so refunds without
    # a matching sale are reported here.
    _append_fraud_reason(
        gold_df,
        gold_df['customer_id_refund'] != gold_df['customer_id_sales'],
        "Identity Mismatch; ",
    )

    # --- RULE 3: Payment Mode Mismatch ---
    # Only meaningful when the original sale exists; missing sales are already
    # covered by the identity rule.
    mode_differs = gold_df['refund_mode'] != gold_df['payment_mode']
    _append_fraud_reason(
        gold_df,
        mode_differs & gold_df['payment_mode'].notna(),
        "Payment Mode Mismatch; ",
    )

    # --- RULE 4: Return window ---
    gold_df['days_diff'] = (gold_df['refund_date'] - gold_df['date']).dt.days
    _append_fraud_reason(
        gold_df,
        gold_df['days_diff'] > return_window_days,
        "Return Window Expired; ",
    )

    # --- RULE 5: Repeated refunds against the same sale ---
    # Every refund that shares its original transaction with another refund
    # is flagged, including the first one.
    refunds_per_sale = gold_df.groupby('original_transaction_id')['refund_id'].transform('count')
    _append_fraud_reason(gold_df, refunds_per_sale > 1, "Duplicate Refund Request; ")

    # --- RULE 6: Statistical anomaly (z-score of refund amount) ---
    amount_mean = gold_df['refund_amount'].mean()
    amount_std = gold_df['refund_amount'].std()

    if amount_std > 0:
        gold_df['z_score'] = (gold_df['refund_amount'] - amount_mean) / amount_std
    else:
        # Zero or undefined (NaN) spread: nothing can be an outlier. Use a
        # float so the column dtype matches the computed branch.
        gold_df['z_score'] = 0.0

    # The test is two-sided, so label each tail for what it is rather than
    # calling an unusually small refund "High Value".
    _append_fraud_reason(
        gold_df,
        gold_df['z_score'] > z_threshold,
        "Statistical Outlier (High Value); ",
    )
    _append_fraud_reason(
        gold_df,
        gold_df['z_score'] < -z_threshold,
        "Statistical Outlier (Low Value); ",
    )

    return gold_df

# ==========================================
# 4. ORCHESTRATOR
# ==========================================
def run_pipeline():
    """Run the demo end to end: generate inputs, run bronze/silver/gold, report.

    Prints the flagged refunds and writes the same columns to
    ``fraud_flags.csv`` in the working directory.
    """
    # Setup: create the demo input files
    generate_source_files()

    # Execution flow: bronze -> silver -> gold
    raw_sales, raw_refunds = bronze_layer()
    scored = gold_layer(silver_layer(raw_sales, raw_refunds))

    # Reporting: keep only rows where at least one rule fired
    report_columns = [
        'refund_id',
        'customer_id_refund',
        'refund_amount',
        'refund_mode',
        'fraud_reasons',
        'z_score',
    ]
    flagged = scored[scored['is_fraud']]
    report = flagged[report_columns]

    print("\n  FRAUD DETECTION REPORT ")
    print(report.to_string(index=False))

    # Optional: save for submission
    report.to_csv('fraud_flags.csv', index=False)
    print("\n Final Report saved to 'fraud_flags.csv'")

# Run it -- only when executed directly (script or notebook cell), so that
# importing this module does not write files or print a report.
if __name__ == "__main__":
    run_pipeline()

[SOURCE] Generating dummy data files...
Files Ready.

 [BRONZE] Ingesting Raw Data...
[SILVER] Cleaning and Joining Data...
[GOLD] Applying Fraud Logic & Anomaly Detection...

  FRAUD DETECTION REPORT 
refund_id customer_id_refund  refund_amount refund_mode                                        fraud_reasons   z_score
     R001               C001          100.0      Credit                           Duplicate Refund Request;  -0.418684
     R002               C002          600.0        Cash            Value > Original; Payment Mode Mismatch;  -0.145799
     R003               C999           50.0        Cash                                  Identity Mismatch;  -0.445972
     R004               C001          100.0      Credit                           Duplicate Refund Request;  -0.418684
     R005               C004         5000.0         UPI Value > Original; Statistical Outlier (High Value);   2.255590
     R006               C005          120.0      Credit                             