# Fraud & Cyber-Threat Prediction — End-to-End Notebook
Goal: Predict fraudulent / cyber-risky transactions using transaction metadata, behavioral features, and market stress indicators.

Author: Milani Chikeka  

Seed: 42
---
Sections
1. Setup & imports  
2. Load dataset (Kaggle `creditcard.csv` recommended) or simulate synthetic transactions  
3. Market stress synthetic enrichment (USD/ZAR returns, VIX proxy, repo rate changes)  
4. Feature engineering (behavioral + transaction + stress features)  
5. Train/test split & imbalance handling  
6. Models: Logistic Regression baseline + LightGBM (main)  
7. Evaluation: ROC, PR, confusion matrix, business metrics  
8. Explainability: SHAP plots  
9. Save model & preprocessing pipeline


In [None]:
#Setup and imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
# Silence library warnings up front so the imports below do not clutter cell output.
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
# scikit-learn is installed as "scikit-learn" but its import name is "sklearn".
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (roc_auc_score, precision_recall_curve,average_precision_score ,confusion_matrix,classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Required extras: print an install hint, then re-raise so the notebook
# stops here instead of failing later with a confusing NameError.
try:
    import lightgbm as lgb
except ImportError:
    print("install lightgbm: pip install lightgbm")
    raise

try:
    import shap
except ImportError:
    print("install shap: pip install shap")
    raise

#This is for imbalance handling.
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("Install imbalanced-learn: pip install imbalanced-learn")
    raise

# Global seed for NumPy's legacy RNG so synthetic data is reproducible.
# (Name kept as-is because other cells may reference it.)
RandoomSeed=42
np.random.seed(RandoomSeed)

#Style plotting
sns.set(style="whitegrid")
    

ModuleNotFoundError: No module named 'seaborn'

# 2. Load the data / create synthetic data
- Load the Kaggle dataset `creditcard.csv`; place it at `./data/creditcard.csv`.
- If the dataset is not present, a synthetic transaction dataset is created instead.

In [None]:
#In this cell, the dataset is loaded (or simulated when absent) and previewed.
DataDir="data"
os.makedirs(DataDir,exist_ok=True)
# The Kaggle credit-card fraud file is expected at ./data/creditcard.csv.
KaggleDataset=os.path.join(DataDir, "creditcard.csv")
# Anchor date used to turn relative second offsets into wall-clock timestamps.
StartDate=datetime(2020,1,1)

if os.path.exists(KaggleDataset):
    print("Loading dataset from local directory.")
    df=pd.read_csv(KaggleDataset)
    #The code below relies on the "Time", "Amount" and "Class" columns.
    df=df.reset_index(drop=True)
    n=len(df)
    #"Time" is treated as whole seconds elapsed since StartDate.
    df['timestamp']=pd.Timestamp(StartDate) + pd.to_timedelta(df['Time'].astype(int), unit='s')
    #Synthetic categorical features, sampled independently of the label (demo only).
    df['DeviceType']=np.random.choice(['Mobile','Desktop','Tablet'], size=n, p=[0.5,0.1,0.4])
    df['Browser']=np.random.choice(['Chrome', 'Firefox', 'Safari', 'Edge'], size=n, p=[0.4,0.1,0.4,0.1])
    df['MerchantCategory']=np.random.choice(['Retail', 'Food', 'Travel', 'Entertainment','Health'], size=n, p=[0.3,0.2,0.2,0.2,0.1])
    df['Country']=np.random.choice(['ZA','UK', 'US', 'CHN', 'IND'], size=n, p=[0.3,0.2,0.2,0.2,0.1])
    df=df.rename(columns={'Amount':'amount','Class':'is_fraud'})
    # NOTE(review): unlike the synthetic branch, no 'users' column is created
    # here - confirm whether later per-user features need one.

else:
    print("Dataset not found. Will create a synthetic dataset.")
    n=20000
    #Simulate the users base
    users=np.random.randint(1,20000, size=n)
    #Cumulative exponential gaps (mean 1 hour) give strictly ordered timestamps.
    timestamps=[StartDate + timedelta(seconds=int(x))
                for x in np.random.exponential(scale=3600, size=n).cumsum()]
    amounts=np.random.exponential(scale=100, size=n)
    #Labels are pure Bernoulli noise (around 0.2% frauds), independent of every feature.
    IsFraud=(np.random.rand(n) < 0.002).astype(int)
    DeviceType=np.random.choice(['Mobile','Desktop', 'Tablet'], size=n, p=[0.5,0.1,0.4])
    Browser=np.random.choice(['Chrome', 'Firefox', 'Safari', 'Edge'], size=n, p=[0.4,0.1,0.4,0.1])
    MerchantCategory=np.random.choice(['Retail', 'Food', 'Travel', 'Entertainment','Health'], size=n, p=[0.3,0.2,0.2,0.2,0.1])
    Country=np.random.choice(['ZA','UK', 'US', 'CHN', 'IND'], size=n, p=[0.3,0.2,0.2,0.2,0.1])

    # Assemble the frame inside this branch only, so a loaded Kaggle frame
    # is never overwritten by synthetic data.
    df=pd.DataFrame({
        'users': users,
        'timestamp': timestamps,
        'amount': amounts,
        'is_fraud': IsFraud,
        'DeviceType': DeviceType,
        'Browser': Browser,
        'MerchantCategory': MerchantCategory,
        'Country': Country,
    })


df.head()

# 3. Synthetic Market Stress Enrichment
We will create a daily market stress series with:
- USD/ZAR returns
- VIX proxy (global vol)
- SARB repo rate change flags

Then merge the daily metrics onto each transaction by date.

In [None]:
#Create market stress data
#Create a daily date range that covers the transaction timestamps
min_date=df['timestamp'].min().date()
max_date=df['timestamp'].max().date()
dates=pd.date_range(start=min_date, end=max_date)

# Simulate USD/ZAR daily log-returns (random walk with occasional shocks).
# Re-seeding here makes the market series reproducible on its own.
np.random.seed(42)
UsdZarLog=np.random.normal(loc=0, scale=0.01, size=len(dates))
# Pick ~5% of the days (without repetition) as shock days. The sample size
# must not exceed len(dates), otherwise np.random.choice raises ValueError
# when replace=False.
Shock=np.random.choice(len(dates), size=int(len(dates)*0.05), replace=False)
UsdZarLog[Shock] += np.random.normal(loc=0.05, scale=0.02, size=len(Shock))

# Compound the log-returns into a price path starting around 16.76.
# From this line on, UsdZarLog holds the exchange-rate level, not log-returns.
UsdZarLog=16.76*np.exp(np.cumsum(UsdZarLog))

