# Fraud & Cyber-Threat Prediction — End-to-End Notebook
Goal: Predict fraudulent / cyber-risky transactions using transaction metadata, behavioral features, and market stress indicators.

Author: Milani Chikeka  

Seed: 42
---
Sections
1. Setup & imports  
2. Load dataset (Kaggle `creditcard.csv` recommended) or simulate synthetic transactions  
3. Market stress synthetic enrichment (USD/ZAR returns, VIX proxy, repo rate changes)  
4. Feature engineering (behavioral + transaction + stress features)  
5. Train/test split & imbalance handling  
6. Models: Logistic Regression baseline + LightGBM (main)  
7. Evaluation: ROC, PR, confusion matrix, business metrics  
8. Explainability: SHAP plots  
9. Save model & preprocessing pipeline


In [None]:
#Setup and imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
# Silence library warnings early so import-time deprecation noise stays out of the notebook output.
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
# scikit-learn is installed as "scikit-learn" but imported as "sklearn".
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (roc_auc_score, precision_recall_curve,average_precision_score ,confusion_matrix,classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Required third-party libraries: print an install hint, then re-raise the
# original ImportError so the notebook stops at the real cause.
try:
    import lightgbm as lgb
except ImportError:
    print("install lightgbm: pip install lightgbm")
    raise

try:
    import shap
except ImportError:
    print("install shap: pip install shap")
    raise

#This is for imbalance handling.
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("Install imbalanced-learn: pip install imbalanced-learn")
    raise

# Global seed used for numpy's random generator in this notebook.
RandoomSeed=42
np.random.seed(RandoomSeed)

#Style plotting
sns.set(style="whitegrid")
    

ModuleNotFoundError: No module named 'seaborn'

## 2. Load the data / create synthetic data
- Load the Kaggle dataset `creditcard.csv`; place it at `./data/creditcard.csv`.
- If the dataset is not present, a synthetic transaction dataset is created instead.

In [None]:
#In this cell, the dataset is loaded from disk (or simulated when absent) and previewed.
DataDir="data"
os.makedirs(DataDir,exist_ok=True)
# The notebook instructions place the Kaggle file at ./data/creditcard.csv,
# so that is the name looked up here.
KaggleDataset=os.path.join(DataDir, "creditcard.csv")

# Origin used to turn relative offsets into synthetic timestamps.
StartDate=datetime(2020,1,1)

if os.path.exists(KaggleDataset):
    print("Loading dataset from local directory.")
    df=pd.read_csv(KaggleDataset)
    #This branch reads the "Time", "Amount" and "Class" columns of the CSV.
    df=df.reset_index(drop=True)
    # Number of rows; needed as the size of every random categorical draw below.
    n=len(df)
    #A synthetic timestamp is created: "Time" is treated as whole seconds after StartDate.
    df['timestamp']=pd.Timestamp(StartDate) + pd.to_timedelta(df['Time'].astype('int64'), unit='s')
    #Synthetic categorical features are attached for demonstration.
    df['DeviceType']=np.random.choice(['Mobile','Desktop','Tablet'], size=n, p=[0.5,0.1,0.4])
    df['Browser']=np.random.choice(['Chrome', 'Firefox', 'Safari', 'Edge'], size=n, p=[0.4,0.1,0.4,0.1])
    df['MerchantCategory']=np.random.choice(['Retail', 'Food', 'Travel', 'Entertainment','Health'], size=n, p=[0.3,0.2,0.2,0.2,0.1])
    df['Country']=np.random.choice(['ZA','UK', 'US', 'CHN', 'IND'], size=n, p=[0.3,0.2,0.2,0.2,0.1])
    df=df.rename(columns={'Amount':'amount','Class':'is_fraud'})

else:
    print("Dataset not found. Will create a synthetic dataset.")
    n=20000
    #Simulate the user base
    users=np.random.randint(1,20000, size=n)
    # Exponential inter-arrival gaps (mean 1 hour) accumulated into increasing timestamps.
    timestamps=[StartDate + timedelta(seconds=int(x))
                for x in np.random.exponential(scale=3600, size=n).cumsum()]
    amounts=np.random.exponential(scale=100, size=n)
    #Fraud labels are drawn purely at random (independent of amount), at a rate of about 0.2%.
    IsFraud=(np.random.rand(n) < 0.002).astype(int)
    DeviceType=np.random.choice(['Mobile','Desktop', 'Tablet'], size=n, p=[0.5,0.1,0.4])
    Browser=np.random.choice(['Chrome', 'Firefox', 'Safari', 'Edge'], size=n, p=[0.4,0.1,0.4,0.1])
    MerchantCategory=np.random.choice(['Retail', 'Food', 'Travel', 'Entertainment','Health'], size=n, p=[0.3,0.2,0.2,0.2,0.1])
    Country=np.random.choice(['ZA','UK', 'US', 'CHN', 'IND'], size=n, p=[0.3,0.2,0.2,0.2,0.1])

    # Built inside the else-branch so a loaded Kaggle frame is never overwritten.
    df=pd.DataFrame({
        'users': users,
        'timestamp': timestamps,
        'amount': amounts,
        'is_fraud': IsFraud,
        'DeviceType': DeviceType,
        'Browser': Browser,
        'MerchantCategory': MerchantCategory,
        'Country': Country,
    })


df.head()

## 3. Synthetic Market Stress Enrichment
We will create a daily market stress series with:
- USD/ZAR returns
- VIX proxy (global vol)
- SARB repo rate change flags

Then merge the daily metrics onto each transaction by date.

In [None]:
#Create market stress data
#Create a daily date range that covers the transaction timestamps
min_date=df['timestamp'].min().date()
max_date=df['timestamp'].max().date()
dates=pd.date_range(start=min_date, end=max_date)

# Simulate USD/ZAR daily log-returns (random walk with occasional shocks)
np.random.seed(42)
UsdZarLog=np.random.normal(loc=0, scale=0.01, size=len(dates))
#Pick about 5% of the days as shock days. The sample size must not exceed
#len(dates) because the draw is without replacement.
Shock=np.random.choice(len(dates), size=int(0.05*len(dates)), replace=False)
UsdZarLog[Shock] += np.random.normal(loc=0.05, scale=0.02, size=len(Shock))

#Cumulate the log-returns into a price level starting around 16.76
UsdZarLevel=16.76*np.exp(np.cumsum(UsdZarLog))

#VIX index simulation: noise around 12, pushed up on shock days, floored at 10
VixIndex=np.abs(np.random.normal(loc=12, size=len(dates)))
VixIndex[Shock] += np.random.normal(10, 5, size=len(Shock))
VixIndex=np.clip(VixIndex,10,None)

#Repo rate simulation, stored as a fraction (0.0675 = 6.75%)
RepoRate=np.full(len(dates),0.0675 )
ChangeIndixes=np.random.choice(len(dates), size=int(0.002*len(dates)), replace=False)
for idx in ChangeIndixes:
    #Step changes of +/-25 or +/-50 basis points, expressed as fractions to match RepoRate's units.
    #Each step applies from the change day onward.
    RepoRate[idx:] += np.random.choice([0.0025, -0.0025, 0.005, -0.005])

MarketData=pd.DataFrame({
    'date': dates,
    'UsdZar': UsdZarLevel,
    'VixIndex': VixIndex,
    'RepoRate': RepoRate
})

#Compute the returns, rolling volatility and rolling means
MarketData['UsdZarReturn']=MarketData['UsdZar'].pct_change().fillna(0)
MarketData['UsdZarReturn7DayVol']=MarketData['UsdZarReturn'].rolling(7, min_periods=1).std().fillna(0)
#The 7-day mean of the USD/ZAR level, matching the column name.
MarketData['UsdZar7DayMean']=MarketData['UsdZar'].rolling(7, min_periods=1).mean()
#The 7-day VIX mean is kept as its own, correctly named column.
MarketData['Vix7DayMean']=MarketData['VixIndex'].rolling(7, min_periods=1).mean()
#Mark stress when the absolute daily USD/ZAR return exceeds 2% or the VIX proxy is above its 90th percentile
MarketData['MarketStress']=((MarketData['UsdZarReturn'].abs() > 0.02) | (MarketData['VixIndex'] > MarketData['VixIndex'].quantile(0.90))).astype(int)

MarketData.head()

Merge market features into transactions (by date).


In [None]:
#Merge market data with transaction data by date
#Midnight-normalised timestamp, used as the join key against the daily market table.
df['date']=df['timestamp'].dt.normalize()
df=df.merge(MarketData, on='date', how='left')

#Fill market values for transactions whose date had no market row:
#carry the previous row's values forward, then fall back to 0 for leading gaps.
#The columns are taken from MarketData itself so the list cannot drift out of sync with it.
MarketCols=[c for c in MarketData.columns if c != 'date']
df[MarketCols]=df[MarketCols].ffill().fillna(0)
df.head()

## 4. Feature engineering — transactional & behavioral
We create:
- time features (hour, weekday)
- log amount
- rolling features per user (1h / 24h counts, avg amounts)
- device-country mismatch flag
- anomaly score: amount compared to user's historical mean


In [None]:
#Feature Engineering
#Hour of day and day of week taken from the transaction timestamp.
df['hour']=pd.to_datetime(df['timestamp']).dt.hour
df['weekday']=pd.to_datetime(df['timestamp']).dt.weekday
#log1p keeps zero-amount transactions finite (np.log(0) would give -inf).
df['AmountLog']=np.log1p(df['amount'])

if 'UserId' not in df.columns:
    if 'users' in df.columns:
        #Reuse the simulated user base when the synthetic dataset provides one.
        df['UserId']=df['users']
    else:
        #create fake user ids if not present in the dataset
        df['UserId']=np.random.randint(1, 20000, size=len(df))

#Sort by user and timestamp; done unconditionally so per-user, time-ordered
#computations see ordered rows even when UserId already existed.
df=df.sort_values(by=['UserId', 'timestamp']).reset_index(drop=True)

# Rolling features (user-level)
# Per-user transaction counts over the trailing 1 hour and 24 hours are
# computed from the timestamps by the helper defined below.

def build_users_rolling_features(df: pd.DataFrame, seconds_window: int) -> None:
    """Add a per-user trailing-window transaction count column to ``df`` in place.

    For every row, counts the same user's earlier transactions whose timestamp
    lies within ``seconds_window`` seconds of the row's timestamp. The current
    transaction itself is not counted. The result is written to the column
    ``UserTransCount_{seconds_window // 3600}h``.

    Rows may be in any order: counts are computed on a per-user,
    time-sorted view and written back to each row's own position.

    Args:
        df: Frame with a ``UserId`` column and a datetime-like ``timestamp``
            column. Modified in place.
        seconds_window: Length of the look-back window in seconds.

    Returns:
        None. The new column is added to ``df``.
    """
    col_name=f'UserTransCount_{seconds_window//3600}h'
    # Whole seconds since the epoch, independent of the column's datetime resolution.
    seconds=pd.to_datetime(df['timestamp']).to_numpy().astype('datetime64[s]').astype('int64')
    counts=np.zeros(len(df), dtype='int64')

    # .indices maps each user to the integer row positions of that user's transactions.
    for positions in df.groupby('UserId', sort=False).indices.values():
        # Row positions of this user, ordered by time (stable for ties).
        ordered=positions[np.argsort(seconds[positions], kind='stable')]
        user_seconds=seconds[ordered]
        # First transaction still inside the window ending at each timestamp.
        window_start=np.searchsorted(user_seconds, user_seconds - seconds_window, side='left')
        counts[ordered]=np.arange(len(user_seconds)) - window_start

    df[col_name]=counts
    
#Per-user transaction counts over the trailing 1 hour and 24 hours
#(adds the columns UserTransCount_1h and UserTransCount_24h in place).
build_users_rolling_features(df, 3600)
build_users_rolling_features(df, 24*3600)

#User historical stats (mean and std of amount)
#NOTE(review): these are computed over every row of df, i.e. before any train/test split.
UserStats=df.groupby('UserId')['amount'].agg(['mean','std']).rename(columns={'mean':'UserAmountMean','std':'UserAmountStd'})
#join(on=...) matches df's UserId column against UserStats' index and keeps df's row order.
df=df.join(UserStats, on='UserId', how='left')

# Amount anomaly score (z-score); a zero or missing std yields a score of 0
df['AmtZScore']=(df['amount']-df['UserAmountMean'])/df['UserAmountStd'].replace(0,np.nan)
df['AmtZScore']=df['AmtZScore'].fillna(0)

# Country mismatch flag: the user's primary country is taken to be their most frequent country
UserCountry=df.groupby('UserId')['Country'].agg(lambda x: x.mode()[0]).rename('UserPrimaryCountry')
df=df.join(UserCountry, on='UserId', how='left')
df['DeviceCountryMismatch']=(df['Country'] != df['UserPrimaryCountry']).astype(int)

#Flag night transactions (12 AM up to, but not including, 6 AM: hours 0-5)
df['IsNightTransaction']=df['hour'].between(0, 5).astype(int)

#Candidate model features; every name here is a column that exists on df at this point
FeatCots=['amount', 'AmountLog', 'hour', 'weekday', 'UserTransCount_1h', 'UserTransCount_24h',
             'UserAmountMean','AmtZScore','DeviceType','MerchantCategory','Country',
             'UsdZarReturn','UsdZarReturn7DayVol','VixIndex','UsdZar7DayMean','RepoRate','MarketStress','IsNightTransaction','DeviceCountryMismatch']
len(df), df[FeatCots].head()

## 5. Prepare training dataset
- Choose modeling features.
- Encode categoricals using a ColumnTransformer pipeline.
- Train/test split stratified by label.
