### **Fraud Detection Predictive Model**
Goal: Build two separate classifiers that will later be stacked into a single, stronger predictive model.

TabNet will be used as one of the models; it supports class weighting through the `loss_fn` parameter. The second model should therefore also support class weighting directly, so that stacking works effectively.

To complement TabNet strengths, gradient boosting methods emerge as the top candidate.

First, load necessary libraries

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import xgboost as xgb
from xgboost import XGBClassifier
#!pip install imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, f1_score, precision_score, recall_score

In [None]:
# Transactions produced by the earlier cleaning step
df = pd.read_csv('cleaned_data.csv')

# MCC lookup stored as a JSON object keyed by merchant category code
json_file_path = "/Users/jiajue/Documents/McGill/Winter Term/INSY695/Group project/Fraud data/mcc_codes.json"
with open(json_file_path, "r") as mcc_handle:
    mcc = json.load(mcc_handle)

# One row per code: the JSON keys land in 'index' (cast to int so they match df['mcc']),
# the JSON values land in column 0
mcc = pd.DataFrame.from_dict(mcc, orient='index').reset_index()
mcc['index'] = mcc['index'].astype(int)

# Left-join the lookup onto the transactions, name the joined value column
# and discard the duplicated join key
df = (
    df.merge(mcc, left_on='mcc', right_on='index', how='left')
    .rename(columns={0: 'merchant information'})
    .drop(columns=['index'])
)

df.info()

In [None]:
# Lift the display limit so every column is shown, then summarise the columns
pd.options.display.max_columns = None
df.describe()

In [None]:
# Parse the timestamp and keep the day of week per transaction; it feeds the
# individual purchase-behaviour features built later
df['date'] = pd.to_datetime(df['date'])
df['transaction_day'] = df['date'].dt.dayofweek

# Sum of the fraud flag per calendar year
frauds_time = df.groupby(pd.Grouper(key='date', freq='Y'))['Target'].sum()

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=frauds_time.index, y=frauds_time.values, ax=ax)
ax.set_title('Fraud Cases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Fraud Cases')
plt.show()

# No value in keeping date as it is

In [None]:
# Remove columns that are not needed downstream; they are selected by position,
# so this cell depends on the column order produced by the cells above
irrelevant_columns = df.columns[[0, 3, 5, 8, 12, 13]]
df = df.drop(columns=irrelevant_columns)
df.info()

#### **Feature engineering**

#### Assess from merchant pov

In [None]:
# Fraud rate per merchant category: the mean of the fraud flag, highest first
fraud_mcc = df.groupby('merchant information')['Target'].mean().sort_values(ascending=False)

# Visualise fraud rate for each merchant category.
# The plotted value is a rate (mean of Target), so the title and y-axis say "rate", not "count".
plt.figure(figsize=(12, 6))
sns.barplot(x=fraud_mcc.index, y=fraud_mcc.values)
plt.title('Fraud Rate by MCC')
plt.xlabel('MCC')
plt.ylabel('Fraud Rate')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Binary col for high risk mcc - disregarded this in favour of a risk score below

# Filter out high risk mcc according to fraud rate with top 10% as cut off, and transaction amount with whisker as cut off, and the freq of fraud 
def high_risk_mcc_by_transact_type(df, fraud_top_frac=0.1, high_amount_ratio_threshold=0.77):
    """
    Lists MCCs that look high risk within at least one transaction type.
    Rationale: Amount quartiles are different across transaction types (refer to EDA), so the
    high risk MCCs can differ significantly from one type to another. Each type is screened
    separately and the per-type results are combined.

    An MCC is selected for a transaction type when either
      - its fraud rate (mean of Target) is within the top `fraud_top_frac` share of MCCs, or
      - the share of its transactions at or above its own upper whisker
        (Q3 + 1.5 * IQR of amount) is greater than `high_amount_ratio_threshold`.

    Parameters:
        df: DataFrame with 'merchant information', 'Target', 'amount' and the indicator columns
            'use_chip_Chip Transaction', 'use_chip_Online Transaction', 'use_chip_Swipe Transaction'.
        fraud_top_frac: Share of MCCs (ranked by fraud rate) kept per transaction type.
        high_amount_ratio_threshold: Minimum (exclusive) share of high-amount transactions.
            NOTE(review): the original comment said 75% while the code used 0.77; the default
            keeps the value the code actually used - confirm which one was intended.
    Returns: List of MCCs to filter out (no duplicates, order not guaranteed).
    """
    transaction_types = ['Chip Transaction', 'Online Transaction', 'Swipe Transaction']
    high_risk_mccs = set()

    for t in transaction_types:
        # Split into respective transaction type
        df_type = df[df['use_chip_' + t] == 1]
        if df_type.empty:
            # Nothing to rank for a transaction type that does not occur
            continue

        merchant_key = df_type['merchant information']
        by_merchant = df_type.groupby('merchant information')

        # Highest fraud rates first; keep the top share
        fraud_mcc = by_merchant['Target'].mean().sort_values(ascending=False)
        top_n = int(len(fraud_mcc) * fraud_top_frac)
        high_risk_mccs.update(fraud_mcc.head(top_n).index)

        # Upper whisker of the amount distribution per merchant
        q1 = by_merchant['amount'].quantile(0.25)
        q3 = by_merchant['amount'].quantile(0.75)
        upper_whisker = q3 + 1.5 * (q3 - q1)

        # Share of each merchant's transactions at or above its own whisker
        is_high_amount = df_type['amount'] >= merchant_key.map(upper_whisker)
        high_amount_ratio = is_high_amount.groupby(merchant_key).sum() / by_merchant.size()

        flagged_by_amount = high_amount_ratio[high_amount_ratio > high_amount_ratio_threshold]
        high_risk_mccs.update(flagged_by_amount.index)

    # The set already merged the MCCs across all transaction types
    return list(high_risk_mccs)

#high_risk_mccs = high_risk_mcc_by_transact_type(df)

#print(f'High risk merchants include: {high_risk_mccs}')

# Create new column for high risk merchant
#df['high_risk_merchant'] = df['merchant information'].isin(high_risk_mccs).astype(int)

In [None]:
# Assign risk score to each merchant based on fraud rate and transaction amount instead of a binary col that indicates if the merchant is high risk or not to capture more information about each merch
def calculate_merchant_risk_score(df):
    """
    Calculates a continuous risk score for each merchant from its fraud rate and the share of
    its transactions at or above the upper whisker (Q3 + 1.5 * IQR) of its own amounts.

    Both factors are min-max normalised within each transaction type, combined with weights
    0.6 (fraud rate) and 0.4 (high-amount share), and the highest score across transaction
    types is kept per merchant. A factor that is constant within a transaction type carries
    no ranking information and is normalised to 0 instead of dividing by zero.

    Parameters:
        df: DataFrame with 'merchant information', 'Target', 'amount' and the indicator columns
            'use_chip_Chip Transaction', 'use_chip_Online Transaction', 'use_chip_Swipe Transaction'.
    Returns: DataFrame with columns 'merchant information' and 'risk_score'.
    """
    transaction_types = ['Chip Transaction', 'Online Transaction', 'Swipe Transaction']
    weights = {'fraud_rate': 0.6, 'high_amount_ratio': 0.4}  # Adjust weights as needed

    def _min_max(values):
        # Normalise to a 0-1 scale; a column without spread would give 0/0 = NaN
        lowest = values.min()
        spread = values.max() - lowest
        if not spread > 0:
            return pd.Series(0.0, index=values.index)
        return (values - lowest) / spread

    per_type_scores = []

    for t in transaction_types:
        # Split into respective transaction type
        df_type = df[df['use_chip_' + t] == 1]
        if df_type.empty:
            # A transaction type that does not occur contributes no scores
            continue

        merchant_key = df_type['merchant information']
        by_merchant = df_type.groupby('merchant information')

        # Fraud rate for each merchant
        fraud_rate = by_merchant['Target'].mean()

        # Upper whisker of the amount distribution per merchant
        q1 = by_merchant['amount'].quantile(0.25)
        q3 = by_merchant['amount'].quantile(0.75)
        upper_whisker = q3 + 1.5 * (q3 - q1)

        # Share of each merchant's transactions at or above its own whisker
        is_high_amount = df_type['amount'] >= merchant_key.map(upper_whisker)
        high_amount_ratio = is_high_amount.groupby(merchant_key).sum() / by_merchant.size()

        merchant_risk = pd.DataFrame({
            'fraud_rate': fraud_rate,
            'high_amount_ratio': high_amount_ratio,
        })
        merchant_risk['risk_score'] = (
            _min_max(merchant_risk['fraud_rate']) * weights['fraud_rate'] +
            _min_max(merchant_risk['high_amount_ratio']) * weights['high_amount_ratio']
        )
        per_type_scores.append(merchant_risk['risk_score'])

    if not per_type_scores:
        return pd.DataFrame(columns=['merchant information', 'risk_score'])

    # Combine risk scores across all transaction types: keep the highest per merchant
    combined = pd.concat(per_type_scores).groupby(level=0).max()
    return combined.rename_axis('merchant information').reset_index()

# Calculate merchant risk scores
#merchant_risk_scores = calculate_merchant_risk_score(df)

# Merge risk scores back into the main dataframe
#df = pd.merge(df, merchant_risk_scores, on='merchant information', how='left')

# Display the updated dataframe
#print(df[['merchant information', 'risk_score']].head())

In [None]:
# Improved code chunk for merchant risk assessment
def calculate_merchant_risk_score(df):
    """
    Calculates a continuous risk score for each merchant based on fraud rate, high-value
    transaction frequency, and average transaction amount.

    Per transaction type the score is
        0.5 * fraud rate
      + 0.3 * share of transactions at or above the merchant's upper whisker (Q3 + 1.5 * IQR)
      + 0.2 * min-max normalised average transaction amount.
    Per-type results are then combined per merchant, weighting each transaction type by the
    merchant's number of transactions of that type.

    Parameters:
        df: DataFrame with 'merchant information', 'Target', 'amount' and the indicator columns
            'use_chip_Chip Transaction', 'use_chip_Online Transaction', 'use_chip_Swipe Transaction'.
    Returns: DataFrame with columns 'merchant information', 'risk_score', 'total_transactions'
             and 'avg_transaction_amt' (missing values filled with 0).
    """
    transaction_types = ['Chip Transaction', 'Online Transaction', 'Swipe Transaction']
    weights = {'fraud_rate': 0.5, 'high_amount_ratio': 0.3, 'avg_transaction_amt_norm': 0.2}  # Adjust weights as needed
    output_columns = ['merchant information', 'risk_score', 'total_transactions', 'avg_transaction_amt']
    per_type_scores = []

    for t in transaction_types:
        # Split into respective transaction type
        df_type = df[df['use_chip_' + t] == 1]
        if df_type.empty:
            # A transaction type that does not occur contributes no scores
            continue

        merchant_key = df_type['merchant information']
        by_merchant = df_type.groupby('merchant information')

        # Fraud rate, transaction count and average amount per merchant
        merchant_risk = pd.DataFrame({
            'fraud_rate': by_merchant['Target'].mean(),
            'total_transactions': by_merchant.size(),
            'avg_transaction_amt': by_merchant['amount'].mean(),
        })

        # Upper whisker of the amount distribution per merchant
        q1 = by_merchant['amount'].quantile(0.25)
        q3 = by_merchant['amount'].quantile(0.75)
        upper_whisker = q3 + 1.5 * (q3 - q1)

        # Share of each merchant's transactions at or above its own whisker
        is_high_amount = df_type['amount'] >= merchant_key.map(upper_whisker)
        merchant_risk['high_amount_ratio'] = (
            is_high_amount.groupby(merchant_key).sum() / merchant_risk['total_transactions']
        )

        # Normalize avg_transaction_amt
        min_amt = merchant_risk['avg_transaction_amt'].min()
        max_amt = merchant_risk['avg_transaction_amt'].max()
        if max_amt > min_amt:
            merchant_risk['avg_transaction_amt_norm'] = (merchant_risk['avg_transaction_amt'] - min_amt) / (max_amt - min_amt)
        else:
            merchant_risk['avg_transaction_amt_norm'] = 0  # Handle division by zero

        # Fraud rate and high-amount share are already within 0-1, so only the amount is normalised
        merchant_risk['risk_score'] = (
            merchant_risk['fraud_rate'] * weights['fraud_rate'] +
            merchant_risk['high_amount_ratio'] * weights['high_amount_ratio'] +
            merchant_risk['avg_transaction_amt_norm'] * weights['avg_transaction_amt_norm']
        )

        per_type_scores.append(merchant_risk[['risk_score', 'total_transactions', 'avg_transaction_amt']])

    if not per_type_scores:
        return pd.DataFrame(columns=output_columns)

    # Combine across transaction types with a transaction-count weighted average:
    # sum(value * count) / sum(count), computed with plain group sums
    stacked = pd.concat(per_type_scores)
    weighted = pd.DataFrame({
        'weighted_risk': stacked['risk_score'] * stacked['total_transactions'],
        'weighted_amt': stacked['avg_transaction_amt'] * stacked['total_transactions'],
        'total_transactions': stacked['total_transactions'],
    })
    # skipna=False keeps a missing per-type value missing in the combined result
    # (it is turned into 0 by the fillna below) instead of silently averaging around it
    sums = weighted.groupby(level=0).agg(lambda col: col.sum(skipna=False))

    merchant_risk_scores = pd.DataFrame({
        'risk_score': sums['weighted_risk'] / sums['total_transactions'],
        'total_transactions': sums['total_transactions'].astype(float),
        'avg_transaction_amt': sums['weighted_amt'] / sums['total_transactions'],
    }).rename_axis('merchant information').reset_index()

    # Fill missing values if any
    merchant_risk_scores = merchant_risk_scores.fillna(0)

    return merchant_risk_scores[output_columns]

# Calculate merchant risk scores
# Score every merchant, then attach the merchant-level columns to each transaction
merchant_risk_scores = calculate_merchant_risk_score(df)
df = df.merge(merchant_risk_scores, on='merchant information', how='left')

# Quick look at the columns that were just added
merchant_feature_columns = ['merchant information', 'risk_score', 'total_transactions', 'avg_transaction_amt']
print(df[merchant_feature_columns].head())

In [None]:
# Preview the first rows of the frame after the merchant risk columns were merged in
df.head()

#### Assess from individual baseline purchase behaviours

In [None]:
# Assess based on offline and in person transaction behaviours: in person to consider distance as well. To consider amount, time and day of purchase in each transaction type
#!pip install geopy
from geopy.geocoders import Nominatim # very slow, will change to photon through api
from geopy.distance import geodesic
import requests
from sklearn.ensemble import IsolationForest
#!pip install swifter
import swifter

# Precompute merchant coordinates
#geolocator = Nominatim(user_agent="fraud_detection_app")

# Photon API endpoint
PHOTON_API_URL = 'https://photon.komoot.io/api/'

# Function to geocode a city using Photon API
def geocode_city(city, timeout=10):
    """
    Looks up the coordinates of a city through the Photon API.

    Parameters:
        city: Free-text query sent as the `q` parameter.
        timeout: Seconds to wait for the server before giving up; without a timeout a
                 stalled connection would block the caller indefinitely.
    Returns: (latitude, longitude) of the first result, or None when the request fails,
             the status is not 200, or no result is returned.
    """
    try:
        response = requests.get(
            PHOTON_API_URL,
            params={"q": city, "limit": 1},  # return only 1 relevant result for each city
            timeout=timeout,
        )
        if response.status_code != 200:
            return None

        results = response.json()
        if not results['features']:
            return None

        location = results['features'][0]['geometry']['coordinates']
        return (location[1], location[0])  # Photon returns (lon, lat)
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
        # Best effort: network errors, invalid JSON and unexpected payload shapes are
        # reported and treated as "not found" so a bulk geocoding loop keeps going
        print(f"Error geocoding {city}: {e}")
        return None

# Get unique cities
#unique_cities = df['merchant_city'].unique()
#city_coords = {}

#for city in unique_cities:
 #   coords = geocode_city(city)
  #  if coords:
   #     city_coords[city] = coords
    #else:
     #   print(f"Could not geocode city: {city}")

# Save city_coords to a file
#with open('city_coords.json', 'w') as f:
 #   json.dump(city_coords, f)

# Load precomputed coordinates
#with open('city_coords.json', 'r') as f:
 #   city_coords = json.load(f)

# Define the distance calculation function
def calculate_distance_merch(row, city_coords):
    """
    Geodesic distance in km between the client location of a transaction row
    ('latitude', 'longitude') and the coordinates stored for its 'merchant_city'.
    Returns: Distance in km, or None when the city has no entry in city_coords.
    """
    client_point = (row['latitude'], row['longitude'])
    city = row['merchant_city']

    # Guard clause: unknown cities cannot be measured
    if city not in city_coords:
        return None

    return geodesic(client_point, city_coords[city]).km

'''
COORDINATES OF MERCHANT CITY AND DISTANCE BETWEEN CLIENTS AND MERCHANT WILL NOT BE CALCULATED ANYMORE DUE TO THE NEED FOR OVERLY INTENSIVE COMPUTATIONAL RESOURCES 
'''

def _min_max_scale(values):
    """Rescales a numeric Series to 0-1; returns all zeros when the column has no spread."""
    lowest = values.min()
    spread = values.max() - lowest
    if not spread > 0:
        # Constant, empty or all-missing column: avoid 0/0, whose NaN result would
        # later make IsolationForest reject the feature matrix
        return pd.Series(0.0, index=values.index)
    return (values - lowest) / spread


def _hour_to_category(hour):
    """Maps an hour of day to 'Morning', 'Afternoon', 'Evening' or 'Night'."""
    if 6 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 17:
        return 'Afternoon'
    if 17 <= hour < 22:
        return 'Evening'
    return 'Night'


def _score_transaction_channel(channel_df, norm_sources, base_features):
    """
    Adds behaviour features, a typicality score and a per-client anomaly flag to the
    transactions of one channel (online or offline).

    Parameters:
        channel_df: Transactions of one channel; must have a unique index.
        norm_sources: Columns to min-max normalise, in the order the '<name>_norm' columns are created.
        base_features: Non-dummy columns given to IsolationForest, in model input order.
    Returns: New DataFrame with the added columns; an empty input is returned unchanged.
    """
    if channel_df.empty:
        return channel_df

    # Work on an explicit copy so the assignments below never write into a slice of the caller's frame
    scored = channel_df.copy()

    scored['hour_category'] = scored['transaction_hour'].apply(_hour_to_category)

    # Client's average amount within this channel and the amount relative to it.
    # NOTE(review): if the input already carries an 'avg_transaction_amt' column (e.g. a
    # merchant-level average merged earlier), it is overwritten here - confirm this is intended.
    scored['avg_transaction_amt'] = scored.groupby('client_id')['amount'].transform('mean')
    scored['amt_avg_ratio'] = scored['amount'] / scored['avg_transaction_amt']

    # Weekend flag (5 for Sat, 6 for Sun) and combined day-type / time-of-day slot
    scored['weekend'] = (scored['transaction_day'] >= 5).astype(int)
    scored['weekday_hour_category'] = (
        scored['weekend'].map({1: 'weekend', 0: 'weekday'}) + '_' + scored['hour_category']
    )

    # Share of the client's transactions (in this channel) that fall into the same slot
    slot_count = scored.groupby(['client_id', 'weekday_hour_category'])['amount'].transform('size')
    client_count = scored.groupby('client_id')['amount'].transform('size')
    scored['weekday_hour_category_freq'] = slot_count / client_count

    # Normalize features for typicality score
    for source in norm_sources:
        scored[source + '_norm'] = _min_max_scale(scored[source])

    # Define weights for typicality score
    weights = {
        'weekday_hour_category_freq': 0.4,  # Frequency-based feature
        'amt_avg_ratio': 0.3,               # Transaction amount relative to average
        'time_since_last_txn': 0.2,         # Time since last transaction
        'client_fraud_rate': 0.1            # Historical fraud rate
    }

    # Weighted score; the slot frequency is inverted so that rare slots raise the score
    scored['typicality_score'] = (
        (1 - scored['weekday_hour_category_freq_norm']) * weights['weekday_hour_category_freq'] +
        scored['amt_avg_ratio_norm'] * weights['amt_avg_ratio'] +
        scored['time_since_last_txn_norm'] * weights['time_since_last_txn'] +
        scored['client_fraud_rate'] * weights['client_fraud_rate']
    )

    # Dummify hour category and weekday hour category
    scored = pd.get_dummies(scored, columns=['hour_category'], prefix='hour', dtype=int)
    scored = pd.get_dummies(scored, columns=['weekday_hour_category'], prefix='time_day', dtype=int)

    # The feature list is the same for every client, so build it once
    dummy_columns = [col for col in scored.columns if col.startswith(('hour_', 'time_day_'))]
    features = list(base_features) + dummy_columns

    # One IsolationForest per client: flags transactions that are unusual for that client
    scored['suspicious_indiv_activity'] = 0
    for _, client_rows in scored.groupby('client_id'):
        client_features = client_rows[features]
        isoforest = IsolationForest(contamination=0.05, random_state=42)
        isoforest.fit(client_features)
        anomaly_prediction = isoforest.predict(client_features)
        scored.loc[client_rows.index, 'suspicious_indiv_activity'] = (anomaly_prediction == -1).astype(int)

    return scored


def flag_potential_fraud_indiv(df):
    """
    Flags potential fraud transactions based on individual purchasing behaviour,
    considering online vs offline differences.

    Adds client-level metrics (fraud rate, transaction count, hours since the client's
    previous transaction, amount/income and debt/income ratios), then scores online and
    offline transactions separately and flags per-client anomalies with IsolationForest.

    Parameters:
        df: DataFrame with 'client_id', 'Target', 'amount', 'date' (datetime), 'yearly_income',
            'total_debt', 'transaction_hour', 'transaction_day' and 'use_chip_Online Transaction'.
            The caller's frame is not modified.
    Returns: New DataFrame (online rows first, then offline rows, index reset) with the
             added feature columns and 'suspicious_indiv_activity'.
    """
    # Fresh frame with a unique index: keeps the caller's frame untouched and makes the
    # index-aligned assignments below unambiguous
    df = df.reset_index(drop=True)

    # --- Metrics regardless of transaction nature ---

    # Historical fraud rate per client
    # NOTE(review): this is the mean of Target over all rows passed in, including the row
    # itself; if the frame is split into train/test afterwards this leaks label information.
    df['client_fraud_rate'] = df.groupby('client_id')['Target'].transform('mean')

    # Transaction frequency per client
    df['client_transaction_freq'] = df.groupby('client_id')['amount'].transform('count')

    # Time since the client's previous transaction, in hours. The gap is computed on a
    # chronologically sorted view so it does not depend on the row order of the input;
    # assigning back aligns on the index.
    chronological = df.sort_values('date', kind='stable')
    gap_hours = chronological.groupby('client_id')['date'].diff().dt.total_seconds() / 3600
    df['time_since_last_txn'] = gap_hours.fillna(0)  # Fill 0 for first transaction

    # Calculate financial ratios
    df['amt_income_ratio'] = df['amount'] / df['yearly_income']
    df['debt_income_ratio'] = df['total_debt'] / df['yearly_income']

    # --- Split into Online and Offline Transactions ---
    is_online = df['use_chip_Online Transaction'] == 1

    # Column and feature orders are kept separate per channel so the output layout and
    # the model inputs are unchanged
    df_online = _score_transaction_channel(
        df[is_online],
        norm_sources=['weekday_hour_category_freq', 'amt_avg_ratio', 'time_since_last_txn'],
        base_features=[
            'client_fraud_rate',
            'client_transaction_freq',
            'time_since_last_txn',
            'amount',
            'weekend',
            'typicality_score',
            'amt_income_ratio',
            'debt_income_ratio',
            'amt_avg_ratio',
        ],
    )
    df_offline = _score_transaction_channel(
        df[~is_online],
        norm_sources=['weekday_hour_category_freq', 'time_since_last_txn', 'amt_avg_ratio'],
        base_features=[
            'client_fraud_rate',
            'client_transaction_freq',
            'time_since_last_txn',
            'amount',
            'weekend',
            'amt_income_ratio',
            'debt_income_ratio',
            'amt_avg_ratio',
            'typicality_score',
        ],
    )

    # --- Combine Results ---
    df = pd.concat([df_online, df_offline], ignore_index=True)
    return df

# Build the per-client behaviour features and the anomaly flag
df = flag_potential_fraud_indiv(df)

# Any value still missing after feature engineering becomes 0
df = df.fillna(0)

df.head()

### **Start of Model: XGBoost**

Set up variables and split data

In [None]:
# Rename the merchant-level columns, then separate features from the target.
# The excluded columns are picked by position, so this depends on the exact column
# order produced by the cells above.
df = df.rename(columns={'risk_score': 'merch_risk_score', 'total_transactions': 'merch_total_txn', 'avg_transaction_amt': 'avg_txn_merch'})
excluded_positions = [0, 1, 3, 4, 5, 8, 9, 10, 16, 33, 34, 44, 46, 47, 48]
X = df.drop(columns=df.columns[excluded_positions])
y = df['Target']

# 80/20 split that keeps the fraud share equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

# The three ratios should match if the stratification worked
print("Fraud ratio in original data: ", round(df['Target'].mean(), 3))
print("Fraud ratio in train data: ", round(y_train.mean(), 3))
print("Fraud ratio in test data: ", round(y_test.mean(), 3))

Fit base xgboost model

In [None]:
# Handle class imbalance in xgboost
#scale_pos_wt = (y == 0).sum() / (y == 1).sum()

# Initialise xgboost w scale pos wt
#model = xgb.XGBClassifier(scale_pos_weight=scale_pos_wt, random_state=42)

# Fit the model
#model.fit(X_train, y_train)

# Make predictions
#y_pred = model.predict(X_test)

# Evaluate model
#print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
#print('\nClassification Report:\n', classification_report(y_test, y_pred))
#print('\nROC AUC Score:', roc_auc_score(y_test, y_pred))

In [None]:
# Two-step resampling of the training data: SMOTE first (sampling_strategy=0.2),
# then random under-sampling of the majority class (sampling_strategy=0.3)
over = SMOTE(sampling_strategy=0.2, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.3, random_state=42)
imbalance_pipeline = Pipeline(steps=[("over", over), ("under", under)])
X_resampled, y_resampled = imbalance_pipeline.fit_resample(X_train, y_train)

# Sizes after resampling (train) and of the untouched test set
print("Train Set:", X_resampled.shape, y_resampled.shape)
print("Test Set:", X_test.shape, y_test.shape)

# Class weight = negatives / positives, measured on the resampled training data
# (alternative on the raw split: (y_train == 0).sum() / (y_train == 1).sum())
negative_count = (y_resampled == 0).sum()
positive_count = (y_resampled == 1).sum()
scale_pos_weight = negative_count / positive_count
print("scale_pos_weight:", scale_pos_weight)

# Baseline XGBoost with the class weight applied
baseline_params = dict(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)
model = XGBClassifier(**baseline_params)
model.fit(X_resampled, y_resampled)

# Hard labels for the report, positive-class probabilities for ROC-AUC
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

Plot feature importance and SHAP

In [None]:
# Top 15 features of the baseline model ranked by gain
importance_ax = xgb.plot_importance(model, importance_type='gain', max_num_features=15)
plt.show()

#explainer = shap.TreeExplainer(model)
#shap_values = explainer.shap_values(X_train)
#shap.summary_plot(shap_values, X_train, plot_type="bar")

#### Hyperparameter tuning

In [None]:
# Candidate values sampled by the randomized search
param_grid = dict(
    scale_pos_weight=[1, 2, 3, 5, 10, 20, 50, 100, 200],  # Adjust based on imbalance
    max_depth=[3, 5, 7, 10],
    min_child_weight=[1, 3, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    learning_rate=[0.01, 0.1, 0.2],
    gamma=[0, 0.1, 0.2],
)

# F2 score: beta=2 weights recall higher than precision
f2_scorer = make_scorer(fbeta_score, beta=2)

# Untuned classifier used as the base estimator of the search
xgb_clf = XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)

# RandomizedSearchCV for hyperparameter tuning with resampling
def fit_model_with_resampling(X, y):
    """
    Runs a randomized hyperparameter search (100 candidates, 5-fold CV, F2 scoring) for the
    XGBoost classifier, with resampling applied inside the cross-validation.

    The samplers of `imbalance_pipeline` and the classifier are chained in one imblearn
    pipeline, so in every fold only the training part is resampled and the validation part
    keeps the original class distribution. Resampling the whole training set before the
    search would put synthetic and under-sampled rows into the validation folds and inflate
    the scores used to pick the hyperparameters.

    Parameters:
        X: Training features (not resampled).
        y: Training labels.
    Returns: The fitted RandomizedSearchCV. Its best_estimator_ is the refitted pipeline
             (samplers + classifier) and the keys of best_params_ carry the 'clf__' prefix.
    """
    # Samplers followed by the classifier; imblearn applies samplers during fit only
    search_pipeline = Pipeline(steps=[*imbalance_pipeline.steps, ("clf", xgb_clf)])

    # Route every hyperparameter to the classifier step of the pipeline
    search_space = {f"clf__{name}": candidates for name, candidates in param_grid.items()}

    random_search = RandomizedSearchCV(
        estimator=search_pipeline,
        param_distributions=search_space,
        n_iter=100,
        scoring=f2_scorer,
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )

    # Fit on the raw training data; resampling happens per fold inside the pipeline
    random_search.fit(X, y)

    return random_search

# Run the hyperparameter search on the training data.
# Note: best_model holds the fitted RandomizedSearchCV object returned by
# fit_model_with_resampling, not a bare classifier.
best_model = fit_model_with_resampling(X_train, y_train)

# Print the parameter combination with the best cross-validated score
print("Best parameters found: ", best_model.best_params_)

Evaluate best model

In [None]:
# Evaluate on test set: hard labels for the confusion matrix and report,
# positive-class probabilities for ROC AUC (AUC on 0/1 labels only measures a single
# threshold and is not comparable with the baseline's probability-based score)
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('\nClassification Report:\n', classification_report(y_test, y_pred))
print('\nROC AUC Score:', roc_auc_score(y_test, y_pred_proba))

Plot feature importance and SHAP

In [None]:
# best_model is the fitted search object, which xgb.plot_importance does not accept;
# plot the refitted estimator instead. If the search was run over a pipeline, the
# classifier is its final step.
tuned_estimator = best_model.best_estimator_
if hasattr(tuned_estimator, 'steps'):
    tuned_estimator = tuned_estimator.steps[-1][1]

xgb.plot_importance(tuned_estimator, importance_type='gain', max_num_features=15)
plt.show()

In [None]:
# TreeExplainer needs the tree model itself, not the search object that wraps it.
# If the search was run over a pipeline, the classifier is its final step.
tuned_estimator = best_model.best_estimator_
if hasattr(tuned_estimator, 'steps'):
    tuned_estimator = tuned_estimator.steps[-1][1]

explainer = shap.TreeExplainer(tuned_estimator)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")

Full code for XGBoost tuning, starting from the already-prepared dataset

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import xgboost as xgb
from xgboost import XGBClassifier
#!pip install imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, f1_score, precision_score, recall_score

# Load the prepared dataset and print its column summary
prepared = pd.read_csv('/Users/jiajue/Documents/McGill/Winter Term/INSY695/Group project/prepared_data.csv')
prepared.info()

# Features are every column except the label; the label is 'Target'
y = prepared['Target']
X = prepared.drop(columns=['Target'])

# Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

# Resampling steps: SMOTE over-sampling (sampling_strategy=0.2) followed by
# random under-sampling (sampling_strategy=0.3)
over = SMOTE(sampling_strategy=0.2, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.3, random_state=42)

# Chain both samplers so they can be applied with one fit_resample call
imbalance_pipeline = Pipeline([("over", over), ("under", under)])

# Resample the whole training split; used here for the shape report and for
# the class ratio computed below
X_resampled, y_resampled = imbalance_pipeline.fit_resample(X_train, y_train)

# Report dataset sizes
print("Train Set:", X_resampled.shape, y_resampled.shape)
print("Test Set:", X_test.shape, y_test.shape)

# Second stratified split of the training data: the *_tune variables hold
# 70% of it and the *_full variables hold the remaining 30%
X_train_tune, X_train_full, y_train_tune, y_train_full = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    stratify=y_train,
    random_state=42,
)

# Ratio of class-0 to class-1 rows in the resampled training data
scale_pos_weight = y_resampled.eq(0).sum() / y_resampled.eq(1).sum()
print("scale_pos_weight:", scale_pos_weight)

# Candidate hyperparameter values sampled by the randomized search
param_grid = dict(
    scale_pos_weight=[1, 2, 3, 5, 10, 20, 50, 100, 200],
    max_depth=[3, 5, 7, 10],
    min_child_weight=[1, 3, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    learning_rate=[0.01, 0.1, 0.2],
    gamma=[0, 0.1, 0.2],
)

# F-beta scorer with beta=2 (recall weighted above precision)
f2_scorer = make_scorer(fbeta_score, beta=2)

# Base XGBoost classifier that the search will configure
xgb_clf = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# Resample the data, then tune hyperparameters with RandomizedSearchCV
def fit_model_with_resampling(X, y):
    """Resample ``(X, y)`` and run a randomized hyperparameter search on it.

    The inputs are passed through the module-level ``imbalance_pipeline``;
    the resampled output is then searched with ``RandomizedSearchCV`` over
    ``param_grid`` for ``xgb_clf``, scored by ``f2_scorer``.

    Parameters:
        X: feature matrix to resample and search on.
        y: target labels aligned with ``X``.

    Returns:
        The fitted ``RandomizedSearchCV`` object.

    NOTE(review): resampling is done once, before the search splits the data
    into folds, so every validation fold is drawn from already-resampled
    rows -- confirm this is intended for the reported CV score.
    """
    balanced_X, balanced_y = imbalance_pipeline.fit_resample(X, y)

    # Search settings: 100 sampled candidates, 5-fold CV, all cores.
    search_settings = dict(
        estimator=xgb_clf,
        param_distributions=param_grid,
        n_iter=100,
        scoring=f2_scorer,
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1,
    )
    search = RandomizedSearchCV(**search_settings)

    # fit() returns the search object itself.
    return search.fit(balanced_X, balanced_y)

# Perform hyperparameter tuning on a subset of the training data
best_model = fit_model_with_resampling(X_train_tune, y_train_tune)

# Print best parameters found
print("Best parameters found: ", best_model.best_params_)

# Take the estimator configured with the best hyperparameters
best_xgb_model = best_model.best_estimator_

# Resample the ENTIRE training split for the final fit.
# X_train_full / y_train_full are only the 30% remainder of the tuning split
# (train_test_split(..., test_size=0.3)), so fitting on them would discard
# most of the training data; use X_train / y_train instead.
X_train_full_resampled, y_train_full_resampled = imbalance_pipeline.fit_resample(X_train, y_train)

# Train the final model on the full resampled training data
best_xgb_model.fit(X_train_full_resampled, y_train_full_resampled)

# Evaluate the model on the untouched test set
y_pred = best_xgb_model.predict(X_test)
y_pred_proba = best_xgb_model.predict_proba(X_test)[:, 1]  # Positive-class probabilities for ROC-AUC

# Print classification report and AUC score
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

### CatBoost

In [None]:
#!pip install catboost
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

# Resampling steps: SMOTE over-sampling (sampling_strategy=0.2) followed by
# random under-sampling (sampling_strategy=0.3)
over = SMOTE(sampling_strategy=0.2, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.3, random_state=42)

# Chain both samplers into a single resampling pipeline
imbalance_pipeline = Pipeline([("over", over), ("under", under)])

# Resample the training split
X_resampled, y_resampled = imbalance_pipeline.fit_resample(X_train, y_train)

# Report dataset sizes
print("Train Set:", X_resampled.shape, y_resampled.shape)
print("Test Set:", X_test.shape, y_test.shape)

# Ratio of class-0 to class-1 rows in the resampled training data
scale_pos_weight = y_resampled.eq(0).sum() / y_resampled.eq(1).sum()
print("scale_pos_weight:", scale_pos_weight)

# CatBoost classifier with Logloss objective, AUC tracked as an extra metric,
# and the positive class weighted by the ratio computed above
model = CatBoostClassifier(
    loss_function='Logloss',
    custom_metric=['AUC'],
    scale_pos_weight=scale_pos_weight,
    random_state=42,
)

# Fit on the resampled training data with training output suppressed
model.fit(X_resampled, y_resampled, verbose=0)

# Predict labels and positive-class probabilities for the test set
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Report test-set metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

In [None]:
import joblib

# Persist the fitted CatBoost model to 'catboost_model.joblib' with joblib
joblib.dump(model, filename='catboost_model.joblib')

#### CatBoost tuning

In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# Load the prepared dataset and print its column summary
prepared = pd.read_csv('/Users/jiajue/Documents/McGill/Winter Term/INSY695/Group project/prepared_data.csv')
prepared.info()

# Features are every column except the label; the label is 'Target'
y = prepared['Target']
X = prepared.drop(columns=['Target'])

# Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8914963 entries, 0 to 8914962
Data columns (total 49 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   amount                       float64
 1   Target                       int64  
 2   current_age                  int64  
 3   retirement_age               int64  
 4   per_capita_income            float64
 5   yearly_income                float64
 6   total_debt                   float64
 7   credit_score                 int64  
 8   num_credit_cards             int64  
 9   use_chip_Chip Transaction    int64  
 10  use_chip_Online Transaction  int64  
 11  use_chip_Swipe Transaction   int64  
 12  merchant_state_CA            int64  
 13  merchant_state_FL            int64  
 14  merchant_state_IL            int64  
 15  merchant_state_MI            int64  
 16  merchant_state_NC            int64  
 17  merchant_state_NY            int64  
 18  merchant_state_OH            int64  
 19  

In [None]:
#!pip install catboost
#!pip install imbalanced-learn

# Resampling steps: SMOTE over-sampling (sampling_strategy=0.2) followed by
# random under-sampling (sampling_strategy=0.3)
over = SMOTE(sampling_strategy=0.2, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.3, random_state=42)

# Chain both samplers so they can be applied with one fit_resample call
imbalance_pipeline = Pipeline([("over", over), ("under", under)])

# Second stratified split of the training data: the *_tune variables hold
# 70% of it and the *_full variables hold the remaining 30%
X_train_tune, X_train_full, y_train_tune, y_train_full = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    stratify=y_train,
    random_state=42,
)

# Resample the whole training split; used here for the shape report and for
# the class ratio computed below
X_resampled, y_resampled = imbalance_pipeline.fit_resample(X_train, y_train)

# Report dataset sizes
print("Train Set:", X_resampled.shape, y_resampled.shape)
print("Test Set:", X_test.shape, y_test.shape)

# Ratio of class-0 to class-1 rows in the resampled training data
scale_pos_weight = y_resampled.eq(0).sum() / y_resampled.eq(1).sum()
print("scale_pos_weight:", scale_pos_weight)

# Candidate CatBoost hyperparameter values sampled by the randomized search
# NOTE(review): CatBoost documents min_data_in_leaf as applying to the
# Depthwise/Lossguide grow policies -- confirm it has an effect with the
# default grow policy used here.
param_grid = dict(
    depth=[3, 5, 7, 10],
    min_data_in_leaf=[1, 3, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bylevel=[0.6, 0.8, 1.0],
    learning_rate=[0.01, 0.1, 0.2],
    iterations=[100, 150, 200, 250],
    l2_leaf_reg=[1, 3, 5, 10],
)

# F-beta scorer with beta=2 (recall weighted above precision)
f2_scorer = make_scorer(fbeta_score, beta=2)

# Base CatBoost classifier for the search: Logloss objective, no custom
# metric, positive class weighted by the ratio computed above, and a
# progress line every 100 iterations
catboost_clf = CatBoostClassifier(
    loss_function='Logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    verbose=100,
)

# Resample the data, then tune CatBoost hyperparameters with RandomizedSearchCV
def fit_model_with_resampling(X, y):
    """Resample ``(X, y)`` and run a randomized hyperparameter search on it.

    The inputs are passed through the module-level ``imbalance_pipeline``;
    the resampled output is then searched with ``RandomizedSearchCV`` over
    ``param_grid`` for ``catboost_clf``, scored by ``f2_scorer``.

    Parameters:
        X: feature matrix to resample and search on.
        y: target labels aligned with ``X``.

    Returns:
        The fitted ``RandomizedSearchCV`` object.

    NOTE(review): resampling is done once, before the search splits the data
    into folds, so every validation fold is drawn from already-resampled
    rows -- confirm this is intended for the reported CV score.
    """
    balanced_X, balanced_y = imbalance_pipeline.fit_resample(X, y)

    # Search settings: 30 sampled candidates, 5-fold CV, all cores.
    search_settings = dict(
        estimator=catboost_clf,
        param_distributions=param_grid,
        n_iter=30,
        scoring=f2_scorer,
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1,
    )
    search = RandomizedSearchCV(**search_settings)

    # fit() returns the search object itself.
    return search.fit(balanced_X, balanced_y)

# Perform hyperparameter tuning on a subset of the training data
best_model = fit_model_with_resampling(X_train_tune, y_train_tune)

# Print best parameters found
print("Best parameters found: ", best_model.best_params_)

# Take the estimator configured with the best hyperparameters
best_catboost_model = best_model.best_estimator_

# Resample the ENTIRE training split for the final fit.
# X_train_full / y_train_full are only the 30% remainder of the tuning split
# (train_test_split(..., test_size=0.3)), so fitting on them would discard
# most of the training data; use X_train / y_train instead.
X_train_full_resampled, y_train_full_resampled = imbalance_pipeline.fit_resample(X_train, y_train)

# Train the final model on the full resampled training data
best_catboost_model.fit(X_train_full_resampled, y_train_full_resampled)

# Evaluate the model on the untouched test set
y_pred = best_catboost_model.predict(X_test)
y_pred_proba = best_catboost_model.predict_proba(X_test)[:, 1]  # Positive-class probabilities for ROC-AUC

# Print classification report and AUC score
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

Train Set: (6171793, 48) (6171793,)
Test Set: (1782993, 48) (1782993,)
scale_pos_weight: 3.3333330992936685
Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 0.5158076	total: 3.29s	remaining: 10m 55s
0:	learn: 0.5168739	total: 3.59s	remaining: 11m 54s
0:	learn: 0.5166381	total: 3.69s	remaining: 12m 13s
0:	learn: 0.4893234	total: 4.17s	remaining: 17m 18s
0:	learn: 0.4893206	total: 4.1s	remaining: 16m 59s
0:	learn: 0.4892260	total: 4.4s	remaining: 18m 16s
0:	learn: 0.4894937	total: 4.44s	remaining: 18m 26s
0:	learn: 0.4882477	total: 4.54s	remaining: 18m 51s
100:	learn: 0.0576329	total: 4m 28s	remaining: 4m 23s
100:	learn: 0.0615989	total: 4m 30s	remaining: 4m 25s
100:	learn: 0.0609439	total: 4m 35s	remaining: 4m 30s
100:	learn: 0.0416718	total: 5m 28s	remaining: 8m 4s
100:	learn: 0.0366333	total: 5m 30s	remaining: 8m 6s
100:	learn: 0.0426028	total: 5m 29s	remaining: 8m 6s
100:	learn: 0.0413751	total: 5m 30s	remaining: 8m 7s
100:	learn: 0.0405981	total: 5m 31s	remain



0:	learn: 0.5113630	total: 4.85s	remaining: 12m 3s
0:	learn: 0.5113522	total: 4.58s	remaining: 11m 22s
0:	learn: 0.6754248	total: 3.94s	remaining: 16m 21s
249:	learn: 0.0096046	total: 22m	remaining: 0us
0:	learn: 0.6755586	total: 3.62s	remaining: 15m
249:	learn: 0.0146750	total: 22m 2s	remaining: 0us
0:	learn: 0.6755756	total: 3.78s	remaining: 15m 41s
249:	learn: 0.0153801	total: 22m 7s	remaining: 0us
0:	learn: 0.6756094	total: 3.65s	remaining: 15m 9s
200:	learn: 0.0171688	total: 18m 36s	remaining: 4m 32s
200:	learn: 0.0171127	total: 18m 36s	remaining: 4m 32s
100:	learn: 0.1487789	total: 5m 51s	remaining: 8m 37s
100:	learn: 0.1609188	total: 5m 58s	remaining: 8m 48s
249:	learn: 0.0147726	total: 22m 24s	remaining: 0us
249:	learn: 0.0142924	total: 22m 23s	remaining: 0us
0:	learn: 0.5309515	total: 2.81s	remaining: 6m 59s
0:	learn: 0.6756075	total: 3.8s	remaining: 15m 46s
100:	learn: 0.0266323	total: 7m 55s	remaining: 3m 50s
100:	learn: 0.0260335	total: 7m 56s	remaining: 3m 51s
100:	learn: 