# Model Implementation

### Introduction

In [6]:
# Importing The Libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [13]:
# Load the cleaned and encoded transaction data.
# A failed load has to stop the notebook here: if the error were only printed,
# `data` would stay undefined and the next statement would fail with an
# unrelated NameError.
try:
    data = pd.read_csv('credit_card_cleaned.csv')
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    print('Error while loading the file')
    raise

data.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,merch_zipcode,customer_id_str,customer_id,trans_date,trans_time,hour,time_bin,day_of_week,age,age_bin
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,28705.0,"jennifer_banks_f_psychologist, counselling_mor...",0,2019-01-01,1900-01-01 00:00:18,0,00:00-02:00,Tuesday,37,30-39
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,,stephanie_gill_f_special educational needs tea...,1,2019-01-01,1900-01-01 00:00:44,0,00:00-02:00,Tuesday,47,40-49
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,83236.0,edward_sanchez_m_nature conservation officer_m...,2,2019-01-01,1900-01-01 00:00:51,0,00:00-02:00,Tuesday,63,60-69
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,,jeremy_white_m_patent attorney_boulder,3,2019-01-01,1900-01-01 00:01:16,0,00:00-02:00,Tuesday,58,50-59
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,22844.0,tyler_garcia_m_dance movement psychotherapist_...,4,2019-01-01,1900-01-01 00:03:06,0,00:00-02:00,Tuesday,39,30-39


In [14]:
# Show the column labels of the loaded frame
data.columns

Index(['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender',
       'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'merch_zipcode', 'customer_id_str', 'customer_id', 'trans_date',
       'trans_time', 'hour', 'time_bin', 'day_of_week', 'age', 'age_bin'],
      dtype='object')

### Train Test Split

In [15]:
# Separate the label from the predictors
y = data["is_fraud"]
X = data.drop(columns=["is_fraud"])

# Hold out 20% of the rows; stratifying on the label keeps the class ratio
# the same in both parts, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Print the class balance (in percent) of the full label column and of each split
for heading, target in (
    ("Original dataset:", y),
    ("\nTraining set:", y_train),
    ("\nTest set:", y_test),
):
    print(heading)
    print(target.value_counts(normalize=True) * 100)

Original dataset:
is_fraud
0    99.421135
1     0.578865
Name: proportion, dtype: float64

Training set:
is_fraud
0    99.421116
1     0.578884
Name: proportion, dtype: float64

Test set:
is_fraud
0    99.421212
1     0.578788
Name: proportion, dtype: float64


The dataset was split into training and test sets using an 80/20 ratio with stratification on the target variable (`is_fraud`).  
This ensures that the class distribution (fraud ≈ 0.58%, non-fraud ≈ 99.42%) is preserved across both sets, preventing bias during model training and evaluation.

### Features

X_train

y_train

'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender',
       'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'merch_zipcode', 'customer_id_str', 'customer_id', 'trans_date',
       'trans_time', 'hour', 'time_bin', 'day_of_week', 'age', 'age_bin'

**Transaction Amount**

- Median of Transaction Amount
- STD of Transaction Amount
- Ratio of Transaction Amount (amount/median)

In [None]:
# Per-customer median and standard deviation of the transaction amount,
# computed on the training split; an undefined std is stored as 0
customer_stats_train = (
    X_train.groupby("customer_id")["amt"]
    .agg(median_amount="median", std_amount="std")
    .fillna(0)
    .reset_index()
)

# Attach the statistics to every training row, then express each amount
# relative to its customer's median amount
X_train = X_train.merge(customer_stats_train, how="left", on="customer_id").assign(
    amt_ratio_to_median=lambda frame: frame["amt"] / frame["median_amount"]
)

In [None]:
# Attach the per-customer amount statistics learned on the training split
X_test = X_test.merge(customer_stats_train, on="customer_id", how="left")

# Customers that never appear in the training split have no statistics; fall
# back to the medians of the training columns. The filled column is assigned
# back explicitly: `X_test[col].fillna(..., inplace=True)` acts on a column
# selection, which pandas warns about and which leaves X_test unchanged when
# Copy-on-Write is active.
X_test["median_amount"] = X_test["median_amount"].fillna(X_train["median_amount"].median())
X_test["std_amount"] = X_test["std_amount"].fillna(X_train["std_amount"].median())

# Same ratio feature as on the training split
X_test["amt_ratio_to_median"] = X_test["amt"] / X_test["median_amount"]

In [None]:
# Preview the three amount-based features on the first training rows
X_train.loc[:, ["median_amount", "std_amount", "amt_ratio_to_median"]].head()

**Transaction Time**

- Median of Transaction Time per Bin
- STD of Transaction Time per Bin
- Fraction of transactions per time interval

Binning - 2-Hour Time Intervals.

The reason for using 2-hour intervals is based on the EDA visualization, which showed that the time intervals with the most fraud cases are 22:00–24:00, 00:00–02:00, and 02:00–04:00. To preserve this information, I chose to keep the 2-hour time bins.

In [None]:
# Edges of the 2-hour windows: 0, 2, ..., 24
bins = list(range(0, 25, 2))
# One "HH:00-HH:00" label per window, named after its start and end hour
labels = [f"{start:02d}:00-{start + 2:02d}:00" for start in bins[:-1]]

# Assign every training transaction to its window (left edge included,
# right edge excluded)
X_train["time_bin"] = pd.cut(X_train["hour"], bins=bins, labels=labels, right=False)

# Ordinal code of each window label, in chronological order
time_bin_mapping = dict(zip(labels, range(len(labels))))
X_train["time_bin_encoded"] = X_train["time_bin"].map(time_bin_mapping)

In [None]:
# Bucket the test transactions with the same edges, labels and ordinal codes
# that were used for the training split
test_time_bin = pd.cut(X_test["hour"], bins=bins, labels=labels, right=False)
X_test["time_bin"] = test_time_bin
X_test["time_bin_encoded"] = test_time_bin.map(time_bin_mapping)

In [None]:
# Median and standard deviation of the amount per customer and 2-hour window,
# computed on the training split and pivoted to one row per customer.
# observed=False keeps a column for every window even if the grouping key is
# categorical and some window has no training rows.
cust_bin_stats = (
    X_train.groupby(["customer_id", "time_bin_encoded"], observed=False)["amt"]
    .agg(median_amt="median", std_amt="std")
    .unstack(fill_value=0)
    # unstack only fills combinations that are absent from the result; a std
    # over a single transaction (and any empty group) is NaN and would stay
    # NaN in X_train while the test split is zero-filled, so fill it here too.
    .fillna(0)
)

# Flatten the (statistic, window) column pairs into names such as "median_amt_0"
cust_bin_stats.columns = [f"{stat}_{time_bin}" for stat, time_bin in cust_bin_stats.columns]

X_train = X_train.merge(cust_bin_stats, left_on="customer_id", right_index=True, how="left")

In [None]:
# Attach the per-window statistics learned on the training split
X_test = X_test.merge(cust_bin_stats, left_on="customer_id", right_index=True, how="left")

# Customers unseen in training get 0 for these statistics. Only the merged
# columns are filled: a frame-wide fillna(0) would also overwrite missing
# values in unrelated columns (e.g. merch_zipcode) and can fail on the
# categorical time_bin column, where 0 is not one of the categories.
bin_stat_cols = list(cust_bin_stats.columns)
X_test[bin_stat_cols] = X_test[bin_stat_cols].fillna(0)

In [None]:
# Number of training transactions per customer and 2-hour window
# (one row per customer, one column per window)
cust_bin_counts = (
    X_train.groupby(["customer_id", "time_bin_encoded"], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Share of each customer's transactions that falls into each window
cust_bin_frac = cust_bin_counts.div(cust_bin_counts.sum(axis=1), axis=0)

# Give the columns explicit names. Merge suffixes are only applied to column
# names present on both sides, so without this the features would be added
# under the bare window codes instead of descriptive string names.
cust_bin_frac.columns = [f"frac_bin_{time_bin}" for time_bin in cust_bin_frac.columns]

X_train = X_train.merge(cust_bin_frac, left_on="customer_id", right_index=True, how="left")

In [None]:
# Attach the per-window transaction shares learned on the training split
X_test = X_test.merge(cust_bin_frac, left_on="customer_id", right_index=True, how="left")

# Customers unseen in training get a share of 0. Only the merged columns are
# filled, so missing values elsewhere in X_test are left as they are and
# categorical columns are not touched.
bin_frac_cols = list(cust_bin_frac.columns)
X_test[bin_frac_cols] = X_test[bin_frac_cols].fillna(0)

I will test the features later and drop any that are unnecessary, to reduce the dimensionality of the feature set and keep the model simple.

**Day of the Week**

- Median Transaction Amount per Day of the Week per Customer
- STD Transaction Amount per Day of the Week per Customer
- Frequency of Spending per Day per Customer

Ordinal Encoding

In [None]:
# Ordinal code per weekday name: Monday=0, ..., Sunday=6
weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
day_mapping = {name: position for position, name in enumerate(weekday_names)}

# Add the encoded weekday to both splits
for split in (X_train, X_test):
    split["day_encoded"] = split["day_of_week"].map(day_mapping)

In [None]:
# Median and standard deviation of the amount per customer and weekday,
# computed on the training split and pivoted to one row per customer
cust_day_stats = (
    X_train.groupby(["customer_id", "day_of_week"])["amt"]
    .agg(median_amt="median", std_amt="std")
    .unstack(fill_value=0)
    # A std over a single transaction is NaN and is not covered by the unstack
    # fill value; store it as 0 so train and test rows are encoded alike.
    .fillna(0)
)

# Flatten the (statistic, weekday) column pairs into names such as "median_amt_Monday"
cust_day_stats.columns = [f"{stat}_{day}" for stat, day in cust_day_stats.columns]

# Share of each customer's training transactions that falls on each weekday
cust_day_counts = X_train.groupby(["customer_id", "day_of_week"]).size().unstack(fill_value=0)
cust_day_frac = cust_day_counts.div(cust_day_counts.sum(axis=1), axis=0)
cust_day_frac.columns = [f"frac_{day}" for day in cust_day_frac.columns]

X_train = X_train.merge(cust_day_stats, left_on="customer_id", right_index=True, how="left")
X_train = X_train.merge(cust_day_frac, left_on="customer_id", right_index=True, how="left")

X_test = X_test.merge(cust_day_stats, left_on="customer_id", right_index=True, how="left")
X_test = X_test.merge(cust_day_frac, left_on="customer_id", right_index=True, how="left")

# Customers unseen in training get 0 for the weekday features. Only the merged
# columns are filled, so missing values in unrelated columns are not
# overwritten and categorical columns are not touched.
day_feature_cols = list(cust_day_stats.columns) + list(cust_day_frac.columns)
X_test[day_feature_cols] = X_test[day_feature_cols].fillna(0)

**Age**

- Relative frequency per age bin
- Median transaction amount per age bin

In [None]:
# Both age-bin lookups are derived from the training split only
train_by_age_bin = X_train.groupby("age_bin")

# Share of all training rows that falls into each age bin
age_bin_counts_train = train_by_age_bin["trans_num"].count()
total_transactions_train = len(X_train)
age_bin_rel_freq_train = (
    age_bin_counts_train.div(total_transactions_train)
    .rename("age_bin_rel_freq")
    .reset_index()
)

# Median transaction amount of each age bin
age_bin_median_train = train_by_age_bin["amt"].median().rename("age_bin_median_amt").reset_index()

# Attach each lookup to both splits, keyed on the age bin
for age_lookup in (age_bin_rel_freq_train, age_bin_median_train):
    X_train = X_train.merge(age_lookup, on="age_bin", how="left")
    X_test = X_test.merge(age_lookup, on="age_bin", how="left")

**Categories**

- Transaction counts per category per customer
- Fraction of transactions per category
- Median / std transaction amount per category

In [21]:
# Count the distinct category values in the training split, then report it
n_categories = len(X_train["category"].unique())
print(f"Number of distinct categories in the data set: {n_categories}")

Number of distinct categories in the data set: 14


In [None]:
# Number of training transactions per customer and category
# (one row per customer, one column per category)
cust_cat_counts = X_train.groupby(["customer_id", "category"]).size().unstack(fill_value=0)
X_train = X_train.merge(cust_cat_counts, on="customer_id", how="left")
X_test = X_test.merge(cust_cat_counts, on="customer_id", how="left")

# Share of each customer's transactions per category. The column names equal
# those of the count columns already merged above, so the "_frac" suffix is
# applied to the incoming columns.
cust_cat_frac = cust_cat_counts.div(cust_cat_counts.sum(axis=1), axis=0).reset_index()
X_train = X_train.merge(cust_cat_frac, on="customer_id", how="left", suffixes=('', '_frac'))
X_test = X_test.merge(cust_cat_frac, on="customer_id", how="left", suffixes=('', '_frac'))

# Median and standard deviation of the amount per customer and category
cat_stats = (
    X_train.groupby(["customer_id", "category"])["amt"]
    .agg(median_amt="median", std_amt="std")
    .unstack(fill_value=0)
    # A std over a single transaction is NaN and is not covered by the unstack
    # fill value; store it as 0.
    .fillna(0)
)

# unstack leaves two-level (statistic, category) columns. Flatten them before
# merging: recent pandas rejects a merge between frames whose columns have a
# different number of levels.
cat_stats.columns = [f"{stat}_{category}" for stat, category in cat_stats.columns]

X_train = X_train.merge(cat_stats, left_on="customer_id", right_index=True, how="left")
X_test = X_test.merge(cat_stats, left_on="customer_id", right_index=True, how="left")

**City Population**

- Average transaction amount per customer in small vs large cities
- Fraction of transactions in high-population (large) cities

In [None]:
# Example threshold for small vs large city (adjust as needed): the median city
# population of the training split. It is taken from X_train rather than from
# the full data so that no information from the held-out rows enters the features.
threshold = X_train["city_pop"].median()

# Create a city size column on both splits using the same threshold;
# a population at or below the threshold counts as "small", anything else as "large"
size_labels = {True: "small", False: "large"}
X_train["city_size"] = (X_train["city_pop"] <= threshold).map(size_labels)
X_test["city_size"] = (X_test["city_pop"] <= threshold).map(size_labels)

# --- Average transaction amount per customer in small vs large cities ---
cust_city_avg = X_train.groupby(["customer_id", "city_size"])["amt"].mean().unstack(fill_value=0)
cust_city_avg.columns = [f"avg_amt_{size}_city" for size in cust_city_avg.columns]

# --- Fraction of transactions in small vs large cities ---
cust_city_counts = X_train.groupby(["customer_id", "city_size"]).size().unstack(fill_value=0)
cust_city_frac = cust_city_counts.div(cust_city_counts.sum(axis=1), axis=0)
cust_city_frac.columns = [f"frac_txn_{size}_city" for size in cust_city_frac.columns]

# Merge the training-derived features into both splits so they end up with the
# same columns
X_train = X_train.merge(cust_city_avg, left_on="customer_id", right_index=True, how="left")
X_train = X_train.merge(cust_city_frac, left_on="customer_id", right_index=True, how="left")

X_test = X_test.merge(cust_city_avg, left_on="customer_id", right_index=True, how="left")
X_test = X_test.merge(cust_city_frac, left_on="customer_id", right_index=True, how="left")

# Customers unseen in training get 0 for the city features (only these columns
# are filled)
city_feature_cols = list(cust_city_avg.columns) + list(cust_city_frac.columns)
X_test[city_feature_cols] = X_test[city_feature_cols].fillna(0)

**States**

- Median amount per state
- Std amount per state
- Fraction of transactions per state

One-Hot Encoding

In [24]:
# Count the distinct state values in the training split, then report it
n_states = len(X_train["state"].unique())
print(f"Number of distinct states in the data set: {n_states}")

Number of distinct states in the data set: 51


In [None]:
# One-hot encode states for X_train
state_dummies = pd.get_dummies(X_train["state"], prefix="state")
X_train = pd.concat([X_train, state_dummies], axis=1)

# Encode X_test against the training columns: indicator columns for states that
# are absent from the test split are added as 0, and states that only occur in
# the test split get no column, so both splits share the same layout
state_dummies_test = pd.get_dummies(X_test["state"], prefix="state").reindex(
    columns=state_dummies.columns, fill_value=0
)
X_test = pd.concat([X_test, state_dummies_test], axis=1)

# --- Customer-level aggregated features per state (training split only) ---
# Median spending per state
cust_state_median = X_train.groupby(["customer_id", "state"])["amt"].median().unstack(fill_value=0)
cust_state_median.columns = [f"median_amt_{s}" for s in cust_state_median.columns]

# Std spending per state (a std over a single transaction is NaN; stored as 0)
cust_state_std = X_train.groupby(["customer_id", "state"])["amt"].std().fillna(0).unstack(fill_value=0)
cust_state_std.columns = [f"std_amt_{s}" for s in cust_state_std.columns]

# Fraction of transactions per state
cust_state_counts = X_train.groupby(["customer_id", "state"]).size().unstack(fill_value=0)
cust_state_frac = cust_state_counts.div(cust_state_counts.sum(axis=1), axis=0)
cust_state_frac.columns = [f"frac_txn_{s}" for s in cust_state_frac.columns]

# Merge all features into both splits
state_feature_tables = (cust_state_median, cust_state_std, cust_state_frac)
for state_table in state_feature_tables:
    X_train = X_train.merge(state_table, left_on="customer_id", right_index=True, how="left")
    X_test = X_test.merge(state_table, left_on="customer_id", right_index=True, how="left")

# Customers unseen in training get 0 for the per-state features (only these
# columns are filled)
state_feature_cols = [col for state_table in state_feature_tables for col in state_table.columns]
X_test[state_feature_cols] = X_test[state_feature_cols].fillna(0)


**Merchants**

In [None]:
# Count the distinct merchant values in the training split, then report it
n_merchants = len(X_train["merchant"].unique())
print(f"Number of distinct merchants in the data set: {n_merchants}")

Number of distinct merchants data set: 693


**Jobs**

In [22]:
# Count the distinct job values in the training split, then report it
n_jobs = len(X_train["job"].unique())
print(f"Number of distinct jobs in the data set: {n_jobs}")

Number of distinct jobs in the data set: 494


**Cities**

In [23]:
# Count the distinct city values in the training split, then report it
n_cities = len(X_train["city"].unique())
print(f"Number of distinct cities in the data set: {n_cities}")

Number of distinct cities in the data set: 894


Merchants, Jobs, and Cities all have a high number of unique values, so to keep the model simple and focus on higher-signal features I'll drop these three as features.