## Model Inference

In this notebook, the model will try to predict the category of a new unseen transaction, given all required features (e.g. date, amount, description, et cetera.) as inputs to the model.

#### Load and Preprocess the New Inference Dataset

In [121]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Import the text cleaning function from the util folder
from util.text_cleaning import clean_normalize_text

# Load the raw training dataset
df = pd.read_csv("../dataset/clean_embedding_bank_transaction.csv")

# Load the new dataset for inference
new_df = pd.read_csv("../dataset/inference_bank_transaction.csv")

# Load the FastText model
from gensim.models import FastText
fasttext_model = FastText.load("../models/fasttext_model.bin")

In [122]:
def preprocess_new_data(df, fasttext_model, scaler, structured_features, category_columns):
    """Preprocess the new dataset using the same pipeline as training."""
    
    # Drop unnecessary columns (if exist)
    df = df.drop(columns=['client_id', 'bank_id', 'account_id', 'txn_id'], errors='ignore')

    # Convert txn_date to datetime format (if not already in datetime)
    if 'txn_date' in df.columns:
        df['txn_date'] = pd.to_datetime(df['txn_date'], errors='coerce')

        # Extract time-based features
        df['day_of_week'] = df['txn_date'].dt.dayofweek  # Monday=0, Sunday=6
        df['day_of_month'] = df['txn_date'].dt.day  # 1-31
        df['hour'] = df['txn_date'].dt.hour  # Extract hour from transaction time (0-23)
        df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)  # 1=Weekend, 0=Weekday

        # Drop original txn_date column
        df = df.drop(columns=['txn_date'], errors='ignore')

    # Ensure transaction descriptions are cleaned first
    if 'description' in df.columns:
        df['processed_description'] = df['description'].fillna('').apply(clean_normalize_text)
    else:
        raise KeyError("Column 'description' is missing in the dataset!")

    # One-hot encode the category column (if it exists)
    if 'category' in df.columns:
        df = pd.get_dummies(df, columns=['category'], prefix='category')
    else:
        raise KeyError("Column 'category' is missing in the dataset!")

    # Ensure all category columns exist (if missing, add as zeros)
    for col in category_columns:
        if col not in df:
            df[col] = 0  # Add missing category columns as zeros

    # Generate FastText embeddings from cleaned transaction descriptions
    def get_embedding(text):
        words = str(text).split()  # Convert text to words
        word_vectors = [fasttext_model.wv[word] for word in words if word in fasttext_model.wv]
        if len(word_vectors) == 0:
            return np.zeros(100, dtype=np.float32)  # Ensure float32 dtype
        return np.mean(word_vectors, axis=0).astype(np.float32)  # Ensure float32 dtype

    df['fasttext_embedding']= df['processed_description'].apply(get_embedding)
    fasttext_embeddings = np.vstack(df['fasttext_embedding'].values).astype(np.float32)  # Ensure float32

    # Drop raw text columns
    df = df.drop(columns=['description', 'processed_description', 'fasttext_embedding'], errors='ignore')

    # Normalize numerical structured features
    df[structured_features] = scaler.transform(df[structured_features])  
    
    # Reorder category columns to match training order
    df = df[structured_features + category_columns]  

    # **Convert Everything to Float32** to avoid NumPy object dtype issue
    df[structured_features] = df[structured_features].astype(np.float32)
    df[category_columns] = df[category_columns].astype(np.float32)

    # Combine structured features, one-hot category encoding, and FastText embeddings into final feature matrix
    X_new = np.hstack((df[structured_features].values, fasttext_embeddings, df[category_columns].values)).astype(np.float32)
    
    return X_new

In [123]:
# Extract structured numerical features (same as training)
structured_features = ['amount', 'is_interested_investment', 'is_interested_build_credit', 'is_interested_increase_income', 'is_interested_pay_off_debt', 'is_interested_manage_spending', 'is_interested_grow_savings', 'day_of_week', 'day_of_month', 'hour', 'is_weekend']

# Extract category column names from training dataset (ensuring consistent one-hot encoding)
category_columns = [col for col in df.columns if col.startswith("category_")]

# Create standard scaler based on training data
scaler = StandardScaler()
scaler.fit(df[structured_features].values)

# Preprocess new dataset
X_new = preprocess_new_data(new_df, fasttext_model, scaler, structured_features, category_columns)

# Convert to PyTorch tensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_new_tensor = torch.tensor(X_new, dtype=torch.float32).to(device)

# Print shape
print("New dataset shape after preprocessing:", X_new_tensor.shape)

# Split the input into features (first 111 columns) and ground truth (last 33 columns)
X_features = X_new_tensor[:, :111]  # Input to the model
y_true = X_new_tensor[:, 111:]  # Ground truth labels (last 33 columns)

print("Features shape:", X_features.shape)
print("Ground truth shape:", y_true.shape)

  df['txn_date'] = pd.to_datetime(df['txn_date'], errors='coerce')


New dataset shape after preprocessing: torch.Size([115, 144])
Features shape: torch.Size([115, 111])
Ground truth shape: torch.Size([115, 33])




#### Load the Trained Model

In [124]:
import torch
import torch.nn as nn
from model import TransactionClassifier

# Load the trained model 
model_path = "../models/ANN_20e_1e-3lr_4l_classifier.pth" 

# Initialize the model
model = TransactionClassifier(X_features.shape[1], y_true.shape[1])

model.load_state_dict(torch.load(model_path))
model.to(device)
model.eval()  # Set model to evaluation mode

  model.load_state_dict(torch.load(model_path))


TransactionClassifier(
  (fc1): Linear(in_features=111, out_features=256, bias=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU()
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU()
  (dropout2): Dropout(p=0.3, inplace=False)
  (fc3): Linear(in_features=128, out_features=33, bias=True)
  (sigmoid): Sigmoid()
)

In [125]:
# Perform inference
with torch.no_grad():
    outputs = model(X_features)  # Get model predictions
    predicted_categories = (outputs > 0.5).float()  # Convert logits to binary (threshold = 0.5)

# Convert one-hot predictions & ground truth back to category labels
predicted_labels = []
true_labels = []

for pred, true_label in zip(predicted_categories.cpu().numpy(), y_true.cpu().numpy()):  
    predicted_index = pred.argmax()  # Get index of the highest probability category
    true_index = true_label.argmax()  # Get index of the actual category

    predicted_labels.append(category_columns[predicted_index])  # Map index to category column
    true_labels.append(category_columns[true_index])  # Map ground truth index to category column

# Print the results
print("\n===== Model Predictions vs Ground Truth =====")
correct_count = 0

for i, (original, predicted, actual) in enumerate(zip(new_df['description'], predicted_labels, true_labels)):
    is_correct = "✅ Correct" if predicted == actual else "❌ Incorrect"
    if predicted == actual:
        correct_count += 1

    print(f"Transaction {i+1}:")
    print(f"  Description: {original}")
    print(f"  Predicted Category: {predicted}")
    print(f"  Ground Truth: {actual}")
    print(f"  Result: {is_correct}")
    print("-" * 50)

# Calculate overall accuracy
accuracy = correct_count / len(predicted_labels) * 100
print(f"✅ Model Accuracy on New Transactions: {accuracy:.2f}%")


===== Model Predictions vs Ground Truth =====
Transaction 1:
  Description: Earnin           PAYMENT                 Donatas Danyal
  Predicted Category: category_ATM
  Ground Truth: category_Loans
  Result: ❌ Incorrect
--------------------------------------------------
Transaction 2:
  Description: ONLINE TRANSFER FROM NDonatas DanyalDA O CARSON BUSINESS CHECKING 1216 1216
  Predicted Category: category_ATM
  Ground Truth: category_Transfer Credit
  Result: ❌ Incorrect
--------------------------------------------------
Transaction 3:
  Description: MONEY TRANSFER                          AUTHORIZED ON   09/25 FROM Earnin CDAEJ_B                     CA  S583269001208168   111
  Predicted Category: category_ATM
  Ground Truth: category_Loans
  Result: ❌ Incorrect
--------------------------------------------------
Transaction 4:
  Description: ONLINE TRANSFER FROM CARSON N EVERYDAY CHECKING 1216 1216
  Predicted Category: category_Transfer Credit
  Ground Truth: category_Transfer Credit