## Geolocation Fraud Detection Model

In [6]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv('data/geolocation_fraud_dataset.csv')
print(df.info())
print(df.describe())
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 28 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   transaction_id           200000 non-null  int64  
 1   timestamp                200000 non-null  object 
 2   amount                   200000 non-null  float64
 3   currency                 200000 non-null  object 
 4   transaction_type         200000 non-null  object 
 5   country                  198987 non-null  object 
 6   city                     200000 non-null  object 
 7   latitude                 179906 non-null  float64
 8   longitude                179951 non-null  float64
 9   ip_address               200000 non-null  object 
 10  timezone                 200000 non-null  object 
 11  user_id                  200000 non-null  int64  
 12  device_id                189924 non-null  object 
 13  device_type              200000 non-null  object 
 14  oper

#### Data Preprocessing

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
import shap
import joblib
import shap
import os
from imblearn.metrics import geometric_mean_score

ImportError: Numba needs NumPy 2.1 or less. Got NumPy 2.2.

These fields are either:
- Not available during real-time inference (e.g., transaction_id)
- Too high-cardinality for tree models (ip_address, user_agent)
- Not adding predictive value or cause input bloat in Triton

#### Label and Feature Separation

In [13]:
target = "is_fraud"
X = df.drop(columns=[target])
y = df[target]

#### Imputation and Encoding

In [14]:
from sklearn.preprocessing import OrdinalEncoder

categorical_cols = [
    "currency",
    "transaction_type",
    "country",
    "device_type",
    "operating_system",
    "app_version",
    "timezone",
    "ISP",
]

ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X[categorical_cols] = ordinal_encoder.fit_transform(X[categorical_cols])

1. Label/Ordinal Encoding is better for XGBoost, which handles category splits natively.
2. One-hot encoding causes dimensionality explosion, unnecessary with tree models.
3. Triton expects fixed input shape, so we avoid sparse/dynamic one-hot formats.

In [15]:
from sklearn.impute import SimpleImputer

numeric_cols = [
    "amount",
    "avg_spend_30d",
    "transactions_last_7d",
    "time_since_last_login",
    "login_attempts_last_24h",
    "ip_risk_score",
    "latitude",
    "longitude",
]

imputer = SimpleImputer(strategy="median")
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])

- Median imputation is robust to outliers and common for fraud datasets (due to skewed features like amount).
- Keeps the numerical shape consistent for Triton inference.

#### Train-Test Split

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

#### Model Training

In [20]:
from xgboost import XGBClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from imblearn.metrics import geometric_mean_score
import matplotlib.pyplot as plt

scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)

model = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=10,
    max_depth=6,
    learning_rate=0.05,
    n_estimators=500,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    use_label_encoder=False,
)

model.fit(
    X_train,
    y_train,
    verbose=True,
)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:city: object