### Task 2: Model Building and Training
2.1 Data Preparation
Separate features and target variables.

Feature Engineering &
Perform train-test split.

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load data
fraud_data = pd.read_csv('../data/fraud_data.csv')

# Ensure 'purchase_time' is in datetime format
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# Create 'hour_of_day' feature
fraud_data['hour_of_day'] = fraud_data['purchase_time'].dt.hour

# Sort data by user_id and purchase_time
fraud_data = fraud_data.sort_values(by=['user_id', 'purchase_time'])

# Static features for XGBoost
static_features = ['purchase_value', 'age', 'device_id', 'browser', 'source']
X_static = fraud_data[static_features]

# Group by user and create sequences for LSTM
sequential_data = fraud_data.groupby('user_id').apply(lambda x: x[['purchase_value', 'hour_of_day']].values.tolist())
X_sequential = sequential_data.tolist()

# Target variable
y = fraud_data['class'].values

# Train-test split
X_static_train, X_static_test, X_sequential_train, X_sequential_test, y_train, y_test = train_test_split(
    X_static, X_sequential, y, test_size=0.2, random_state=42
)

# Print shapes to verify
print(f"Static features shape: {X_static_train.shape}")
print(f"Sequential features shape: {len(X_sequential_train)} sequences")
print(f"Target variable shape: {y_train.shape}")

Static features shape: (120889, 5)
Sequential features shape: 120889 sequences
Target variable shape: (120889,)


  sequential_data = fraud_data.groupby('user_id').apply(lambda x: x[['purchase_value', 'hour_of_day']].values.tolist())


2.2Train XGBoost on Static Features

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Load data
fraud_data = pd.read_csv('../data/fraud_data.csv')

# Reduce dataset size (use 10% of the data)
fraud_data = fraud_data.sample(frac=0.1, random_state=42)

# Ensure 'purchase_time' is in datetime format
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# Create 'hour_of_day' feature
fraud_data['hour_of_day'] = fraud_data['purchase_time'].dt.hour

# Sort data by user_id and purchase_time
fraud_data = fraud_data.sort_values(by=['user_id', 'purchase_time'])

# Static features for XGBoost
static_features = ['purchase_value', 'age', 'device_id', 'browser', 'source']
X_static = fraud_data[static_features]

# Encode categorical columns using one-hot encoding
categorical_cols = ['device_id', 'browser', 'source']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cols = encoder.fit_transform(X_static[categorical_cols])

# Combine encoded categorical columns with numerical columns
numerical_cols = ['purchase_value', 'age']
X_static_encoded = np.hstack([X_static[numerical_cols].values, encoded_cols])

# Target variable
y = fraud_data['class'].values

# Train-test split
X_static_train, X_static_test, y_train, y_test = train_test_split(
    X_static_encoded, y, test_size=0.2, random_state=42
)

# Print shapes to verify
print(f"Static features shape: {X_static_train.shape}")
print(f"Target variable shape: {y_train.shape}")

# Train XGBoost
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_static_train, y_train)

# Evaluate XGBoost
y_pred_xgb = xgb_model.predict_proba(X_static_test)[:, 1]
print(f"XGBoost AUC-ROC: {roc_auc_score(y_test, y_pred_xgb)}")

Static features shape: (12088, 14778)
Target variable shape: (12088,)
XGBoost AUC-ROC: 0.5935361087599402


### Train LSTM on Sequential Features

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np

# Convert input data to float32
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

# Calculate max_length for padding
max_length = max(len(seq) for seq in X_train)

# Pad sequences
X_train_padded = pad_sequences(X_train, maxlen=max_length, padding='post', dtype='float32')
X_test_padded = pad_sequences(X_test, maxlen=max_length, padding='post', dtype='float32')

# Build the model
input_layer = Input(shape=(max_length, X_train.shape[-1]))
masking_layer = Masking(mask_value=0.0)(input_layer)
lstm_layer = LSTM(64, return_sequences=False)(masking_layer)
dense_1 = Dense(32, activation='relu')(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dense_1)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_padded, y_train,
                   epochs=10,
                   batch_size=32,
                   validation_split=0.2,
                   verbose=1)

# Evaluate the model
y_pred = model.predict(X_test_padded)
auc_roc = roc_auc_score(y_test, y_pred)
print(f'AUC-ROC Score: {auc_roc}')

ModuleNotFoundError: No module named 'tensorflow.keras'

### Combine Outputs Using a Meta-Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Combine predictions from XGBoost and LSTM
X_meta_train = np.column_stack((y_pred_xgb, y_pred_lstm.flatten()))

# Train meta-model (logistic regression)
meta_model = LogisticRegression()
meta_model.fit(X_meta_train, y_test)

# Evaluate hybrid model
y_pred_hybrid = meta_model.predict_proba(X_meta_train)[:, 1]
print(f"Hybrid Model AUC-ROC: {roc_auc_score(y_test, y_pred_hybrid)}")

Model Evaluation
Advanced Evaluation Metrics
We'll evaluate the model using:

Confusion Matrix

Precision, Recall, F1-Score

AUC-ROC

Precision-Recall Curve

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    auc
)
import matplotlib.pyplot as plt

# Confusion Matrix
cm = confusion_matrix(y_test, (y_pred_xgb > 0.5).astype(int))
print("Confusion Matrix:")
print(cm)

# Classification Report
print("Classification Report:")
print(classification_report(y_test, (y_pred_xgb > 0.5).astype(int)))

# AUC-ROC
roc_auc = roc_auc_score(y_test, y_pred_xgb)
print(f"AUC-ROC: {roc_auc}")

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_xgb)
pr_auc = auc(recall, precision)
print(f"Precision-Recall AUC: {pr_auc}")

# Plot Precision-Recall Curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'Precision-Recall Curve (AUC = {pr_auc:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

 MLOps Implementation
Experiment Tracking with MLflow
We'll use MLflow to track experiments, log metrics, and save models.

In [None]:
import mlflow
import mlflow.sklearn

# Start an MLflow run
with mlflow.start_run():
    # Log parameters
    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("test_size", 0.2)
    
    # Log metrics
    mlflow.log_metric("AUC-ROC", roc_auc)
    mlflow.log_metric("Precision-Recall AUC", pr_auc)
    
    # Log the model
    mlflow.sklearn.log_model(xgb_model, "xgboost_model")
    
    # Log artifacts (e.g., plots)
    plt.savefig("precision_recall_curve.png")
    mlflow.log_artifact("precision_recall_curve.png")