In [1]:
# 02-Model-Training-9.ipynb (Deep Learning Model Training Notebook 2)

# ======================================================================
# CELL 1: Imports and Environment Checks
# ======================================================================
import pandas as pd
import numpy as np
import os

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers

# For data splitting and preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler

print("TensorFlow version:", tf.__version__)
print("Is GPU available?:", tf.config.list_physical_devices('GPU'))

TensorFlow version: 2.16.2
Is GPU available?: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# ======================================================================
# CELL 2: Load the synthetic data (Users, Enriched)
# ======================================================================
CSV_FILE = r"/Users/harshil/Development/personal_projects/VeriShield-ML-Experiments/data_generators/synthetic_users_enriched.csv"

if not os.path.exists(CSV_FILE):
    raise FileNotFoundError(f"Cannot find '{CSV_FILE}'. Please run the data generator first.")

df_users = pd.read_csv(CSV_FILE)
print("Data loaded. Shape:", df_users.shape)
print("Columns:", df_users.columns.tolist())

Data loaded. Shape: (1500000, 16)
Columns: ['user_id', 'name', 'email', 'username', 'birthdate', 'gender', 'signup_ip', 'device_id', 'phone', 'country_code', 'created_at', 'burst_signup', 'fraud_label', 'email_domain', 'ip_count', 'num_fraud_biz_owned']


In [3]:
# ======================================================================
# CELL 3: Inspect and handle missing data
# ======================================================================
print("\n=== Missing values per column ===\n", df_users.isnull().sum())

initial_len = len(df_users)
if 'fraud_label' not in df_users.columns:
    raise ValueError("Missing 'fraud_label' in data; cannot proceed.")

# Ensure we only keep rows with a known label
df_users.dropna(subset=['fraud_label'], inplace=True)
after_drop_label_len = len(df_users)

# Fill missing in typical string columns
string_cols = ['name', 'email', 'username', 'phone', 'country_code', 'email_domain']
for col in string_cols:
    if col in df_users.columns:
        df_users[col] = df_users[col].fillna("missing")

print(f"\nDropped {initial_len - after_drop_label_len} rows due to missing fraud_label.")
print("Now shape:", df_users.shape)
print("\n=== Missing after fill ===\n", df_users.isnull().sum())


=== Missing values per column ===
 user_id                    0
name                   29837
email                  29912
username                   0
birthdate                  0
gender                     0
signup_ip                  0
device_id                  0
phone                  30093
country_code           37455
created_at                 0
burst_signup               0
fraud_label                0
email_domain               0
ip_count                   0
num_fraud_biz_owned        0
dtype: int64

Dropped 0 rows due to missing fraud_label.
Now shape: (1500000, 16)

=== Missing after fill ===
 user_id                0
name                   0
email                  0
username               0
birthdate              0
gender                 0
signup_ip              0
device_id              0
phone                  0
country_code           0
created_at             0
burst_signup           0
fraud_label            0
email_domain           0
ip_count               0
num_fraud_biz_

In [4]:
# ======================================================================
# CELL 4: Feature Engineering
# ======================================================================
df = df_users.copy()

# 1) Convert 'gender' to 0(F)/1(M)
if 'gender' in df.columns:
    df['gender'] = df['gender'].map({'F': 0, 'M': 1}).fillna(0)

# 2) phone_len + phone_suspicious
df['phone_len'] = df['phone'].apply(len)
df['phone_suspicious'] = df['phone'].apply(
    lambda x: 1 if any(s in x for s in ["+999", "666-666"]) else 0
)

# 3) country_code_enc
if 'country_code' in df.columns:
    le_country = LabelEncoder()
    df['country_code_enc'] = le_country.fit_transform(df['country_code'].astype(str))
else:
    df['country_code_enc'] = 0

# 4) ip_private
if 'signup_ip' in df.columns:
    df['ip_private'] = df['signup_ip'].apply(
        lambda ip: 1 if ip.startswith("192.168") or ip.startswith("10.") else 0
    )
else:
    df['ip_private'] = 0

# 5) birth_year
def extract_year(date_str):
    try:
        return int(date_str.split("-")[0])
    except:
        return 1970
df['birth_year'] = df['birthdate'].apply(extract_year)

# 6) signup_dayofyear
df['signup_dayofyear'] = pd.to_datetime(df['created_at'], errors='coerce').dt.dayofyear.fillna(0)

# 7) email_domain_enc
if 'email_domain' in df.columns:
    le_domain = LabelEncoder()
    df['email_domain_enc'] = le_domain.fit_transform(df['email_domain'].astype(str))
else:
    df['email_domain_enc'] = 0

# 8) ip_count -> numeric
df['ip_count'] = df['ip_count'].fillna(0).astype(int)

# 9) num_fraud_biz_owned -> numeric
df['num_fraud_biz_owned'] = df['num_fraud_biz_owned'].fillna(0).astype(int)

# 10) optional device_id + burst_signup synergy (if you have them)
if 'device_id' in df.columns:
    # e.g., device collision count
    dev_counts = df.groupby('device_id')['user_id'].transform('count')
    df['device_collision'] = dev_counts
else:
    df['device_collision'] = 0

if 'burst_signup' in df.columns:
    df['burst_signup_flag'] = df['burst_signup'].astype(int)
else:
    df['burst_signup_flag'] = 0

df['synergy_burst_phone'] = df['phone_suspicious'] * df['burst_signup_flag']
df['multi_collision'] = ((df['ip_count'] > 2) & (df['device_collision'] > 1)).astype(int)

# Finalize feature set
target_col = 'fraud_label'
y = df[target_col].values

candidate_features = [
    'gender', 'phone_len', 'phone_suspicious', 'country_code_enc',
    'ip_private', 'birth_year', 'signup_dayofyear', 'email_domain_enc',
    'ip_count', 'num_fraud_biz_owned', 'device_collision',
    'burst_signup_flag', 'synergy_burst_phone', 'multi_collision'
]
feature_cols = [col for col in candidate_features if col in df.columns]
X = df[feature_cols].values

print("\nFeatures used:", feature_cols)
print("Feature matrix shape:", X.shape)
print("Target shape:", y.shape)


Features used: ['gender', 'phone_len', 'phone_suspicious', 'country_code_enc', 'ip_private', 'birth_year', 'signup_dayofyear', 'email_domain_enc', 'ip_count', 'num_fraud_biz_owned', 'device_collision', 'burst_signup_flag', 'synergy_burst_phone', 'multi_collision']
Feature matrix shape: (1500000, 14)
Target shape: (1500000,)


In [5]:
# ======================================================================
# CELL 5: Train/Test Split
# ======================================================================
test_size = 0.2
random_state = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=test_size,
    random_state=random_state,
    stratify=y
)

print("\nX_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape, "y_test shape:", y_test.shape)

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Number of features:", X_train_scaled.shape[1])


X_train shape: (1200000, 14) y_train shape: (1200000,)
X_test shape: (300000, 14) y_test shape: (300000,)
Number of features: 14


In [6]:
# ======================================================================
# CELL 6: Imbalance Handling
# ======================================================================
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train_scaled, y_train)

print("\nOriginal training set shape:", X_train_scaled.shape, 
      f"Fraud ratio: {y_train.mean():.4f}")
print("Resampled training set shape:", X_train_res.shape, 
      f"Fraud ratio: {y_train_res.mean():.4f}")

# Alternatively, consider weighting or focal loss rather than oversampling.
print(X_train_res.shape[1])


Original training set shape: (1200000, 14) Fraud ratio: 0.3821
Resampled training set shape: (1482972, 14) Fraud ratio: 0.5000
14


In [7]:
# ======================================================================
# CELL 7: Define Model w/ Additional Regularization
# ======================================================================
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
# import tensorflow_addons as tfa

# Optionally: SigmoidFocalCrossEntropy for advanced imbalance
# focal_loss = tfa.losses.SigmoidFocalCrossEntropy(gamma=2.0)

model = keras.Sequential(name="FraudDetectionNet")

# Example: embedding-like approach if some columns are large cardinalities
# For brevity, we'll do a normal MLP
model.add(layers.Input(shape=(X_train_res.shape[1],)))

# Possibly add more advanced architecture
for i in range(4):
    model.add(layers.Dense(128, activation='relu', 
                           kernel_regularizer=regularizers.l2(1e-5)))
    model.add(layers.Dropout(0.3))

model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    # loss=focal_loss,  # if you prefer focal
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

2025-01-18 18:10:54.508012: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2025-01-18 18:10:54.508041: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-01-18 18:10:54.508058: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-01-18 18:10:54.508080: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-18 18:10:54.508097: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
# ======================================================================
# CELL 8: Training Configuration
# ======================================================================
callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-6
    )
]

epochs = 50
batch_size = 512

history = model.fit(
    X_train_res, y_train_res,
    validation_data=(X_test_scaled, y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
    callbacks=callbacks_list
)

print("\nTraining complete.")

Epoch 1/50


2025-01-18 18:11:15.726866: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m2897/2897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 14ms/step - accuracy: 0.5579 - loss: 0.6967 - val_accuracy: 0.5684 - val_loss: 0.6832 - learning_rate: 0.0010
Epoch 2/50
[1m2897/2897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 14ms/step - accuracy: 0.5741 - loss: 0.6843 - val_accuracy: 0.5705 - val_loss: 0.6772 - learning_rate: 0.0010
Epoch 3/50
[1m2897/2897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 14ms/step - accuracy: 0.5742 - loss: 0.6834 - val_accuracy: 0.5682 - val_loss: 0.6913 - learning_rate: 0.0010
Epoch 4/50
[1m2897/2897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 14ms/step - accuracy: 0.5754 - loss: 0.6827 - val_accuracy: 0.5684 - val_loss: 0.6804 - learning_rate: 0.0010
Epoch 5/50
[1m2897/2897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 14ms/step - accuracy: 0.5770 - loss: 0.6819 - val_accuracy: 0.5684 - val_loss: 0.6812 - learning_rate: 5.0000e-04

Training complete.


In [9]:
# ======================================================================
# CELL 9: Evaluation
# ======================================================================
# Evaluate final model on test set
test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

# Predict probabilities
y_pred_prob = model.predict(X_test_scaled, batch_size=batch_size).ravel()

# 9.1 Evaluate multiple thresholds
def evaluate_threshold(thresh):
    y_pred_t = (y_pred_prob >= thresh).astype(int)
    acc = (y_pred_t == y_test).mean()
    precision = 0
    recall = 0
    try:
        from sklearn.metrics import precision_score, recall_score, f1_score
        precision = precision_score(y_test, y_pred_t, zero_division=0)
        recall = recall_score(y_test, y_pred_t, zero_division=0)
        f1 = f1_score(y_test, y_pred_t, zero_division=0)
    except:
        f1 = 0

    print(f"\nThreshold={thresh:.2f} => Accuracy={acc:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, F1={f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_t))

for thresh in [0.3, 0.4, 0.5, 0.6, 0.7]:
    evaluate_threshold(thresh)

# 9.2: Additional metrics
ap_score = average_precision_score(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(f"\nAverage Precision (AUC-PR) on test set: {ap_score:.4f}")
print(f"ROC AUC on test set: {roc_auc:.4f}")

# Summaries
final_epoch = len(history.history['loss'])
best_val_idx = np.argmin(history.history['val_loss'])
print(f"\nTraining ended after {final_epoch} total epochs. Best val_loss epoch: {best_val_idx + 1}")
print(f"Final Train Loss: {history.history['loss'][-1]:.4f} | Final Val Loss: {history.history['val_loss'][-1]:.4f}")


Test Loss: 0.6772, Test Accuracy: 0.5705
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

Threshold=0.30 => Accuracy=0.3821, Precision=0.3821, Recall=1.0000, F1=0.5529
Confusion Matrix:
 [[     0 185371]
 [     0 114629]]

Threshold=0.40 => Accuracy=0.4030, Precision=0.3864, Recall=0.9560, F1=0.5503
Confusion Matrix:
 [[ 11312 174059]
 [  5042 109587]]

Threshold=0.50 => Accuracy=0.5705, Precision=0.4521, Recall=0.5853, F1=0.5102
Confusion Matrix:
 [[104062  81309]
 [ 47533  67096]]

Threshold=0.60 => Accuracy=0.6225, Precision=0.5944, Recall=0.0375, F1=0.0706
Confusion Matrix:
 [[182434   2937]
 [110325   4304]]

Threshold=0.70 => Accuracy=0.6179, Precision=0.0000, Recall=0.0000, F1=0.0000
Confusion Matrix:
 [[185371      0]
 [114629      0]]

Average Precision (AUC-PR) on test set: 0.4505
ROC AUC on test set: 0.5812

Training ended after 5 total epochs. Best val_loss epoch: 2
Final Train Loss: 0.6819 | Final Val Loss: 0.6812
