In [1]:
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import log_loss, roc_auc_score
import pandas as pd
import os

path = "C:/Users/ericw/OneDrive/桌面"
os.chdir(path)
data = pd.read_csv("df_final.csv")

In [2]:
# Adjusting the target variable
data['y_binary'] = (data['product_action_pageview_purchase'] >= 1).astype(int)

# Identifying feature types
dense_features = ['number_of_seen_url', 'product_action_event_add', 'product_action_event_click', 'product_action_event_remove', 'product_action_pageview_detail', 'engagement_duration']
sparse_features = ['product_skus_hash', 'day_of_week', 'hour_of_first_interaction', 'hour_of_last_interaction']

# Label encode sparse features
from sklearn.preprocessing import LabelEncoder

for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# Normalize dense features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data[dense_features] = scaler.fit_transform(data[dense_features])

# Prepare features for DeepCTR
X = data[sparse_features + dense_features]
y_binary = data['y_binary']

In [3]:
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Define the feature columns for DeepFM
sparse_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                          for i, feat in enumerate(sparse_features)]
dense_feature_columns = [DenseFeat(feat, 1,)
                         for feat in dense_features]

feature_columns = sparse_feature_columns + dense_feature_columns
feature_names = get_feature_names(feature_columns)

# Convert the dataset into a format that can be fed into DeepFM
train_model_input = {name: X_train[name] for name in feature_names}
test_model_input = {name: X_test[name] for name in feature_names}

# Build, compile, and train the model
model = DeepFM(feature_columns, feature_columns, task='binary')
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(train_model_input, y_train, batch_size=256, epochs=10, verbose=2, validation_split=0.2)

# Evaluate the model
eval_result = model.evaluate(test_model_input, y_test, batch_size=256)
print("\nTest loss:", eval_result[0])
print("Test accuracy:", eval_result[1])

Epoch 1/10
3802/3802 - 10s - loss: 0.0036 - accuracy: 0.9991 - val_loss: 0.0010 - val_accuracy: 0.9998
Epoch 2/10
3802/3802 - 9s - loss: 3.7927e-04 - accuracy: 0.9999 - val_loss: 6.4029e-04 - val_accuracy: 0.9999
Epoch 3/10
3802/3802 - 9s - loss: 2.8941e-04 - accuracy: 0.9999 - val_loss: 5.1224e-04 - val_accuracy: 0.9999
Epoch 4/10
3802/3802 - 9s - loss: 2.4373e-04 - accuracy: 0.9999 - val_loss: 4.8743e-04 - val_accuracy: 0.9999
Epoch 5/10
3802/3802 - 9s - loss: 2.1372e-04 - accuracy: 0.9999 - val_loss: 3.3448e-04 - val_accuracy: 0.9999
Epoch 6/10
3802/3802 - 9s - loss: 2.4285e-04 - accuracy: 0.9999 - val_loss: 3.9535e-04 - val_accuracy: 0.9999
Epoch 7/10
3802/3802 - 9s - loss: 2.0201e-04 - accuracy: 0.9999 - val_loss: 5.4732e-04 - val_accuracy: 0.9999
Epoch 8/10
3802/3802 - 9s - loss: 1.7024e-04 - accuracy: 1.0000 - val_loss: 0.0010 - val_accuracy: 0.9999
Epoch 9/10
3802/3802 - 9s - loss: 1.6332e-04 - accuracy: 1.0000 - val_loss: 4.8747e-04 - val_accuracy: 0.9999
Epoch 10/10
3802/3802

In [4]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

y_pred_probs = model.predict(test_model_input, batch_size=256)
# Convert probabilities to binary predictions based on a 0.5 threshold
y_pred = (y_pred_probs > 0.5).astype("int32")

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate precision, recall, F1-score, and support
clf_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(clf_report)


Confusion Matrix:
[[302217     20]
 [     8   1902]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    302237
           1       0.99      1.00      0.99      1910

    accuracy                           1.00    304147
   macro avg       0.99      1.00      1.00    304147
weighted avg       1.00      1.00      1.00    304147



In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# Define the K-fold Cross Validator
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize lists to store results for each fold
roc_auc_scores = []

# Iterate over each fold
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training on fold {fold+1}...")
    
    # Split data into training and validation sets
    X_train_fold, y_train_fold = X.iloc[train_idx], y_binary.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y_binary.iloc[val_idx]
    
    # Prepare model input
    train_model_input = {name: X_train_fold[name] for name in feature_names}
    val_model_input = {name: X_val_fold[name] for name in feature_names}
    
    # Define the DeepFM model
    model = DeepFM(feature_columns, feature_columns, task='binary')
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model
    model.fit(train_model_input, y_train_fold, batch_size=256, epochs=10, verbose=2)
    
    # Predict on the validation set
    y_pred_fold = model.predict(val_model_input, batch_size=256)
    
    # Calculate the ROC-AUC score and append to the list
    roc_auc = roc_auc_score(y_val_fold, y_pred_fold)
    roc_auc_scores.append(roc_auc)
    print(f"Fold {fold+1} ROC-AUC: {roc_auc}")

# Calculate the mean and standard deviation of the ROC-AUC scores
mean_roc_auc = np.mean(roc_auc_scores)
std_dev_roc_auc = np.std(roc_auc_scores)

print(f"Mean ROC-AUC: {mean_roc_auc}")
print(f"Standard Deviation of ROC-AUC: {std_dev_roc_auc}")

Training on fold 1...
Epoch 1/10
4753/4753 - 10s - loss: 0.0031 - accuracy: 0.9993
Epoch 2/10
4753/4753 - 9s - loss: 3.6922e-04 - accuracy: 0.9999
Epoch 3/10
4753/4753 - 9s - loss: 2.7276e-04 - accuracy: 0.9999
Epoch 4/10
4753/4753 - 9s - loss: 2.5517e-04 - accuracy: 0.9999
Epoch 5/10
4753/4753 - 9s - loss: 2.2940e-04 - accuracy: 0.9999
Epoch 6/10
4753/4753 - 9s - loss: 2.1873e-04 - accuracy: 0.9999
Epoch 7/10
4753/4753 - 9s - loss: 2.3861e-04 - accuracy: 0.9999
Epoch 8/10
4753/4753 - 9s - loss: 1.7260e-04 - accuracy: 1.0000
Epoch 9/10
4753/4753 - 9s - loss: 1.6092e-04 - accuracy: 1.0000
Epoch 10/10
4753/4753 - 9s - loss: 1.4550e-04 - accuracy: 1.0000
Fold 1 ROC-AUC: 0.9999987666140508
Training on fold 2...
Epoch 1/10
4753/4753 - 11s - loss: 0.0035 - accuracy: 0.9995
Epoch 2/10
4753/4753 - 10s - loss: 3.3133e-04 - accuracy: 0.9999
Epoch 3/10
4753/4753 - 10s - loss: 2.8857e-04 - accuracy: 0.9999
Epoch 4/10
4753/4753 - 10s - loss: 2.6692e-04 - accuracy: 0.9999
Epoch 5/10
4753/4753 - 10s 