In [21]:
import torch
import deepctr_torch
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
torch.cuda.is_available()

True

In [None]:
# data = pd.read_csv('./criteo_sample.txt')
sparse_features = ['C' + str(i) for i in range(1, 27)] # 稀疏矩阵
dense_features = ['I' + str(i) for i in range(1, 14)] # 密集矩阵
target = ['label']

# dtype_dict ={'label': int}
# dtype_dict.update({feat: str for feat in sparse_features})
# dtype_dict.update({feat: float for feat in dense_features})

data = pd.read_csv("train.txt", sep='\t', header=None, names=target + dense_features + sparse_features)
data.head()

ParserError: Error tokenizing data. C error: Expected 40 fields in line 27025611, saw 42


In [None]:
data[sparse_features] = data[sparse_features].fillna('-1', ) # NaN填充-1
data[dense_features] = data[dense_features].fillna(0, ) # NaNo填充0


print('数据大小',data.shape)
data.head()

In [None]:
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
data.head()

In [None]:
mms = MinMaxScaler(feature_range=(0,1))
data[dense_features] = mms.fit_transform(data[dense_features])
data.head()

定义特征配置

In [None]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=10,dtype='float32') # 嵌入维度10
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]
fixlen_feature_columns

In [None]:
fixlen_feature_columns[0]

In [None]:
# 获取模型输入特征名
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names.__len__()

In [None]:
# train, test = train_test_split(data, test_size=0.01)
# 构造模型输入
train_model_input = {name: data[name].values for name in feature_names}
# test_model_input = {name: test[name].values for name in feature_names}
train_model_input

In [None]:
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'



model = deepctr_torch.models.xDeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    dnn_hidden_units=[400, 400, 400, 400],  # 论文中提到的DNN层神经元数量
    cin_layer_size=[200, 200, 50],         # Criteo数据集上的CIN层设置?????
    cin_split_half=True,
    cin_activation='relu',
    l2_reg_linear=0.0001,              
    l2_reg_embedding=0.0001,
    l2_reg_dnn=0.0001,                 # DNN部分L2正则化
    l2_reg_cin=0.0001,                 # CIN部分L2正则化
    init_std=0.0001,
    seed=1024,
    dnn_dropout=0.0,                   
    dnn_activation='relu',
    dnn_use_bn=False,
    task='binary',
    device='cuda:0' if torch.cuda.is_available() else 'cpu'
)
model

In [None]:
total_params = sum(p.numel() for p in model.parameters())
print(f'Total parameters: {total_params}')

# 计算嵌入表参数
embedding_params = 0
linear_embedding_params = 0

print("\n--- Embedding Parameters Details ---")
for name, param in model.named_parameters():
    if 'embedding' in name:
        # print(f"{name}: {param.shape}") # 可选：打印每个嵌入表的形状
        if 'linear_model' in name:
            linear_embedding_params += param.numel()
        else:
            embedding_params += param.numel()

print(f"Deep Side Embedding Params: {embedding_params}")
print(f"Linear Side Embedding Params: {linear_embedding_params}")
print(f"Total Embedding Params: {embedding_params + linear_embedding_params}")
print(f"Embedding Ratio: {(embedding_params + linear_embedding_params)/total_params:.2%}")

In [None]:
from torch.utils.tensorboard import SummaryWriter
import torch.utils.data as Data
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import time

# 1. 配置
writer = SummaryWriter('./logs', comment="mean_dropout0_1223_")
batch_size = 4096
epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# loss_func = torch.nn.BCELoss(reduction='sum')
loss_func = torch.nn.BCELoss()
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# 2. 数据预处理 (完全复刻 BaseModel.fit 的逻辑)
# deepctr_torch 的核心逻辑是将所有特征拼接成一个大的 Tensor，然后内部根据 feature_index 切分
print("Processing data...")
if isinstance(train_model_input, dict):
    x_train_list = [train_model_input[feature] for feature in model.feature_index]
else:
    x_train_list = train_model_input

# 确保每个特征都是 2D 的 (N, 1)
for i in range(len(x_train_list)):
    if len(x_train_list[i].shape) == 1:
        x_train_list[i] = np.expand_dims(x_train_list[i], axis=1)

# 拼接成一个大的 Numpy Array [Samples, Features]
x_train_full = np.concatenate(x_train_list, axis=-1)
y_train_full = data[target].values

# 3. 划分训练集和验证集
x_train, x_val, y_train, y_val = train_test_split(
    x_train_full, y_train_full, test_size=0.2, random_state=2025, shuffle = False
)

# 4. 构建 DataLoader
# 使用 TensorDataset 和 DataLoader 是最稳健的方式，避免了手动切片的各种问题
train_dataset = Data.TensorDataset(
    torch.from_numpy(x_train), 
    torch.from_numpy(y_train)
)
train_loader = DataLoader(
    dataset=train_dataset, shuffle=True, batch_size=batch_size
)

val_dataset = Data.TensorDataset(
    torch.from_numpy(x_val), 
    torch.from_numpy(y_val)
)
val_loader = DataLoader(
    dataset=val_dataset, shuffle=False, batch_size=batch_size
)

print(f"Start training on {device}...")
print(f"Train samples: {len(x_train)}, Validation samples: {len(x_val)}")
global_step = 0

# 5. 训练循环
for epoch in range(epochs):
    # --- Training Phase ---
    model.train()
    train_loss_sum = 0.0
    train_BCEloss = 0.0
    train_preds = []
    train_targets = []
    
    with tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Train]') as pbar:
        for x, y in pbar:
            x = x.to(device).float() # deepctr_torch 要求输入为 float，内部会处理类型转换
            y = y.to(device).float()
            
            optimizer.zero_grad()
            y_pred = model(x).squeeze()
            
            loss = loss_func(y_pred, y.squeeze())
            
            # 加上正则化损失 (参考 BaseModel.fit)
            reg_loss = model.get_regularization_loss()
            total_loss = loss + reg_loss
            
            total_loss.backward()
            optimizer.step()
            
            # 记录
            current_loss = total_loss.item()
            train_loss_sum += current_loss
            train_BCEloss += loss.item()
            writer.add_scalar('TotalLoss/train_batch', current_loss, global_step)
            writer.add_scalar('Loss/train_batch', loss.item(), global_step)
            global_step += 1
            
            # 收集训练集预测结果
            train_preds.extend(y_pred.detach().cpu().numpy())
            train_targets.extend(y.cpu().numpy())
            
            pbar.set_postfix({'loss': f'{current_loss:.4f}'})

    avg_train_loss = train_loss_sum / len(train_loader)
    avg_train_BCEloss = train_BCEloss / len(train_loader)
    train_auc = roc_auc_score(train_targets, train_preds)
    train_acc = accuracy_score(train_targets, np.where(np.array(train_preds) >= 0.5, 1, 0))
    
    writer.add_scalar('Loss/train_epoch', avg_train_loss, epoch)
    writer.add_scalar('Metric/train_auc', train_auc, epoch)
    writer.add_scalar('Metric/train_acc', train_acc, epoch)
    writer.add_scalar('BCELoss/train_epoch', avg_train_BCEloss, epoch)

    # --- Validation Phase ---
    model.eval()
    val_loss_sum = 0.0
    val_preds = []
    val_targets = []
    
    # Reset max memory tracking for this epoch's validation
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        
    val_start_time = time.time()
    
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device).float()
            y = y.to(device).float()
            
            y_pred = model(x).squeeze()
            loss = loss_func(y_pred, y.squeeze())
            val_loss_sum += loss.item()
            
            val_preds.extend(y_pred.cpu().numpy())
            val_targets.extend(y.cpu().numpy())

    val_end_time = time.time()
    val_inference_time = val_end_time - val_start_time
    
    if torch.cuda.is_available():
        max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 # MB
    else:
        max_memory = 0

    avg_val_loss = val_loss_sum / len(val_loader)
    val_auc = roc_auc_score(val_targets, val_preds)
    val_acc = accuracy_score(val_targets, np.where(np.array(val_preds) >= 0.5, 1, 0))
    
    print(f"Epoch {epoch+1} Result: Train Loss: {avg_train_loss:.4f} Train AUC: {train_auc:.4f} Train ACC:{train_acc:.4f}| Val Loss: {avg_val_loss:.4f} Val AUC: {val_auc:.4f} Val ACC:{val_acc:.4f}")
    print(f"Validation Inference Time: {val_inference_time:.4f}s | Peak GPU Memory: {max_memory:.2f} MB")
    
    writer.add_scalar('Loss/val_epoch', avg_val_loss, epoch)
    writer.add_scalar('Metric/val_auc', val_auc, epoch)
    writer.add_scalar('Metric/val_acc', val_acc, epoch)
    writer.add_scalar('Performance/val_inference_time', val_inference_time, epoch)
    writer.add_scalar('Performance/val_peak_memory', max_memory, epoch)

writer.close()
print("Training finished.")

torch.save(model.state_dict(), 'xDeepFM_criteo_drop0.pth')

In [None]:

# pred_ans = model.predict(test_model_input, batch_size=256)
# auc_score = roc_auc_score(test[target].values, pred_ans)
# print(f"Test AUC: {auc_score:.4f}")

In [None]:
model.compile(
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001), # Adam，学习率0.001
    
    loss="binary_crossentropy",
    metrics=["binary_crossentropy", "auc"]
)

# 模型训练
history = model.fit(
    train_model_input,
    data[target].values,
    batch_size=4096,
    epochs=10,
    verbose=1,
    validation_split=0.2,
   ) 

In [None]:
import matplotlib.pyplot as plt
plt.plot(range(1,11),history.history['auc'],label='train',marker='o')
plt.plot(range(1,11),history.history['val_auc'],label='val',marker='o')
plt.legend()
plt.xlabel('epoch')
plt.ylabel('auc')
plt.title('xDeepFM auc')
plt.savefig('xDeepFM_auc.png')
plt.show()

In [None]:




plt.plot(range(1,11),history.history['binary_crossentropy'],label='train',marker='o')
plt.plot(range(1,11),history.history['val_binary_crossentropy'],label='val',marker='o')
plt.legend()
plt.xlabel('epoch')
plt.ylabel('binary_crossentropy')
plt.title('xDeepFM binary_crossentropy')
plt.savefig('xDeepFM_binary_crossentropy.png')
plt.show()