In [46]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score,roc_auc_score

### 1. 定义模型

In [47]:
class DeepFM(nn.Module):
    """DeepFM model over categorical fields.

    The prediction is the sigmoid of the sum of three terms computed from the
    same integer-encoded input:

    * an FM first-order term (one scalar weight per (field, value) pair),
    * an FM second-order term (pairwise interactions of the field embeddings),
    * a DNN applied to the concatenated field embeddings.
    """

    def __init__(self, feature_sizes, embedding_dim=10, hidden_dims=[64, 32], num_classes=1,dropout = 0.2):
        """
        feature_sizes: number of distinct values of each categorical field
        embedding_dim: size of every field embedding
        hidden_dims: widths of the DNN hidden layers (may be empty, in which
            case the output layer is applied directly to the embeddings)
        num_classes: output width (1 for binary classification)
        dropout: dropout probability applied after every hidden layer
        """
        super(DeepFM, self).__init__()
        self.feature_sizes = feature_sizes
        self.num_fields = len(feature_sizes)
        self.embedding_dim = embedding_dim
        # Work on a private copy so the (mutable) default list is never shared.
        hidden_dims = list(hidden_dims)

        # FM first-order term: a single table holding one scalar per
        # (field, value) pair; field i owns rows
        # [field_offsets[i], field_offsets[i] + feature_sizes[i]).
        self.linear = nn.Embedding(sum(feature_sizes) + 1, 1)

        # Start row of every field inside `self.linear`. Without these offsets
        # value k of every field would hit the same row k and the fields would
        # silently share their first-order weights.
        # Non-persistent: it is derived from feature_sizes, so the state_dict
        # layout stays identical to checkpoints saved without it.
        offsets, running_total = [], 0
        for size in feature_sizes:
            offsets.append(running_total)
            running_total += size
        self.register_buffer(
            "field_offsets",
            torch.tensor(offsets, dtype=torch.long),
            persistent=False,
        )

        # Embeddings shared by the FM second-order term and the DNN
        self.embedding = nn.ModuleList([
            nn.Embedding(dim, embedding_dim)
            for dim in feature_sizes
        ])

        # DNN part: Linear -> BatchNorm -> ReLU -> Dropout per hidden layer
        dnn_input_dim = self.num_fields * embedding_dim
        self.dnn = nn.Sequential()
        for i, hidden_dim in enumerate(hidden_dims):
            self.dnn.add_module(
                name = f"fc_{i}",
                module = nn.Linear(dnn_input_dim, hidden_dim)
            )
            self.dnn.add_module(
                name = f"bn_{i}",
                module = nn.BatchNorm1d(hidden_dim)
            )
            self.dnn.add_module(
                name = f"relu_{i}",
                module = nn.ReLU()
            )
            self.dnn.add_module(
                name = f"dropout_{i}",
                module = nn.Dropout(dropout)
            )
            dnn_input_dim = hidden_dim

        # Final output layer. `dnn_input_dim` now holds the width of the last
        # hidden layer (or of the raw embeddings when hidden_dims is empty).
        self.dnn_output = nn.Linear(dnn_input_dim, num_classes)

        # Initialise weights
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-normal init for every Embedding/Linear weight; Linear biases set to 0."""
        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.xavier_normal_(m.weight)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)  # in-place
                nn.init.constant_(m.bias, 0)

    def forward(self,x):
        """
        x: integer feature indices [batch_size, num_fields] (LongTensor);
           column i is used to index the i-th field embedding.

        Returns sigmoid probabilities; with num_classes == 1 the trailing
        dimension is squeezed away, giving shape [batch_size].
        """

        # FM first-order term: shift each column into its own slice of the table
        linear_part = torch.sum(self.linear(x + self.field_offsets), dim = 1) #[batch_size, 1]

        # Per-field embedding lookup
        embeds = []
        for i in range(self.num_fields):
            embed = self.embedding[i](x[:, i]) #[batch_size, embedding_dim]
            embeds.append(embed)

        # FM second-order term via the simplified identity
        # 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), summed over the embedding dim
        stacked = torch.stack(embeds, dim = 1) #[batch_size, num_fields, embedding_dim]
        square_of_sum = stacked.sum(dim = 1).pow(2) #[batch_size, embedding_dim]
        sum_of_square = stacked.pow(2).sum(dim = 1) #[batch_size, embedding_dim]
        fm_second_order = 0.5 * (square_of_sum - sum_of_square).sum(dim = 1, keepdim = True) #[batch_size, 1]

        # DNN part
        dnn_input = torch.cat(embeds, dim = 1) #[batch_size, num_fields * embedding_dim]
        dnn_output = self.dnn(dnn_input) #[batch_size, last hidden width]
        dnn_final_output = self.dnn_output(dnn_output) #[batch_size, num_classes]

        # Combine the three terms and map to a probability
        output = linear_part + fm_second_order + dnn_final_output #[batch_size, num_classes]
        return torch.sigmoid(output).squeeze(1)


### 2. 构建数据集

In [48]:
class CustomDataset(Dataset):
    """Synthetic dataset of random numeric features, categorical indices and binary labels.

    Every item is a ``(numeric, categorical, label)`` triple of tensors.
    """

    def __init__(self, num_samples, feature_sizes):
        """
        num_samples: number of rows to generate
        feature_sizes: number of distinct values of each categorical field
        """
        self.num_samples = num_samples
        self.feature_sizes = feature_sizes

        # Numeric features: standard-normal floats, one column per field
        field_count = len(feature_sizes)
        self.num_data = torch.randn(num_samples, field_count).float()

        # Categorical features: one integer column per field, each drawn
        # from [0, cardinality) and then joined side by side
        columns = []
        for cardinality in feature_sizes:
            columns.append(torch.randint(0, cardinality, (num_samples, 1)))
        self.cat_data = torch.cat(columns, dim=1)

        # Labels: random 0/1 values stored as floats
        self.labels = torch.randint(0, 2, (num_samples,)).float()

    def __len__(self):
        """Number of generated samples."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the (numeric, categorical, label) tensors stored at ``idx``."""
        sample = (self.num_data[idx], self.cat_data[idx], self.labels[idx])
        return sample

### 3. 模型配置

In [49]:
# Experiment configuration: model, data and optimiser settings in one place
config = dict(
    feature_sizes=[10, 20, 5],    # number of distinct values per categorical field
    embedding_dim=16,             # embedding size
    hidden_dims=[128, 64, 32],    # widths of the DNN hidden layers
    dropout=0.3,                  # dropout rate
    batch_size=32,                # mini-batch size
    num_epochs=30,                # number of training epochs
    num_samples=1000,             # number of samples in the dataset
    learning_rate=0.005,          # learning rate
    weight_decay=0.001,           # L2 regularisation
)


In [50]:
# Seed the global PyTorch RNG before anything stochastic happens: the synthetic
# data, the weight initialisation and the DataLoader shuffling all draw from
# it, so without a seed every Restart & Run All gives a different result.
# An optional "seed" entry in config overrides the default.
torch.manual_seed(config.get("seed", 42))

# Build the dataset and its data loader
train_dataset = CustomDataset(config["num_samples"], config["feature_sizes"])
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model and move it to the selected device
model = DeepFM(
    feature_sizes=config['feature_sizes'],
    embedding_dim=config['embedding_dim'],
    hidden_dims=config['hidden_dims'],
    dropout=config['dropout']
).to(device)


# Loss function and optimiser. BCELoss expects probabilities, which matches
# the sigmoid output of DeepFM.forward.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"],weight_decay=config["weight_decay"])

### 4. 模型训练

In [51]:
from tqdm import tqdm  # progress bar for the epoch loop

# Training loop: one progress-bar tick per epoch
pbar = tqdm(range(config["num_epochs"]), desc="Training Progress", unit="epoch")
for epoch in pbar:
    model.train()
    running_loss = 0.0

    # The loader also yields the numeric features, but the model only consumes
    # the categorical indices, so they are not copied to the device.
    for _, categorical, labels in train_loader:
        categorical, labels = categorical.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(categorical)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average even if the last batch is short.
        running_loss += loss.item() * labels.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)

    # Show the epoch loss on the progress bar
    pbar.set_postfix(loss=epoch_loss)



Training Progress: 100%|██████████| 30/30 [00:02<00:00, 14.53epoch/s, loss=0.567]
