In [1]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

# 1. Load Dataset

demo先直接pandas读。
数据量大时，应该会分成多个文件，这时"统计词表"，sparse、dense特征以及特征域的维度都应以config形式传入；
然后直接根据词表构建embedding就行；

## 1.1 统计历史词表
这部分后面应提出主程序之外，磁盘上事先写好 word2idx, idx2word 两个表，主程序直接读取即可。  
这里使用5月15日的raw表

In [11]:
import numpy as np
import pandas as pd

feat2idx =  {'user_id': (0, 1),
             'keyword': (1, 2),
             'sequence_keyword': (2, 3),
             'search_source': (3, 4),
             'session_id': (4, 5),
             'item_id': (5, 6),
             'show_cnt': (6, 7),
             'click_cnt': (7, 8),
             'play_cnt': (8, 9),
             'like_cnt': (9, 10),
             'follow_cnt': (10, 11),
             'long_view_cnt': (11, 12),
             'short_view_cnt': (12, 13),
             'first_click': (13, 14),
             'last_click': (14, 15),
             'first_view': (15, 16),
             'last_view': (16, 17),
             'skip': (17, 18),
             'exam': (18, 19),
             'play_duration': (19, 20),
             'slide_show': (20, 21),
             'slide_click': (21, 22),
             'pos': (22, 23),
             'atlas_view_cnt': (23, 24),
             'download_cnt': (24, 25),
             'feed_model': (25, 26),
             'p_date': (26, 27),
             'product': (27, 28)}


sparse_features = ['user_id', 'keyword', 'sequence_keyword', 'search_source', 'session_id', 'item_id', 
                   'first_click', 'last_click', 'first_view', 'last_view',
                   'pos', 'feed_model', 'p_date', 'product']

dense_features = ['show_cnt', 'click_cnt', 'play_cnt', 'like_cnt', 'follow_cnt', 'long_view_cnt', 
                  'short_view_cnt', 'slide_show', 'slide_click', 'atlas_view_cnt']

data = pd.read_csv("data/raw/20210516", sep="\t", dtype={feat: str for feat in sparse_features}) 
## 缺失值处理
for feat in sparse_features + dense_features:
    if feat in sparse_features:
        data[feat] = data[feat].fillna("")
    else:
        data[feat] = data[feat].fillna(0)

## 离散特征编码
sparse_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
data[sparse_features] = sparse_encoder.fit_transform(data[sparse_features])

sparse_feature_info = {}
for fname, word_list in zip(sparse_features, sparse_encoder.categories_):
    vocab = {word: i for i, word in enumerate(np.concatenate((word_list, ["__UNK__"])))}
    sparse_feature_info[fname] = (feat2idx[fname], vocab)
dense_feature_info = {}
for fname in dense_features:
    dense_feature_info[fname] = feat2idx[fname]
    
# encoder_dict = {fname: {word: i
#                           for i, word in enumerate(
#                               np.concatenate((word_list, ["__UNK__"])))} # vocab中添加 __UNK__ token
#                  for fname, word_list in zip(sparse_features, sparse_encoder.categories_)}
# idx_encoder = [[feat2idx[name], vocab] for name, vocab in encoder_dict.items()]

# ## 连续特征归一化
# dense_encoder = MinMaxScaler(feature_range=(0,1))
# data[dense_features] = dense_encoder.fit_transform(data[dense_features])

## 1.2 制作 iterable dataset
输入: sparse_feature_info = {feature_name: (index, vocab), }

In [3]:
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, IterableDataset

def raw_iterator(files, sparse_feature_info):
    for file in files:
        with open(file, 'r') as f:
            feat_names = f.readline()
            lines = f.readlines()
        for line in lines:
            row = np.array(line.strip().split('\t'))
            # encoder
            # row[sparse_idx] = [vocab[row[i]] if row[i] in vocab else vocab["__UNK__"]
            #                    for i, vocab in idx_encoder]
            for feat, (index, vocab) in sparse_feature_info.items():
                idxs = list(range(*index))
                row[idxs] = [vocab[row[i]] if row[i] in vocab else vocab["__UNK__"]
                             for i in idxs]
            row = row.astype(float)
            yield row


class Raw_Dataset(IterableDataset):
    def __init__(self, iterator, label_idx):
        self.iterator = iterator
        self.label_idx = label_idx
    def __iter__(self):
        for x in self.iterator:
            y = np.array([1]) if x[self.label_idx] > 0 else np.array([0])
            yield {"features": x, "label": y}

iterator = raw_iterator(["data/raw/20210516"], sparse_feature_info)
ds = Raw_Dataset(iterator, feat2idx["click_cnt"][0])
loader = DataLoader(ds, batch_size=32)

# 2. 模型定义

## 2.1 feature_column定义
会用于
1. sparse feature 的 embedding 表定义
2. 模型 forward 的最开始时，从输入的 batch 中, 根据 feature_column 的下标分别抽出 wide_feature 和 deep_feature 对应的 tensor 列

In [20]:
from collections import OrderedDict, namedtuple, defaultdict

DEFAULT_GROUP_NAME = "default_group"

class SparseFeat(namedtuple('SparseFeat',
                            ['name', 'index', 'vocabulary_size', 'embedding_dim', 'dimension','sparse_embedding', 'dtype', 'embedding_name',
                             'group_name'])):
    __slots__ = ()

    def __new__(cls, name, index, vocabulary_size, embedding_dim=4, sparse_embedding=False, dtype="int32",
                embedding_name=None, group_name=DEFAULT_GROUP_NAME):
        if embedding_name is None:
            embedding_name = name
        if embedding_dim == "auto":
            embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
        return super(SparseFeat, cls).__new__(cls, name, index, vocabulary_size, embedding_dim, embedding_dim,
                                              sparse_embedding, dtype, embedding_name, group_name)


class VarLenSparseFeat(namedtuple('VarLenSparseFeat',
                                  ['sparsefeat', 'maxlen', 'index', 'combiner', 'length_name'])):
    __slots__ = ()

    def __new__(cls, sparsefeat, maxlen, index, combiner="mean", length_name=None):
        return super(VarLenSparseFeat, cls).__new__(cls, sparsefeat, maxlen, index, combiner, length_name)

    @property
    def name(self):
        return self.sparsefeat.name

    @property
    def vocabulary_size(self):
        return self.sparsefeat.vocabulary_size
    
    @property
    def sparse_embedding(self):
        return self.sparsefeat.sparse_embedding

    @property
    def embedding_dim(self):
        return self.sparsefeat.embedding_dim

    @property
    def dtype(self):
        return self.sparsefeat.dtype

    @property
    def embedding_name(self):
        return self.sparsefeat.embedding_name

class DenseFeat(namedtuple('DenseFeat', ['name', 'index', 'dimension', 'dtype'])):
    __slots__ = ()

    def __new__(cls, name, index, dimension=1, dtype="float32"):
        return super(DenseFeat, cls).__new__(cls, name, index, dimension, dtype)

    def __hash__(self):
        return self.name.__hash__()

In [31]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
from sklearn.metrics import *
from torch.utils.data import DataLoader
from tqdm import tqdm

def create_embedding_matrix(feature_columns, init_std=0.0001, linear=False, sparse=False, device='cpu'):
    # Return nn.ModuleDict: for sparse features, {embedding_name: nn.Embedding}
    # for varlen sparse features, {embedding_name: nn.EmbeddingBag}
    sparse_feature_columns = list(
        filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if len(feature_columns) else []

    varlen_sparse_feature_columns = list(
        filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if len(feature_columns) else []

    embedding_dict = nn.ModuleDict(
        {feat.embedding_name: nn.Embedding(feat.vocabulary_size, feat.embedding_dim if not linear else 1, sparse=sparse)
         for feat in
         sparse_feature_columns + varlen_sparse_feature_columns}
    )

    # for feat in varlen_sparse_feature_columns:
    #     embedding_dict[feat.embedding_name] = nn.EmbeddingBag(
    #         feat.dimension, embedding_size, sparse=sparse, mode=feat.combiner)

    for tensor in embedding_dict.values():
        nn.init.normal_(tensor.weight, mean=0, std=init_std)

    return embedding_dict.to(device)

def activation_layer(act_name):
    if isinstance(act_name, str):
        if act_name.lower() == 'sigmoid':
            act_layer = nn.Sigmoid()
        elif act_name.lower() == 'linear':
            act_layer = Identity()
        elif act_name.lower() == 'relu':
            act_layer = nn.ReLU(inplace=True)
        elif act_name.lower() == 'prelu':
            act_layer = nn.PReLU()
    elif issubclass(act_name, nn.Module):
        act_layer = act_name()
    else:
        raise NotImplementedError

    return act_layer

class DNN(nn.Module):

    def __init__(self, inputs_dim, hidden_units, activation='relu', dropout_rate=0, init_std=0.0001):
        super(DNN, self).__init__()
        self.dropout = nn.Dropout(dropout_rate)
        hidden_units = [inputs_dim] + list(hidden_units)

        self.linears = nn.ModuleList(
            [nn.Linear(hidden_units[i], hidden_units[i + 1]) for i in range(len(hidden_units) - 1)])

        self.activation_layers = nn.ModuleList(
            [activation_layer(activation) for i in range(len(hidden_units) - 1)])

        for name, tensor in self.linears.named_parameters():
            if 'weight' in name:
                nn.init.normal_(tensor, mean=0, std=init_std)

    def forward(self, inputs):
        x = inputs
        for i in range(len(self.linears)):
            x = self.linears[i](x)
            x = self.activation_layers[i](x)
            x = self.dropout(x)
        return x


class Linear(nn.Module):
    def __init__(self, sparse_feature_columns, dense_feature_columns, 
                 init_std=0.0001, device='cpu'):
        super(Linear, self).__init__()
        self.sparse_feature_columns = sparse_feature_columns
        self.dense_feature_columns = dense_feature_columns

        self.embedding_dict = create_embedding_matrix(sparse_feature_columns)

        for tensor in self.embedding_dict.values():
            nn.init.normal_(tensor.weight, mean=0, std=init_std)

        if len(self.dense_feature_columns) > 0:
            self.weight = nn.Parameter(torch.Tensor(sum(fc.dimension for fc in self.dense_feature_columns), 1).to(
                device))
            torch.nn.init.normal_(self.weight, mean=0, std=init_std)

    def forward(self, X):
        
        dense_value_list = [inputs[:, feat.index[0]:feat.index[1]] 
                          for feat in self.dense_feature_columns]

        sparse_embedding_list = [
            self.embedding_dict[feat.name]( # 取出 embedding 表
                inputs[:, feat.index[0]:feat.index[1]])
            for feat in self.sparse_feature_columns]
        
#         sparse_embedding_list = [self.embedding_dict[feat.embedding_name](
#             X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]].long()) for
#             feat in self.sparse_feature_columns]

#         dense_value_list = [X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]] for feat in
#                             self.dense_feature_columns]

        linear_logit = torch.zeros([X.shape[0], 1]).to(sparse_embedding_list[0].device)
        if len(sparse_embedding_list) > 0:
            sparse_embedding_cat = torch.cat(sparse_embedding_list, dim=-1)
            sparse_feat_logit = torch.sum(sparse_embedding_cat, dim=-1, keepdim=False)
            linear_logit += sparse_feat_logit
        if len(dense_value_list) > 0:
            dense_value_logit = torch.cat(
                dense_value_list, dim=-1).matmul(self.weight)
            linear_logit += dense_value_logit

        return linear_logit

    
class FM(nn.Module):

    def __init__(self):
        super(FM, self).__init__()

    def forward(self, inputs):
        fm_input = inputs

        square_of_sum = torch.pow(torch.sum(fm_input, dim=1, keepdim=True), 2)
        sum_of_square = torch.sum(fm_input * fm_input, dim=1, keepdim=True)
        cross_term = square_of_sum - sum_of_square
        cross_term = 0.5 * torch.sum(cross_term, dim=2, keepdim=False)

        return cross_term


class DeepFM(nn.Module):
    def __init__(self, sparse_feature_columns, dense_feature_columns,
                 dnn_hidden_units=(256, 128), dnn_activation='ReLU', dnn_dropout=0,
                 init_std=0.0001, seed=1173, device='cpu'):
        super(DeepFM, self).__init__()
        torch.manual_seed(seed)
        self.sparse_feature_columns = sparse_feature_columns
        self.dense_feature_columns = dense_feature_columns
        
        self.embedding_dict = create_embedding_matrix(sparse_feature_columns)
        # 正则项...
        
        # DNN
        dnn_input_dim = sum(feat.dimension for feat in sparse_feature_columns + dense_feature_columns)
        self.dnn = DNN(dnn_input_dim, dnn_hidden_units,
               activation=dnn_activation, dropout_rate=dnn_dropout)
        
        # FM
        self.linear = Linear(sparse_feature_columns, dense_feature_columns)
        self.fm = FM()
        
        self.to(device)
    
    def split_tensor(self, inputs):

        dense_value_list = [inputs[:, feat.index[0]:feat.index[1]] 
                          for feat in self.dense_feature_columns]

        sparse_embedding_list = [
            self.embedding_dict[feat.name]( # 取出 embedding 表
                inputs[:, feat.index[0]:feat.index[1]])
            for feat in self.sparse_feature_columns]

        return sparse_embedding_list, dense_value_list
    
    def forward(self, inputs):
        sparse_embedding_list, dense_value_list = self.split_tensor(inputs)
        return sparse_embedding_List, dense_value_list
#         level1_logit = self.linear_model(X)

#         if len(sparse_embedding_list) > 0:
#             fm_input = torch.cat(sparse_embedding_list, dim=1)
#             logit += self.fm(fm_input)

#         dnn_input = combined_dnn_input(
#             sparse_embedding_list, dense_value_list)
#         dnn_output = self.dnn(dnn_input)
#         dnn_logit = self.dnn_linear(dnn_output)
#         logit += dnn_logit

#         y_pred = self.out(logit)

#         return y_pred

In [37]:
sparse_feature_columns = [SparseFeat(name, index, len(vocab), 4, False)
                          for name, (index, vocab) in sparse_feature_info.items()]
dense_feature_columns = [DenseFeat(name, index) for name, index in dense_feature_info.items()]

model = DeepFM(sparse_feature_columns, dense_feature_columns)

In [40]:
name, (index, vocab) = list(sparse_feature_info.items())[0]
name

'user_id'

In [33]:
for inputs in loader:
    sparse, dense = model(inputs)
    break

TypeError: unhashable type: 'slice'