In [1]:
import os
import csv

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, IterableDataset

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn import metrics as sk_metrics
import numpy as np
import pandas as pd

from components.feature import *
from components.dataset import raw_iterator, RawDataset
from components.deepfm import DeepFM


feat2idx =  {'user_id': (0, 1),
             'keyword': (1, 2),
             'sequence_keyword': (2, 3),
             'search_source': (3, 4),
             'session_id': (4, 5),
             'item_id': (5, 6),
             'show_cnt': (6, 7),
             'click_cnt': (7, 8),
             'play_cnt': (8, 9),
             'like_cnt': (9, 10),
             'follow_cnt': (10, 11),
             'long_view_cnt': (11, 12),
             'short_view_cnt': (12, 13),
             'first_click': (13, 14),
             'last_click': (14, 15),
             'first_view': (15, 16),
             'last_view': (16, 17),
             'skip': (17, 18),
             'exam': (18, 19),
             'play_duration': (19, 20),
             'slide_show': (20, 21),
             'slide_click': (21, 22),
             'pos': (22, 23),
             'atlas_view_cnt': (23, 24),
             'download_cnt': (24, 25),
             'feed_model': (25, 26),
             'p_date': (26, 27),
             'product': (27, 28)}


sparse_features = ['user_id', 'keyword', 'sequence_keyword', 'search_source', 'session_id', 'item_id',
                   'first_click', 'last_click', 'first_view', 'last_view',
                   'pos', 'feed_model', 'p_date', 'product']

dense_features = ['show_cnt', 'click_cnt', 'play_cnt', 'like_cnt', 'follow_cnt', 'long_view_cnt',
                  'short_view_cnt', 'slide_show', 'slide_click', 'atlas_view_cnt']

sparse_embedding_feature = set(["item_id", "user_id", "keyword", "session_id", "pos"])

data = pd.read_csv("data/raw/20210516", sep="\t", dtype={feat: str for feat in sparse_features},
                   error_bad_lines=False, quoting=csv.QUOTE_NONE, encoding='utf-8')
## 缺失值处理
for feat in sparse_features + dense_features:
    if feat in sparse_features:
        data[feat] = data[feat].fillna("")
    else:
        data[feat] = data[feat].fillna(0)

## 离散特征编码
sparse_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
data[sparse_features] = sparse_encoder.fit_transform(data[sparse_features])

sparse_feature_info = {}
for fname, word_list in zip(sparse_features, sparse_encoder.categories_):
    vocab = {word: i for i, word in enumerate(np.concatenate((word_list, ["__OOV__"])))}
    sparse_feature_info[fname] = {'index': feat2idx[fname],
                                  'vocab': vocab,
                                  'is_sparse': False if fname not in sparse_embedding_feature else True}
dense_feature_info = {}
for fname in dense_features:
    dense_feature_info[fname] = feat2idx[fname]
# ------------------------------------------------------------------------------------------

In [2]:
import json
feat2idx =  {'user_id': (0, 1),
             'keyword': (1, 2),
             'sequence_keyword': (2, 3),
             'search_source': (3, 4),
             'session_id': (4, 5),
             'item_id': (5, 6),
             'show_cnt': (6, 7),
             'click_cnt': (7, 8),
             'play_cnt': (8, 9),
             'like_cnt': (9, 10),
             'follow_cnt': (10, 11),
             'long_view_cnt': (11, 12),
             'short_view_cnt': (12, 13),
             'first_click': (13, 14),
             'last_click': (14, 15),
             'first_view': (15, 16),
             'last_view': (16, 17),
             'skip': (17, 18),
             'exam': (18, 19),
             'play_duration': (19, 20),
             'slide_show': (20, 21),
             'slide_click': (21, 22),
             'pos': (22, 23),
             'atlas_view_cnt': (23, 24),
             'download_cnt': (24, 25),
             'feed_model': (25, 26),
             'p_date': (26, 27),
             'product': (27, 28)}
for k, v in sparse_feature_info.items():
    print(f"{k:20}", len(v['vocab']))
json.dump(sparse_feature_info, open("data/vocab/20210516_sparse", 'w'), indent=2)
json.dump(dense_feature_info, open("data/vocab/20210516_dense", 'w'), indent=2)
json.dump(feat2idx, open("data/vocab/feat2idx", 'w'))

user_id              109962
keyword              189442
sequence_keyword     2
search_source        21
session_id           339478
item_id              3009648
first_click          3
last_click           3
first_view           3
last_view            3
pos                  886
feed_model           4
p_date               2
product              2


In [4]:
import json
import os
import csv

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, IterableDataset, BufferedShuffleDataset

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn import metrics as sk_metrics
import numpy as np
import pandas as pd

from components.feature import *
from components.dataset import raw_iterator, RawDataset
from components.deepfm import DeepFM

sparse_feature_info = json.load(open("data/vocab/20210516_sparse", 'r'))
dense_feature_info = json.load(open("data/vocab/20210516_dense", 'r'))
feat2idx = json.load(open("data/vocab/feat2idx", 'r'))
sparse_embedding_feature = set(["item_id", "user_id", "keyword", "session_id", "pos"])

In [5]:
files = [os.path.join("data/raw", file) for file in os.listdir("data/raw") if file.startswith("20210516_")]
ds = RawDataset(files, sparse_feature_info, feat2idx["click_cnt"][0])
ds =  BufferedShuffleDataset(ds, 10000)
loader = DataLoader(ds, batch_size=320, num_workers=3)

sparse_feature_columns = [SparseFeat(name, v['index'], len(v['vocab']), 4, v['is_sparse'])
                          for name, v in sparse_feature_info.items()]
dense_feature_columns = [DenseFeat(name, index) for name, index in dense_feature_info.items()]

model = DeepFM(sparse_feature_columns, dense_feature_columns)

In [7]:
import time
sparse_embedding_params = [list(getattr(model.embedding_dict, fname).parameters())[0]
                           for fname in sparse_embedding_feature]
dense_embedding_params = [list(getattr(model.embedding_dict, fname).parameters())[0]
                          for fname in sparse_feature_info if fname not in sparse_embedding_feature]
optim1 = torch.optim.SparseAdam(sparse_embedding_params)
optim2 = torch.optim.Adam(dense_embedding_params)
metric = sk_metrics.roc_auc_score
loss_func = F.binary_cross_entropy

i = 0
pre = time.time()
for batch in loader:
    inputs, y = batch['features'], batch['label'].squeeze()

    y_pred = model(inputs).squeeze()
    optim1.zero_grad()
    optim2.zero_grad()
    loss = loss_func(y_pred, y,reduction='sum')
    loss.backward()
    optim1.step()
    optim2.step()
    i += 1
    if i % 50 == 0:
        now = time.time()
        print(f"loss: {loss.item():<20}, cost time: {now-pre:<20}")
        pre = now
    # cpu: 大概15秒一个batch, 320个样本, 500w行大概跑78分钟
    # gpu: torch.cuda.is_available() = False?

loss:      192.43115234375, cost time: 17.666489124298096
loss:   160.72447204589844, cost time: 15.696096181869507
loss:    141.7527618408203, cost time: 13.71559762954712
loss:   119.23631286621094, cost time: 14.293400764465332
loss:    97.63081359863281, cost time: 17.603123664855957


KeyboardInterrupt: 

In [15]:
ds_test = RawDataset(["data/raw/20210517"], sparse_feature_info, feat2idx["click_cnt"][0])
loader_test = DataLoader(ds_test, batch_size=1000, num_workers=0)
for batch in loader_test:
    inputs, y = batch['features'], batch['label'].squeeze()
    y_pred = model(inputs).squeeze()
    auc = metric(y.cpu().unsqueeze(dim=-1).data.numpy(), y_pred.cpu().unsqueeze(dim=-1).data.numpy())
    print(f"auc: {auc}")

auc: 0.7452675722726655
auc: 0.841092192053996
auc: 0.7003917125086049
auc: 0.889619487045374
auc: 0.8051593753079122
auc: 0.6835772357723577
auc: 0.7735688671969556
auc: 0.7829902366521595
auc: 0.8357759999999999
auc: 0.8384225217864925
