In [47]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [11]:
# modify data

In [35]:
# load  data
import pandas as pd
# users

# items
sparse_features = ["keyword", "item_id", 'product', 'period', 'dtype', 'p_date']
dense_features = ["show_cnt", 'click_cnt', 'play_cnt', 'like_cnt',
       'follow_cnt', 'long_view_cnt', 'short_view_cnt', 'first_click_cnt',
       'last_click_cnt', 'first_view_long_cnt', 'last_view_long_cnt',
       'skip_cnt', 'exam_cnt', 'slide_show', 'slide_click']
qp = pd.read_csv("data/qp", sep="\t")
qp[sparse_feature].fillna(-1,)
qp[dense_feature].fillna(0,)
qp[sparse_feature]

# label
def get_xtr(x):
    if x["long_view_cnt"] > 0:
        return 1
    return 0
qp['has_long_view'] = qp.apply(get_xtr, axis=1)
data = qp
# rawdf = pd.read_csv("data/raw")

In [36]:
# preprocessing
for feat in sparse_features:
    lbe = LabelEncoder() # 相当于制作词表. fit_transform(word)->idx, inverse_transform(idx)->word
    data[feat] = lbe.fit_transform(data[feat])

mms = MinMaxScaler(feature_range=(0,1))
data[dense_features] = mms.fit_transform(data[dense_features])

In [40]:
# Generate feature column
target = ['has_long_view']

sparse_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4)
                       for i,feat in enumerate(sparse_features)]

dense_feature_columns = [DenseFeat(feat, 1, )
                          for feat in dense_features]

deep_feature_columns = sparse_feature_columns + dense_feature_columns
wide_feature_columns = sparse_feature_columns

feature_names = get_feature_names(deep_feature_columns + wide_feature_columns)

In [43]:
# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)

train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

In [50]:
# 4.Define Model,train,predict and evaluate
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(linear_feature_columns=wide_feature_columns, dnn_feature_columns=deep_feature_columns,
               task='binary',
               l2_reg_embedding=1e-5, device=device)

model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )
model.fit(train_model_input,train[target].values,batch_size=32,epochs=10,verbose=2,validation_split=0.0)

pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

cpu
Train on 786 samples, validate on 0 samples, 25 steps per epoch
Epoch 1/10
1s - loss:  0.6568 - binary_crossentropy:  0.6560 - auc:  0.9132
Epoch 2/10
1s - loss:  0.6152 - binary_crossentropy:  0.6130 - auc:  0.9950
Epoch 3/10
1s - loss:  0.4089 - binary_crossentropy:  0.4060 - auc:  0.9998
Epoch 4/10
1s - loss:  0.1309 - binary_crossentropy:  0.1300 - auc:  1.0000
Epoch 5/10
1s - loss:  0.0306 - binary_crossentropy:  0.0305 - auc:  1.0000
Epoch 6/10
1s - loss:  0.0117 - binary_crossentropy:  0.0116 - auc:  1.0000
Epoch 7/10
1s - loss:  0.0064 - binary_crossentropy:  0.0064 - auc:  1.0000
Epoch 8/10
1s - loss:  0.0042 - binary_crossentropy:  0.0042 - auc:  1.0000
Epoch 9/10
1s - loss:  0.0031 - binary_crossentropy:  0.0031 - auc:  1.0000
Epoch 10/10
1s - loss:  0.0024 - binary_crossentropy:  0.0024 - auc:  1.0000

test LogLoss 0.4942
test AUC 0.9215
