In [2]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

### 0.加载数据

In [119]:
data = pd.read_csv('./criteo_sample.txt')
data

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,,3,260.0,,17668.0,,,33.0,,...,e5ba7672,87c6f83c,,,0429f84b,,3a171ecb,c0d61a5c,,
1,0,,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,d4bb7bd8,6fc84bfb,,,5155d8a3,,be7c41b4,ded4aac9,,
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,e5ba7672,675c9258,,,2e01979f,,bcdee96c,6d5d1302,,
3,0,,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,e5ba7672,52e44668,,,e587c466,,32c7478e,3b183c5c,,
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,,32c7478e,0d4a6d1a,001f3601,92c878de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,,0,113.0,3.0,3036.0,575.0,2.0,3.0,214.0,...,07c540c4,9880032b,21ddcdc9,5840adea,34cc61bb,c9d4222a,32c7478e,e5ed7da2,ea9a246c,984e0db0
196,1,0.0,1,1.0,1.0,1607.0,12.0,1.0,12.0,15.0,...,1e88c74f,3972b4ed,,,d1aa4512,,32c7478e,9257f75f,,
197,1,1.0,0,6.0,3.0,0.0,0.0,19.0,3.0,3.0,...,3486227d,5aed7436,54591762,a458ea53,4a2c3526,,32c7478e,1793a828,e8b83407,1a02cbe1
198,0,0.0,22,6.0,22.0,203.0,153.0,80.0,18.0,508.0,...,3486227d,13145934,55dd3565,5840adea,bf647035,,32c7478e,1481ceb4,e8b83407,988b0775


In [4]:
sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

### 1.Label Encoding for sparse features,and do simple Transformation for dense features

In [5]:
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
    
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

### 2.count #unique features for each sparse field,and record dense feature field name

In [6]:
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                          for feat in sparse_features] + [DenseFeat(feat, 1, )
                                                          for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns)

In [118]:
## 特征名称
dnn_feature_columns

[SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C1', group_name='default_group'),
 SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C2', group_name='default_group'),
 SparseFeat(name='C3', vocabulary_size=172, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C3', group_name='default_group'),
 SparseFeat(name='C4', vocabulary_size=157, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C4', group_name='default_group'),
 SparseFeat(name='C5', vocabulary_size=12, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C5', group_name='default_group'),
 SparseFeat(name='C6', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C6', group_name='default_group'),
 SparseFeat(name='C7', vocabulary_size=183, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C7', group_name='default_group'),
 SparseFeat

In [120]:
feature_names

['C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'C22',
 'C23',
 'C24',
 'C25',
 'C26',
 'I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'I10',
 'I11',
 'I12',
 'I13']

### 3.generate input data for model

In [9]:
train, test = train_test_split(data, test_size=0.2)

In [125]:
train.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
85,1,0.0,0.101599,0.000355,0.0,0.026805,0.0,0.003322,0.0,0.0,...,4,109,5,1,152,0,0,78,1,17
151,0,0.0,0.07928,0.000355,0.011494,0.009104,0.025166,0.056478,0.326531,0.263056,...,4,85,18,3,91,3,2,41,18,71
197,1,0.027027,0.000333,0.002131,0.034483,0.0,0.0,0.063123,0.061224,0.002901,...,4,40,17,2,41,0,0,12,16,11
36,0,0.0,0.000666,0.004618,0.022989,0.117999,0.138651,0.0,0.040816,0.084139,...,0,69,18,2,33,0,0,35,17,24
127,0,0.0,0.0,0.0,0.0,0.003034,0.054606,0.056478,0.408163,0.266925,...,8,69,33,3,29,0,0,64,11,12


In [124]:
test.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
195,0,0.0,0.000333,0.040142,0.034483,0.005984,0.273029,0.006645,0.061224,0.206963,...,0,74,5,1,30,5,0,118,17,48
76,0,0.0,0.000666,0.0,0.0,0.05738,0.0,0.0,0.0,0.0,...,8,20,0,0,1,4,0,32,0,0
139,0,0.0,0.000666,0.002487,0.091954,0.008872,0.087369,0.006645,0.081633,0.17795,...,0,95,21,3,65,0,6,14,11,3
166,0,0.0,0.004997,0.005329,0.126437,0.008097,0.059354,0.013289,0.714286,0.10735,...,8,88,0,0,166,5,8,121,0,0
101,0,0.0,0.000333,0.001421,0.0,0.022735,0.0,0.0,0.0,0.000967,...,1,116,5,2,62,3,0,108,10,43


In [104]:
# 输入类型为字典类型:{特征１：值，特征２：值}
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

In [98]:
isinstance (train_model_input, dict)

True

In [121]:
train_model_input

{'C1': 85     11
 151    25
 197     0
 36     11
 127    11
        ..
 35      0
 40     24
 64     16
 154     0
 165     0
 Name: C1, Length: 160, dtype: int64, 'C2': 85      3
 151    50
 197     5
 36     13
 127    13
        ..
 35     30
 40     25
 64     32
 154     6
 165    59
 Name: C2, Length: 160, dtype: int64, 'C3': 85      77
 151    160
 197    153
 36     130
 127    107
       ... 
 35      61
 40      34
 64      50
 154     82
 165    146
 Name: C3, Length: 160, dtype: int64, 'C4': 85      49
 151      6
 197    143
 36     155
 127     93
       ... 
 35     136
 40      62
 64      94
 154    128
 165    117
 Name: C4, Length: 160, dtype: int64, 'C5': 85     1
 151    1
 197    1
 36     5
 127    1
       ..
 35     9
 40     6
 64     1
 154    5
 165    1
 Name: C5, Length: 160, dtype: int64, 'C6': 85     6
 151    2
 197    0
 36     0
 127    3
       ..
 35     4
 40     3
 64     5
 154    5
 165    4
 Name: C6, Length: 160, dtype: int64, 'C7': 85     13

In [117]:
x = [train_model_input[feature] for feature in feature_names]
x[1]

85      3
151    50
197     5
36     13
127    13
       ..
35     30
40     25
64     32
154     6
165    59
Name: C2, Length: 160, dtype: int64

In [115]:
len(x)

39

In [108]:
# 切分数据
for array in x:
    print(array)
    print("切分数据！")
    print(array[100:102])
    break

85     11
151    25
197     0
36     11
127    11
       ..
35      0
40     24
64     16
154     0
165     0
Name: C1, Length: 160, dtype: int64
切分数据！
138    11
161     0
Name: C1, dtype: int64


In [109]:
for i in range(len(x)):
    if len(x[i].shape) == 1:
        x[i] = np.expand_dims(x[i], axis=1)

In [113]:
x[1].shape

(160, 1)

In [88]:
import numpy as np
X = np.concatenate(x, axis=-1)
X.shape

(160, 39)

### 4.Define Model,train,predict and evaluate

In [24]:
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
#                task='binary',
#                l2_reg_embedding=1e-5, device=device)
model = WDL(linear_feature_columns, dnn_feature_columns)

model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )
model.fit(train_model_input,train[target].values,batch_size=32,epochs=10,verbose=2,validation_split=0.0)

pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

cuda ready...
cpu
Train on 160 samples, validate on 0 samples, 5 steps per epoch
Epoch 1/10
0s - loss:  0.5990 - binary_crossentropy:  0.5990 - auc:  0.6504
Epoch 2/10
0s - loss:  0.4579 - binary_crossentropy:  0.4579 - auc:  0.9717
Epoch 3/10
0s - loss:  0.2952 - binary_crossentropy:  0.2952 - auc:  0.9985
Epoch 4/10
0s - loss:  0.1802 - binary_crossentropy:  0.1802 - auc:  0.9990
Epoch 5/10
0s - loss:  0.1349 - binary_crossentropy:  0.1349 - auc:  1.0000
Epoch 6/10
0s - loss:  0.1112 - binary_crossentropy:  0.1112 - auc:  1.0000
Epoch 7/10
0s - loss:  0.0968 - binary_crossentropy:  0.0968 - auc:  1.0000
Epoch 8/10
0s - loss:  0.0839 - binary_crossentropy:  0.0839 - auc:  1.0000
Epoch 9/10
0s - loss:  0.0738 - binary_crossentropy:  0.0738 - auc:  1.0000
Epoch 10/10
0s - loss:  0.0663 - binary_crossentropy:  0.0663 - auc:  1.0000

test LogLoss 1.0295
test AUC 0.4265


In [25]:
model

WDL(
  (embedding_dict): ModuleDict(
    (C1): Embedding(27, 4)
    (C10): Embedding(142, 4)
    (C11): Embedding(173, 4)
    (C12): Embedding(170, 4)
    (C13): Embedding(166, 4)
    (C14): Embedding(14, 4)
    (C15): Embedding(170, 4)
    (C16): Embedding(168, 4)
    (C17): Embedding(9, 4)
    (C18): Embedding(127, 4)
    (C19): Embedding(44, 4)
    (C2): Embedding(92, 4)
    (C20): Embedding(4, 4)
    (C21): Embedding(169, 4)
    (C22): Embedding(6, 4)
    (C23): Embedding(10, 4)
    (C24): Embedding(125, 4)
    (C25): Embedding(20, 4)
    (C26): Embedding(90, 4)
    (C3): Embedding(172, 4)
    (C4): Embedding(157, 4)
    (C5): Embedding(12, 4)
    (C6): Embedding(7, 4)
    (C7): Embedding(183, 4)
    (C8): Embedding(19, 4)
    (C9): Embedding(2, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (C1): Embedding(27, 1)
      (C10): Embedding(142, 1)
      (C11): Embedding(173, 1)
      (C12): Embedding(170, 1)
      (C13): Embedding(166, 1)
      (C14): Embeddi

In [26]:
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
               task='binary',
               l2_reg_embedding=1e-5, device=device)

model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )
model.fit(train_model_input,train[target].values,batch_size=32,epochs=10,verbose=2,validation_split=0.0)

pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

cuda ready...
cuda:0
Train on 160 samples, validate on 0 samples, 5 steps per epoch
Epoch 1/10
0s - loss:  0.6154 - binary_crossentropy:  0.6154 - auc:  0.5331
Epoch 2/10
0s - loss:  0.4784 - binary_crossentropy:  0.4784 - auc:  0.9588
Epoch 3/10
0s - loss:  0.3269 - binary_crossentropy:  0.3269 - auc:  0.9954
Epoch 4/10
0s - loss:  0.1571 - binary_crossentropy:  0.1571 - auc:  1.0000
Epoch 5/10
0s - loss:  0.0810 - binary_crossentropy:  0.0810 - auc:  1.0000
Epoch 6/10
0s - loss:  0.0542 - binary_crossentropy:  0.0542 - auc:  1.0000
Epoch 7/10
0s - loss:  0.0386 - binary_crossentropy:  0.0386 - auc:  1.0000
Epoch 8/10
0s - loss:  0.0278 - binary_crossentropy:  0.0278 - auc:  1.0000
Epoch 9/10
0s - loss:  0.0220 - binary_crossentropy:  0.0220 - auc:  1.0000
Epoch 10/10
0s - loss:  0.0175 - binary_crossentropy:  0.0174 - auc:  1.0000

test LogLoss 1.16
test AUC 0.3978


In [28]:
model

DeepFM(
  (embedding_dict): ModuleDict(
    (C1): Embedding(27, 4)
    (C10): Embedding(142, 4)
    (C11): Embedding(173, 4)
    (C12): Embedding(170, 4)
    (C13): Embedding(166, 4)
    (C14): Embedding(14, 4)
    (C15): Embedding(170, 4)
    (C16): Embedding(168, 4)
    (C17): Embedding(9, 4)
    (C18): Embedding(127, 4)
    (C19): Embedding(44, 4)
    (C2): Embedding(92, 4)
    (C20): Embedding(4, 4)
    (C21): Embedding(169, 4)
    (C22): Embedding(6, 4)
    (C23): Embedding(10, 4)
    (C24): Embedding(125, 4)
    (C25): Embedding(20, 4)
    (C26): Embedding(90, 4)
    (C3): Embedding(172, 4)
    (C4): Embedding(157, 4)
    (C5): Embedding(12, 4)
    (C6): Embedding(7, 4)
    (C7): Embedding(183, 4)
    (C8): Embedding(19, 4)
    (C9): Embedding(2, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (C1): Embedding(27, 1)
      (C10): Embedding(142, 1)
      (C11): Embedding(173, 1)
      (C12): Embedding(170, 1)
      (C13): Embedding(166, 1)
      (C14): Embe