In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import  SparseFeat, DenseFeat,get_feature_names
import torch
# Read the sample file into a DataFrame.
data = pd.read_csv('./criteo_sample.txt')

# Column names: sparse fields C1..C26, dense fields I1..I13, and the label.
sparse_features = ['C{}'.format(idx) for idx in range(1, 27)]
dense_features = ['I{}'.format(idx) for idx in range(1, 14)]
target = ['label']

# Missing values become the string '-1' in sparse columns and 0 in dense ones.
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)

In [4]:
# Replace each sparse column with integer codes; a fresh LabelEncoder is
# fitted for every column.
for column in sparse_features:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])

In [5]:
# Rescale the dense columns into the range [0, 1]. The fitted scaler stays
# available as `mms`.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])

In [6]:
# One SparseFeat per sparse column, sized by that column's number of
# distinct values.
sparse_feature_columns = []
for feat in sparse_features:
    sparse_feature_columns.append(SparseFeat(feat, data[feat].nunique()))

# One DenseFeat of dimension 1 per dense column.
dense_feature_columns = []
for feat in dense_features:
    dense_feature_columns.append(DenseFeat(feat, 1))

In [7]:
# Hashed variant of the sparse columns: every field gets the same fixed
# `dimension` instead of its own distinct-value count.
# The size is passed as an int: `1e6` is a float literal, and a vocabulary /
# bucket count has to be an integer (presumably it ends up as the row count of
# an embedding table -- TODO confirm against the installed deepctr_torch).
sparse_feature_columns = [SparseFeat(feat, dimension=int(1e6), use_hash=True)
                          for feat in sparse_features]  # The dimension can be set according to data
dense_feature_columns = [DenseFeat(feat, 1)
                         for feat in dense_features]

In [8]:
# The linear part and the DNN part are fed the same columns; each name is
# bound to its own list object.
linear_feature_columns = sparse_feature_columns + dense_feature_columns
dnn_feature_columns = sparse_feature_columns + dense_feature_columns

# Names of the model inputs, derived from both column lists together.
feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns
)


In [None]:
# Hold out 20% of the rows for testing. The split is seeded so the train/test
# partition is the same on every run; without `random_state` each execution
# shuffles differently and the reported numbers cannot be reproduced.
train, test = train_test_split(data, test_size=0.2, random_state=2020)

# The model takes a dict mapping each feature name to its column.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# Use the first GPU when CUDA is requested and available, otherwise the CPU.
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'])

# validation_split=0.2 asks fit() to keep part of the training rows for
# validation; predictions are then made on the held-out test rows.
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2)
pred_ans = model.predict(test_model_input, batch_size=256)

In [1]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
import torch

if __name__ == "__main__":
    # End-to-end run: load the sample file, preprocess it, train DeepFM and
    # print test-set log loss and AUC.
    data = pd.read_csv('./criteo_sample.txt')

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    # Missing values: the string '-1' for sparse columns, 0 for dense columns.
    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)
    target = ['label']

    # 1. Label-encode the sparse features and min-max scale the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field and record the dense
    #    feature field names
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                              for feat in sparse_features] + [DenseFeat(feat, 1)
                                                              for feat in dense_features]

    # Both model parts share the very same list object.
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model.
    # The split is seeded so the train/test partition -- and with it the
    # printed test metrics -- is the same on every run instead of changing
    # with each execution.
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, train, predict and evaluate
    device = 'cpu'
    use_cuda = False
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns, task='binary',
                   l2_reg_embedding=1e-5, device=device)

    model.compile("adagrad", "binary_crossentropy",
                  metrics=["binary_crossentropy", "auc"])
    model.fit(train_model_input, train[target].values,
              batch_size=256, epochs=10, validation_split=0.2, verbose=2)

    # Predict on the held-out rows with a batch size of 256 and report metrics.
    pred_ans = model.predict(test_model_input, 256)
    print("")
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

cpu
Train on 128 samples, validate on 32 samples, 1 steps per epoch
Epoch 1/10
1s - loss:  0.6975 - binary_crossentropy:  0.6975 - auc:  0.4355 - val_binary_crossentropy:  0.6656 - val_auc:  0.4286
Epoch 2/10
0s - loss:  0.6088 - binary_crossentropy:  0.6088 - auc:  0.9538 - val_binary_crossentropy:  0.6477 - val_auc:  0.5801
Epoch 3/10
0s - loss:  0.4963 - binary_crossentropy:  0.4963 - auc:  0.9993 - val_binary_crossentropy:  0.6503 - val_auc:  0.5498
Epoch 4/10
0s - loss:  0.4042 - binary_crossentropy:  0.4042 - auc:  0.9993 - val_binary_crossentropy:  0.6705 - val_auc:  0.5238
Epoch 5/10
0s - loss:  0.3120 - binary_crossentropy:  0.3120 - auc:  0.9997 - val_binary_crossentropy:  0.7140 - val_auc:  0.5195
Epoch 6/10
0s - loss:  0.2245 - binary_crossentropy:  0.2245 - auc:  1.0000 - val_binary_crossentropy:  0.7790 - val_auc:  0.5411
Epoch 7/10
0s - loss:  0.1524 - binary_crossentropy:  0.1524 - auc:  1.0000 - val_binary_crossentropy:  0.8525 - val_auc:  0.5455
Epoch 8/10
0s - loss: 

In [11]:
# Display the column labels of `data`. Only a cell's last expression is
# shown, so the former leading `list(range(76))` was evaluated and thrown away.
data.columns

Index([      0,       1,       2,       3,       4,       5,       6,       7,
             8,       9,      10,      11,      12,      13,      14,      15,
            16,      17,      18,      19,      20,      21,      22,      23,
            24,      25,      26,      27,      28,      29,      30,      31,
            32,      33,      34,      35,      36,      37,      38,      39,
            40,      41,      42,      43,      44,      45,      46,      47,
            48,      49,      50,      51,      52,      53,      54,      55,
            56,      57,      58,      59,      60,      61,      62,      63,
            64,      65,      66,      67,      68,      69,      70,      71,
            72,      73,      74,      75, 'label'],
      dtype='object')

In [12]:
# Load the pickled training frame.
data = pd.read_pickle('./df_train_76d_full.pkl')

# The dense features are the columns labelled with the integers 0..75.
dense_features = list(range(76))

# Missing values in those columns become 0.
data[dense_features] = data[dense_features].fillna(0)

In [16]:
# Load the three pickled frames, keeping each one available by name.
data_train, data_test, data_val = (
    pd.read_pickle(path)
    for path in ('./df_train_76d_full.pkl',
                 './df_test_76d_full.pkl',
                 './df_val_76d_full.pkl')
)

# Stack the rows in train, test, val order under a fresh 0..n-1 index and
# write the combined frame to disk.
data = pd.concat([data_train, data_test, data_val], ignore_index=True)
data.to_pickle("./df_76d_full_50942.pkl")

In [None]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
import torch

if __name__ == "__main__":
    # End-to-end run on the combined 76-column frame: preprocess, train
    # DeepFM and print test-set log loss and AUC.
    data = pd.read_pickle('./df_76d_full_50942.pkl')

    # No sparse columns are used here, so the sparse steps below have
    # nothing to iterate over. The dense features are the columns labelled
    # with the integers 0..75.
    sparse_features = []
    dense_features = list(range(76))
    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)
    target = ['label']

    # 1. Label-encode the sparse features and min-max scale the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field and record the dense
    #    feature field names
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                              for feat in sparse_features] + [DenseFeat(feat, 1)
                                                              for feat in dense_features]

    # Both model parts share the very same list object.
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model.
    # The split is seeded so the train/test partition -- and with it the
    # printed test metrics -- is the same on every run instead of changing
    # with each execution.
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, train, predict and evaluate
    device = 'cpu'
    use_cuda = False
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns, task='binary',
                   l2_reg_embedding=1e-5, device=device)

    model.compile("adagrad", "binary_crossentropy",
                  metrics=["binary_crossentropy", "auc"])
    model.fit(train_model_input, train[target].values,
              batch_size=256, epochs=100, validation_split=0.2, verbose=1)

    # Predict on the held-out rows with a batch size of 256 and report metrics.
    pred_ans = model.predict(test_model_input, 256)
    print("")
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

In [25]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
import torch

if __name__ == "__main__":
    # End-to-end run on the combined 76-column frame: preprocess, train DCN
    # and print test-set log loss and AUC.
    data = pd.read_pickle('./df_76d_full_50942.pkl')

    # No sparse columns are used here, so the sparse steps below have
    # nothing to iterate over. The dense features are the columns labelled
    # with the integers 0..75.
    sparse_features = []
    dense_features = list(range(76))
    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)
    target = ['label']

    # 1. Label-encode the sparse features and min-max scale the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field and record the dense
    #    feature field names
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                              for feat in sparse_features] + [DenseFeat(feat, 1)
                                                              for feat in dense_features]

    # linear_feature_columns is kept for parity with the DeepFM cells; the
    # DCN constructor below is only given dnn_feature_columns.
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model.
    # The split is seeded so the train/test partition -- and with it the
    # printed test metrics -- is the same on every run instead of changing
    # with each execution.
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, train, predict and evaluate
    device = 'cpu'
    use_cuda = False
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = DCN(dnn_feature_columns=dnn_feature_columns, task='binary',
                l2_reg_embedding=1e-5, device=device)

    model.compile("adagrad", "binary_crossentropy",
                  metrics=["binary_crossentropy", "auc"])
    model.fit(train_model_input, train[target].values,
              batch_size=256, epochs=100, validation_split=0.2, verbose=1)

    # Predict on the held-out rows with a batch size of 256 and report metrics.
    pred_ans = model.predict(test_model_input, 256)
    print("")
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

0it [00:00, ?it/s]

cpu
Train on 32602 samples, validate on 8151 samples, 128 steps per epoch


44it [00:05,  7.39it/s]


KeyboardInterrupt: 

In [22]:
import numpy as np

from deepctr_torch.models import DIN
from deepctr_torch.inputs import SparseFeat,VarLenSparseFeat,DenseFeat,get_feature_names


def get_xy_fd():
    """Build a three-sample toy dataset and its feature columns for DIN.

    Returns:
        tuple: ``(x, y, feature_columns, behavior_feature_list)`` where

        * ``x`` maps every name returned by
          ``get_feature_names(feature_columns)`` to its numpy array,
        * ``y`` is the list of labels ``[1, 0, 1]``,
        * ``feature_columns`` is the list of SparseFeat / DenseFeat /
          VarLenSparseFeat definitions,
        * ``behavior_feature_list`` names the features (``"item"`` and
          ``"item_gender"``) that have a ``hist_``-prefixed history column.
    """
    # Sizes written as `n + 1` reserve id 0 as the mask value.
    feature_columns = [SparseFeat('user', 3), SparseFeat('gender', 2),
                       SparseFeat('item', 3 + 1), SparseFeat('item_gender', 2 + 1),
                       DenseFeat('score', 1)]
    # Each history column reuses the embedding of its base feature, so it is
    # declared with the same size as that feature: 'hist_item_gender' shares
    # 'item_gender' and therefore uses 2 + 1 (it was previously 3 + 1, which
    # disagreed with the 'item_gender' declaration above).
    feature_columns += [VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('hist_item_gender', 2 + 1, maxlen=4, embedding_name='item_gender')]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    # One row per sample, padded with the mask value 0 up to length 4.
    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score}
    # Keep only the inputs the feature columns ask for.
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list


if __name__ == "__main__":
    # Toy inputs, labels and feature-column definitions.
    x, y, feature_columns, behavior_feature_list = get_xy_fd()

    # hist_len_max=4 equals the maxlen=4 used for the history columns in
    # get_xy_fd().
    model = DIN(feature_columns, behavior_feature_list, hist_len_max=4)
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )

    # Ten epochs, with validation_split=0.5.
    history = model.fit(x, y, epochs=10, verbose=1, validation_split=0.5)

cpu
Train on 128 samples, validate on 32 samples, 1 steps per epoch
Epoch 1/10
0s - loss:  0.6996 - binary_crossentropy:  0.6996 - val_binary_crossentropy:  0.6909
Epoch 2/10
0s - loss:  0.6857 - binary_crossentropy:  0.6857 - val_binary_crossentropy:  0.6812
Epoch 3/10
0s - loss:  0.6712 - binary_crossentropy:  0.6712 - val_binary_crossentropy:  0.6722
Epoch 4/10
0s - loss:  0.6574 - binary_crossentropy:  0.6574 - val_binary_crossentropy:  0.6635
Epoch 5/10
0s - loss:  0.6438 - binary_crossentropy:  0.6438 - val_binary_crossentropy:  0.6550
Epoch 6/10
0s - loss:  0.6301 - binary_crossentropy:  0.6301 - val_binary_crossentropy:  0.6467
Epoch 7/10
0s - loss:  0.6165 - binary_crossentropy:  0.6165 - val_binary_crossentropy:  0.6388
Epoch 8/10
0s - loss:  0.6033 - binary_crossentropy:  0.6033 - val_binary_crossentropy:  0.6314
Epoch 9/10
0s - loss:  0.5904 - binary_crossentropy:  0.5904 - val_binary_crossentropy:  0.6243
Epoch 10/10
0s - loss:  0.5777 - binary_crossentropy:  0.5777 - val_