In [1]:
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names


import pandas as pd
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score

data_path = '../data/ctr-criteo-small-data/train_1m.txt'
# 数据说明：https://www.kaggle.com/competitions/criteo-display-ad-challenge/data

# 数据读取

In [2]:
data = pd.read_csv(data_path, sep='\t', header=None)
print(data.shape)

(1000000, 40)


In [3]:
data = data.sample(frac=0.5) # 小取样，跑通

In [4]:
# 规定
dense_features = ['I'+str(i+1) for i in range(13)]
sparse_features = ['C'+str(i+1) for i in range(26)]
cols = ['label'] + dense_features + sparse_features
# 其中C1-C26是Category的特征(sparse_features)，而I1-I13是连续的特征(dense_features)。
data.columns = cols

# 数据处理

In [5]:
# 因为有缺失的值，我们把Category的缺失值设置为”-1”，而把连续的缺失值设置为0。
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)

In [6]:
# 类别型的encoder
for col in sparse_features:
    enc = LabelEncoder()
    data[col] = enc.fit_transform(data[col])

In [7]:
# 连续值：缩放到0-1范围
mms = MinMaxScaler(feature_range=(0,1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 构造deepFM的emb层

In [8]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1,embedding_dim=4) 
                          for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)for feat in dense_features]

In [24]:
for i in fixlen_feature_columns:
    print(i)
    print('\n')

SparseFeat(name='C1', vocabulary_size=1048, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fcba4ad7e90>, embedding_name='C1', group_name='default_group', trainable=True)


SparseFeat(name='C2', vocabulary_size=526, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fcba4ad7410>, embedding_name='C2', group_name='default_group', trainable=True)


SparseFeat(name='C3', vocabulary_size=176296, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fcba4ad77d0>, embedding_name='C3', group_name='default_group', trainable=True)


SparseFeat(name='C4', vocabulary_size=74497, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype

In [22]:
type(fixlen_feature_columns[0])

deepctr.feature_column.SparseFeat

In [10]:
# 线性部分以及DNN部分
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 模型构造

In [11]:
train, test = train_test_split(data, test_size=0.2)

In [12]:
target = ['label',]
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

model = DeepFM(linear_feature_columns,dnn_feature_columns,task='binary') # 规定了linear、dnn的输入
model.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], ) # 二分类、交叉熵损失函数

# 模型训练10轮

In [13]:
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Epoch 1/10
1250/1250 - 50s - loss: 0.4943 - binary_crossentropy: 0.4903 - val_loss: 0.4851 - val_binary_crossentropy: 0.4768
Epoch 2/10
1250/1250 - 43s - loss: 0.3876 - binary_crossentropy: 0.3744 - val_loss: 0.5615 - val_binary_crossentropy: 0.5447
Epoch 3/10
1250/1250 - 43s - loss: 0.3404 - binary_crossentropy: 0.3216 - val_loss: 0.5951 - val_binary_crossentropy: 0.5753
Epoch 4/10
1250/1250 - 43s - loss: 0.2819 - binary_crossentropy: 0.2634 - val_loss: 0.7555 - val_binary_crossentropy: 0.7376
Epoch 5/10
1250/1250 - 43s - loss: 0.2471 - binary_crossentropy: 0.2300 - val_loss: 0.8690 - val_binary_crossentropy: 0.8519
Epoch 6/10
1250/1250 - 43s - loss: 0.2235 - binary_crossentropy: 0.2068 - val_loss: 0.8518 - val_binary_crossentropy: 0.8347
Epoch 7/10
1250/1250 - 43s - loss: 0.2058 - binary_crossentropy: 0.1890 - val_loss: 0.9306 - val_binary_crossentropy: 0.9131
Epoch 8/10
1250/1250 - 43s - loss: 0.1952 - binary_crossentropy: 0.1778 - val_loss: 0.8600 - val_binary_crossentropy: 0.8417


In [14]:
pred_ans = model.predict(test_model_input, batch_size=256)

In [15]:
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

test LogLoss nan
test AUC 0.69


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


# 再进行30轮训练

In [16]:
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=30, verbose=2, validation_split=0.2, )

Epoch 1/30
1250/1250 - 43s - loss: 0.1697 - binary_crossentropy: 0.1505 - val_loss: 0.9736 - val_binary_crossentropy: 0.9537
Epoch 2/30
1250/1250 - 42s - loss: 0.1649 - binary_crossentropy: 0.1454 - val_loss: 0.9228 - val_binary_crossentropy: 0.9025
Epoch 3/30
1250/1250 - 41s - loss: 0.1584 - binary_crossentropy: 0.1386 - val_loss: 0.9576 - val_binary_crossentropy: 0.9372
Epoch 4/30
1250/1250 - 42s - loss: 0.1536 - binary_crossentropy: 0.1337 - val_loss: 0.9838 - val_binary_crossentropy: 0.9633
Epoch 5/30
1250/1250 - 43s - loss: 0.1482 - binary_crossentropy: 0.1283 - val_loss: 0.9920 - val_binary_crossentropy: 0.9717
Epoch 6/30
1250/1250 - 43s - loss: 0.1445 - binary_crossentropy: 0.1246 - val_loss: 1.0744 - val_binary_crossentropy: 1.0540
Epoch 7/30
1250/1250 - 44s - loss: 0.1413 - binary_crossentropy: 0.1214 - val_loss: 1.0725 - val_binary_crossentropy: 1.0520
Epoch 8/30
1250/1250 - 46s - loss: 0.1391 - binary_crossentropy: 0.1191 - val_loss: 1.0594 - val_binary_crossentropy: 1.0387


In [17]:
pred_ans = model.predict(test_model_input, batch_size=256)

In [18]:
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

test LogLoss nan
test AUC 0.675


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


In [19]:
# 出现过拟合情况