### Classification: Criteo
In this example, we simply normalize the dense features to the range 0 to 1; you can try other transformation techniques such as log normalization or discretization. Then we use SparseFeat and DenseFeat to generate feature columns for the sparse features and the dense features.

In [2]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import *
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

In [15]:
# Read the Criteo sample file located next to the notebook.
data = pd.read_csv('./criteo_sample.txt')

# Sparse columns are named C1..C26, dense columns I1..I13.
sparse_features = ['C{}'.format(idx) for idx in range(1, 27)]
dense_features = ['I{}'.format(idx) for idx in range(1, 14)]

# Report how many columns of each kind there are, then the row count.
n_sparse = len(sparse_features)
n_dense = len(dense_features)
print("sparse_features len:", n_sparse)
print("dense_features len:", n_dense)
print(len(data))

sparse_features len: 26
dense_features len: 13
200


In [16]:
# Peek at the first four rows of the raw frame.
print(data.head(4))

   label   I1  I2     I3    I4       I5     I6   I7    I8     I9  ...  \
0      0  0.0   3  260.0   NaN  17668.0    NaN  NaN  33.0    NaN  ...   
1      0  NaN  -1   19.0  35.0  30251.0  247.0  1.0  35.0  160.0  ...   
2      0  0.0   0    2.0  12.0   2013.0  164.0  6.0  35.0  523.0  ...   
3      0  NaN  13    1.0   4.0  16836.0  200.0  5.0   4.0   29.0  ...   

        C17       C18  C19  C20       C21  C22       C23       C24  C25  C26  
0  e5ba7672  87c6f83c  NaN  NaN  0429f84b  NaN  3a171ecb  c0d61a5c  NaN  NaN  
1  d4bb7bd8  6fc84bfb  NaN  NaN  5155d8a3  NaN  be7c41b4  ded4aac9  NaN  NaN  
2  e5ba7672  675c9258  NaN  NaN  2e01979f  NaN  bcdee96c  6d5d1302  NaN  NaN  
3  e5ba7672  52e44668  NaN  NaN  e587c466  NaN  32c7478e  3b183c5c  NaN  NaN  

[4 rows x 40 columns]


In [19]:
# Missing sparse values become their own '-1' category; missing dense values become 0.
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)
# `target` is a list of column names (not the label Series itself) so that
# `train[target]` / `test[target]` select the label column from the split frames.
target = ['label']

print(len(data))

200


In [21]:
# Show the first four rows of the dense columns after NaN filling.
print(data.loc[:, dense_features].head(4))

    I1  I2     I3    I4       I5     I6   I7    I8     I9  I10  I11  I12   I13
0  0.0   3  260.0   0.0  17668.0    0.0  0.0  33.0    0.0  0.0  0.0  0.0   0.0
1  0.0  -1   19.0  35.0  30251.0  247.0  1.0  35.0  160.0  0.0  1.0  0.0  35.0
2  0.0   0    2.0  12.0   2013.0  164.0  6.0  35.0  523.0  0.0  3.0  0.0  18.0
3  0.0  13    1.0   4.0  16836.0  200.0  5.0   4.0   29.0  0.0  2.0  0.0   4.0


**1. Label-encode the sparse features and apply a simple transformation to the dense features**

In [None]:
# Replace every sparse column with integer ids, fitting a fresh encoder per column.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# Rescale the dense columns to [0, 1]. `feature_range` is a keyword argument
# taking a (min, max) tuple; calling it like a function raises NameError.
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

**2. Count the number of unique values for each sparse field, and record the dense feature field names**

In [None]:
# Display the list of sparse feature column names.
print(sparse_features)

In [None]:
# Build one SparseFeat per sparse column, sized by that column's number of distinct values.
fixlen_feature_columns = []
for i, feat in enumerate(sparse_features):
    n_unique = len(data[feat].unique())
    sf = SparseFeat(feat, vocabulary_size=n_unique, embedding_dim=4)
    fixlen_feature_columns += [sf]
    print(i, feat, n_unique, sf)


In [None]:
# Add one single-dimension DenseFeat per dense column to the shared column list.
for feat in dense_features:
    df = DenseFeat(name=feat, dimension=1)
    fixlen_feature_columns += [df]
    distinct_values = data[feat].unique()
    print(feat, len(distinct_values), df)

In [None]:
# Number of feature columns collected so far.
print(len(fixlen_feature_columns))

In [None]:
# The linear part and the DNN part both use the same column list (same list object).
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

In [None]:
# Collect the input names from the concatenation of both column lists.
all_feature_columns = linear_feature_columns + dnn_feature_columns
feature_names = get_feature_names(all_feature_columns)

In [None]:
# Display the feature names returned by get_feature_names.
print(feature_names)

**3. Generate input data for the model**

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# identical on every run (model weight initialisation is still unseeded).
train, test = train_test_split(data, test_size=0.2, random_state=2020)

# Model inputs are passed as a dict mapping each feature name to its column.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

**4. Define Model, train, predict and evaluate**

In [None]:
# Instantiate DeepFM for a binary task using the same columns for both parts.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
)

In [None]:
# Adam optimizer with binary cross-entropy as both the loss and the reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)

In [None]:
# Train for 10 epochs, reserving 20% of the training rows for validation.
history = model.fit(
    x=train_model_input,
    y=train[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

In [None]:
# Score the held-out rows and report log loss and AUC rounded to 4 decimals.
pred_ans = model.predict(test_model_input, batch_size=256)
y_test = test[target].values
test_logloss = round(log_loss(y_test, pred_ans), 4)
test_auc = round(roc_auc_score(y_test, pred_ans), 4)
print("test LogLoss", test_logloss)
print("test AUC", test_auc)

In [22]:
# Display the list of sparse feature column names.
print(sparse_features)

['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']


In [50]:
# Build one SparseFeat per sparse column, sized by that column's number of distinct values.
fixlen_feature_columns = []
for i, feat in enumerate(sparse_features):
    n_unique = len(data[feat].unique())
    sf = SparseFeat(feat, vocabulary_size=n_unique, embedding_dim=4)
    fixlen_feature_columns += [sf]
    print(i, feat, n_unique, sf)


0 C1 27 SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x1a4110bb90>, embedding_name='C1', group_name='default_group', trainable=True)
1 C2 92 SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x1a4110bb10>, embedding_name='C2', group_name='default_group', trainable=True)
2 C3 172 SparseFeat(name='C3', vocabulary_size=172, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x1a41101c10>, embedding_name='C3', group_name='default_group', trainable=True)
3 C4 157 SparseFeat(name='C4', vocabulary_size=157, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x1a40fe5c50>, embedding_nam

In [51]:
# Add one single-dimension DenseFeat per dense column to the shared column list.
for feat in dense_features:
    df = DenseFeat(name=feat, dimension=1)
    fixlen_feature_columns += [df]
    distinct_values = data[feat].unique()
    print(feat, len(distinct_values), df)

I1 14 DenseFeat(name='I1', dimension=1, dtype='float32')
I2 68 DenseFeat(name='I2', dimension=1, dtype='float32')
I3 55 DenseFeat(name='I3', dimension=1, dtype='float32')
I4 35 DenseFeat(name='I4', dimension=1, dtype='float32')
I5 172 DenseFeat(name='I5', dimension=1, dtype='float32')
I6 92 DenseFeat(name='I6', dimension=1, dtype='float32')
I7 42 DenseFeat(name='I7', dimension=1, dtype='float32')
I8 41 DenseFeat(name='I8', dimension=1, dtype='float32')
I9 113 DenseFeat(name='I9', dimension=1, dtype='float32')
I10 4 DenseFeat(name='I10', dimension=1, dtype='float32')
I11 15 DenseFeat(name='I11', dimension=1, dtype='float32')
I12 5 DenseFeat(name='I12', dimension=1, dtype='float32')
I13 43 DenseFeat(name='I13', dimension=1, dtype='float32')


In [52]:
# Number of feature columns collected so far.
print(len(fixlen_feature_columns))

39


In [53]:
# The linear part and the DNN part both use the same column list (same list object).
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

In [54]:
# Collect the input names from the concatenation of both column lists.
all_feature_columns = linear_feature_columns + dnn_feature_columns
feature_names = get_feature_names(all_feature_columns)

In [55]:
# Display the feature names returned by get_feature_names.
print(feature_names)

['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']


**3. Generate input data for the model**

In [56]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# identical on every run (model weight initialisation is still unseeded).
train, test = train_test_split(data, test_size=0.2, random_state=2020)

# Model inputs are passed as a dict mapping each feature name to its column.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

**4. Define Model, train, predict and evaluate**

In [57]:
# Instantiate DeepFM for a binary task using the same columns for both parts.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
)

In [58]:
# Adam optimizer with binary cross-entropy as both the loss and the reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)

In [59]:
# Train for 10 epochs, reserving 20% of the training rows for validation.
history = model.fit(
    x=train_model_input,
    y=train[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 128 samples, validate on 32 samples
Epoch 1/10
128/128 - 6s - loss: 0.7033 - binary_crossentropy: 0.7033 - val_loss: 0.7107 - val_binary_crossentropy: 0.7107
Epoch 2/10
128/128 - 0s - loss: 0.6861 - binary_crossentropy: 0.6861 - val_loss: 0.6942 - val_binary_crossentropy: 0.6942
Epoch 3/10
128/128 - 0s - loss: 0.6695 - binary_crossentropy: 0.6695 - val_loss: 0.6778 - val_binary_crossentropy: 0.6778
Epoch 4/10
128/128 - 0s - loss: 0.6532 - binary_crossentropy: 0.6532 - val_loss: 0.6614 - val_binary_crossentropy: 0.6614
Epoch 5/10
128/128 - 0s - loss: 0.6373 - binary_crossentropy: 0.6373 - val_loss: 0.6451 - val_binary_crossentropy: 0.6451
Epoch 6/10
128/128 - 0s - loss: 0.6216 - binary_crossentropy: 0.6216 - val_loss: 0.6290 - val_binary_crossentropy: 0.6290
Epoch 7/10
128/128 - 0s - loss: 0.6061 - binary_crossentropy: 0.6061 - val_loss: 0.6130 - val_binary_crossentropy: 0.6130
Epoch 8/10
128/128 - 0s - loss: 0.5907 - binary_crossentropy: 0.5906 - val_loss: 0.5973 - val_binary_

In [60]:
# Score the held-out rows and report log loss and AUC rounded to 4 decimals.
pred_ans = model.predict(test_model_input, batch_size=256)
y_test = test[target].values
test_logloss = round(log_loss(y_test, pred_ans), 4)
test_auc = round(roc_auc_score(y_test, pred_ans), 4)
print("test LogLoss", test_logloss)
print("test AUC", test_auc)

test LogLoss 0.5482
test AUC 0.6061
