### Step 1: Import model and load data

In [37]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
import numpy as np

# Column names used throughout the notebook: 26 sparse fields (C1..C26),
# 13 dense fields (I1..I13) and the single target column.
sparse_features = ['C' + str(idx) for idx in range(1, 27)]
dense_features = ['I' + str(idx) for idx in range(1, 14)]
target = ['label']

data = pd.read_csv('./criteo_sample.txt')

# Fill gaps before encoding/scaling: the string '-1' stands in for a missing
# sparse value, and 0 for a missing dense value.
data[sparse_features] = data[sparse_features].fillna(value='-1')
data[dense_features] = data[dense_features].fillna(value=0)

In [38]:
# Sanity check: list the sparse feature column names ('C1' .. 'C26').
print(sparse_features)

['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']


In [39]:
# Sanity check: list the dense feature column names ('I1' .. 'I13').
print(dense_features)

['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']


In [40]:
# Quick look at the loaded frame: its (rows, columns) shape, then the first ten rows.
print(data.shape)
print(data.head(10))

(200, 40)
   label    I1   I2     I3    I4       I5     I6    I7    I8     I9  ...  \
0      0   0.0    3  260.0   0.0  17668.0    0.0   0.0  33.0    0.0  ...   
1      0   0.0   -1   19.0  35.0  30251.0  247.0   1.0  35.0  160.0  ...   
2      0   0.0    0    2.0  12.0   2013.0  164.0   6.0  35.0  523.0  ...   
3      0   0.0   13    1.0   4.0  16836.0  200.0   5.0   4.0   29.0  ...   
4      0   0.0    0  104.0  27.0   1990.0  142.0   4.0  32.0   37.0  ...   
5      0   0.0   -1   63.0  40.0   1470.0   61.0   4.0  37.0   46.0  ...   
6      0   0.0  370    4.0   1.0   1787.0   65.0  14.0  25.0  489.0  ...   
7      1  19.0   10   30.0  10.0      1.0    3.0  33.0  47.0  126.0  ...   
8      0   0.0    0   36.0  22.0   4684.0  217.0   9.0  35.0  135.0  ...   
9      0   2.0   11    8.0  23.0     30.0   11.0   2.0   8.0   23.0  ...   

        C17       C18       C19       C20       C21 C22       C23       C24  \
0  e5ba7672  87c6f83c        -1        -1  0429f84b  -1  3a171ecb  c0d61a5

### Step 2: Simple preprocessing
**Label Encoding: map each sparse feature to integer values in the range 0 ~ len(#unique) - 1**

In [41]:
# Replace every sparse column with integer codes. A fresh LabelEncoder is
# fitted per column, so the codes of different columns are independent.
for feat in sparse_features:
    lbe = LabelEncoder()
    lbe.fit(data[feat])
    data[feat] = lbe.transform(data[feat])

**Hash Encoding: map the features to a fixed range, like 0 ~ 9999. We have 2 methods to do that:**

Do feature hashing before training

In [16]:
# NOTE(review): `HashEncoder` is neither imported nor defined in the visible
# cells, so this cell fails with NameError as written (see the recorded error
# below). It reads as an illustrative placeholder for an offline hashing
# step - TODO: supply a real encoder or turn this cell into markdown.
# NOTE(review): this is presented as an alternative to the label-encoding
# loop, not a follow-up to it; presumably only one of the two should be
# applied to the sparse columns - confirm.
for feat in sparse_features:
    lbe = HashEncoder()
    data[feat] = lbe.transform(data[feat])

NameError: name 'HashEncoder' is not defined

Do feature hashing on the fly in training process

We can do feature hashing on the fly by setting `use_hash=True` in `SparseFeat` or `VarlenSparseFeat` in Step 3.

**Dense numerical features are usually discretized into buckets; here we use normalization instead.**

In [42]:
# Rescale the dense columns into the [0, 1] range; the scaler is fitted on
# the same columns it then transforms.
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit(data[dense_features]).transform(data[dense_features])

### Step 3: Generate feature columns

For sparse features, we transform them into dense vectors by embedding techniques. For dense numerical features, we concatenate them to the input tensors of the fully connected layer.

And for varlen (multi-valued) sparse features, you can use `VarlenSparseFeat`. Visit the examples of using `VarlenSparseFeat`.

**Label Encoding**

In [43]:
# Label-encoding variant: one SparseFeat (embedding_dim=4) per sparse column
# and one 1-dimensional DenseFeat per dense column.
# vocabulary_size has to be larger than the biggest integer code in the
# column, so it is derived from max() + 1 rather than nunique(); the two only
# agree when the codes are contiguous from 0, as LabelEncoder output is.
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=int(data[feat].max()) + 1, embedding_dim=4)
                          for feat in sparse_features] + [DenseFeat(feat, 1)
                          for feat in dense_features]

**Feature Hashing on the fly**

In [26]:
# Hash-on-the-fly variant. This assignment replaces the label-encoding
# definition of fixlen_feature_columns, so run only one of the two cells.
# NOTE(review): dtype='string' presumes the sparse columns still hold their
# raw string values, i.e. the label-encoding loop was not applied - confirm
# before running this cell.
# The bucket count is written as an integer: the literal 1e6 is a float, and
# a vocabulary size is a count of hash buckets / embedding rows.
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=10 ** 6, embedding_dim=4, use_hash=True, dtype='string')  # since the input is string
                          for feat in sparse_features] + [DenseFeat(feat, 1)
                          for feat in dense_features]

In [30]:
# Inspect the generated feature columns. The recorded output below shows
# use_hash=False and dtype='int32', i.e. it was produced with the
# label-encoding definition of fixlen_feature_columns.
print(fixlen_feature_columns)

[SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x1a3e751a50>, embedding_name='C1', group_name='default_group', trainable=True), SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x1a3e751b10>, embedding_name='C2', group_name='default_group', trainable=True), SparseFeat(name='C3', vocabulary_size=172, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x1a3eafb310>, embedding_name='C3', group_name='default_group', trainable=True), SparseFeat(name='C4', vocabulary_size=157, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x1a3eafbc90>, embedding_name='C4', group_name='default_gr

**Generate feature columns**

In [44]:
# Both the linear and the DNN inputs are given the very same column list.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

# Names of the model inputs, computed over the concatenation of both lists.
feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns
)

In [45]:
# Show the names returned by get_feature_names; they are used as the keys of
# the train/test model-input dicts built in the training cell.
print(feature_names)

['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']


### Step 4: Generate the training samples and train the model

In [46]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# identical on every run, so results can be compared across re-executions.
train, test = train_test_split(data, test_size=0.2, random_state=2020)

# The model is fed a dict that maps each feature name to that column's values.
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

# Build and compile DeepFM for binary classification; the loss is also
# reported as a metric.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])

# validation_split=0.2 sets aside a fifth of the training rows for validation
# during fitting (see the "Train on ... validate on ..." line in the output).
# NOTE(review): no framework-level seed is set in the visible cells, so the
# loss values can presumably still differ between runs - confirm if exact
# reproducibility is required.
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2)
pred_ans = model.predict(test_model_input, batch_size=256)

Train on 128 samples, validate on 32 samples
Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


128/128 - 9s - loss: 0.6810 - binary_crossentropy: 0.6810 - val_loss: 0.6698 - val_binary_crossentropy: 0.6698
Epoch 2/10
128/128 - 0s - loss: 0.6630 - binary_crossentropy: 0.6630 - val_loss: 0.6657 - val_binary_crossentropy: 0.6657
Epoch 3/10
128/128 - 0s - loss: 0.6462 - binary_crossentropy: 0.6462 - val_loss: 0.6623 - val_binary_crossentropy: 0.6622
Epoch 4/10
128/128 - 0s - loss: 0.6303 - binary_crossentropy: 0.6303 - val_loss: 0.6594 - val_binary_crossentropy: 0.6593
Epoch 5/10
128/128 - 0s - loss: 0.6152 - binary_crossentropy: 0.6152 - val_loss: 0.6570 - val_binary_crossentropy: 0.6570
Epoch 6/10
128/128 - 0s - loss: 0.6006 - binary_crossentropy: 0.6005 - val_loss: 0.6550 - val_binary_crossentropy: 0.6549
Epoch 7/10
128/128 - 0s - loss: 0.5864 - binary_crossentropy: 0.5864 - val_loss: 0.6532 - val_binary_crossentropy: 0.6532
Epoch 8/10
128/128 - 0s - loss: 0.5725 - binary_crossentropy: 0.5724 - val_loss: 0.6518 - val_binary_crossentropy: 0.6518
Epoch 9/10
128/128 - 0s - loss: 0.5

In [35]:
# Dump the full test input dict: one array of held-out values per feature name.
print(test_model_input)

{'C1': array([16,  0, 11, 11, 21,  0,  0, 16, 20,  0, 11, 25,  9, 11,  0, 11, 18,
        0,  0, 18,  0,  1,  0,  0,  0, 11, 12,  0, 11, 16,  9, 21,  0,  0,
       16,  0, 16, 11,  0,  0]), 'C2': array([ 5, 30, 84,  5, 89,  8, 33,  7, 54, 72, 13, 50, 15, 85, 30, 10, 10,
        6, 11,  5, 31, 30, 38, 14, 72, 24, 13,  4, 16, 82, 39, 66,  5, 88,
       53, 87,  5, 79, 46, 30]), 'C3': array([114, 165, 104,  62,  81,  70,  23, 141, 120, 127,  36, 160,   9,
       171,  61,  38,  93,  82,  59,   5, 101,  10, 103, 168,  89, 159,
       126,  96,  15,  49, 121,  51,  54, 122,   0,  87, 167, 164, 156,
        74]), 'C4': array([ 76, 147,  71,  16,  21,  37,  78,  85, 121, 152,  18,   6,  47,
        80, 136,  89,  33, 128,  77, 123, 137,  44,  92,   6, 132,   4,
       124, 146,  28,  14, 128,  24, 119,  43,   0, 115,  86,  23,  76,
        64]), 'C5': array([1, 1, 1, 1, 6, 6, 1, 1, 3, 1, 1, 1, 6, 1, 9, 5, 1, 5, 1, 1, 0, 6,
       5, 1, 5, 1, 1, 1, 6, 1, 6, 1, 1, 1, 1, 1, 1, 1, 6, 6]), 'C6': a