In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import feature_column
from tensorflow.keras.layers import Embedding, Input, Dot, Dense, Flatten, Multiply, Concatenate, DenseFeatures
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adagrad, Adam, SGD, RMSprop
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
import datetime

In [2]:
def preprocess_data(data_path):
    header_data = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(data_path, sep='\t', names=header_data)
    data = data.drop("timestamp", axis=1).astype(int)
    
    num_items = data["item_id"].max()
    num_users = data["user_id"].max()
    
    y = data["rating"]
    y = np.where(y==5, 1, 0)

    X = data.drop("rating", axis=1)
#     X = X.astype('category')
    
    return X, y, num_items, num_users

In [3]:
# Read data
# User data
header_user = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
data_user = pd.read_csv('data/u.user', sep='|', names=header_user)
data_user = data_user.drop(['zip_code'], axis=1)

# Item data
header_item = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
        'Thriller', 'War', 'Western']
data_item = pd.read_csv('data/u.item', sep='|', names=header_item, encoding = "ISO-8859-1")
data_item = data_item.drop(['release_date', 'title', 'video_release_date', 'IMDb_URL'], axis=1)

# User Item interaction data
X_train, y_train, num_items, num_users = preprocess_data("data/ub.base")
X_test , y_test, _, _ = preprocess_data("data/ub.test")

In [4]:
# Data merge and label separation
X_train = X_train.merge(data_user, how="left", left_on="user_id", right_on="user_id", suffixes=(False, False))
X_train = X_train.merge(data_item, left_on="item_id", right_on="item_id")
X_test = X_test.merge(data_user, left_on="user_id", right_on="user_id")
X_test = X_test.merge(data_item, left_on="item_id", right_on="item_id") 

In [5]:
X_train.head()

Unnamed: 0,user_id,item_id,age,gender,occupation,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,24,M,technician,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,1,53,F,other,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,5,1,33,F,other,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,6,1,42,M,executive,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,10,1,53,M,lawyer,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(x, y, shuffle=True, batch_size=32):
  x = x.copy()
  ds = tf.data.Dataset.from_tensor_slices((dict(x), y))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(x))
  ds = ds.batch(batch_size)
  return ds

In [7]:
batch_size = 32 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(X_train, y_train, batch_size=batch_size)
val_ds = df_to_dataset(X_test, y_test, shuffle=False, batch_size=batch_size)

In [8]:
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['age'])
  print('A batch of user_id:', feature_batch['user_id'])
  print('A batch of occupations:', feature_batch['occupation'])
  print('A batch of Animation:', feature_batch['Animation'])
  print('A batch of targets:', label_batch )

Every feature: ['user_id', 'item_id', 'age', 'gender', 'occupation', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
A batch of ages: tf.Tensor(
[23 70 35 29 29 48 33 50 35 19 23 60 33 19 13 23 51 21 20 35 20 35 34 32
 48 29 29 21 40 23 22 20], shape=(32,), dtype=int64)
A batch of user_id: tf.Tensor(
[  3 860 256 365 768 463 381 909 378  68 838 694 387 110 674 305 548 198
 773 450  99 256 833 796 488 222 109 671 200 305 405 886], shape=(32,), dtype=int64)
A batch of occupations: tf.Tensor(
[b'writer' b'retired' b'none' b'lawyer' b'administrator' b'healthcare'
 b'artist' b'educator' b'student' b'student' b'student' b'programmer'
 b'entertainment' b'student' b'student' b'programmer' b'writer' b'student'
 b'student' b'educator' b'student' b'none' b'writer' b'writer'
 b'technician' b'programmer' b'other' b'programmer' b'programmer'

# Model

In [9]:
# Dense inputs
inputs = {
    'age': tf.keras.layers.Input(name='age', shape=(), dtype='int8'),
    'user_id': tf.keras.layers.Input(name='user_id', shape=(), dtype='int8'),
    'item_id': tf.keras.layers.Input(name='item_id', shape=(), dtype='int8')
    
}
inputs.update({
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='int8') 
          for colname in ['unknown','Action', 'Adventure', 'Animation', 'Children', 
                                    'Comedy', 'Crime','Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                                    'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller', 
                                    'War', 'Western']
})

# Sparse inputs
inputs.update({
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='string') 
          for colname in [ 'occupation', 'gender'] 
    
})

In [10]:
# Dense features
features = {
    'age' : tf.feature_column.numeric_column('age'),
    'gender': feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list
                ('gender', vocabulary_list=['M', 'F']))
}
features.update(
            {
                categorical_feature: tf.feature_column.indicator_column(
                    tf.feature_column.categorical_column_with_identity(categorical_feature,num_buckets=2))
                    for categorical_feature in ['unknown','Action', 'Adventure', 'Animation', 'Children', 
                                    'Comedy', 'Crime','Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                                    'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller', 
                                    'War', 'Western']
            }
)
# Sparse features
features.update({

            'embedding_occupation': tf.feature_column.embedding_column(
                tf.feature_column.categorical_column_with_vocabulary_list(
                'occupation', vocabulary_list=[
                    'technician', 'executive', 'administrator', 'programmer',
                    'marketing', 'student', 'artist', 'engineer', 'librarian',
                    'educator', 'other', 'scientist', 'homemaker', 'salesman',
                    'healthcare', 'entertainment', 'retired', 'writer', 'none',
                    'lawyer', 'doctor']), 10),
            'embedding_user_id':tf.feature_column.embedding_column(
                feature_column.categorical_column_with_identity(
                'user_id', num_buckets=num_users), 10),
            'embedding_item_id':tf.feature_column.embedding_column(
                feature_column.categorical_column_with_identity(
                'item_id', num_buckets=num_items), 10)
})

In [11]:
def cross_layer(x0, xl):
    """
    实现一层cross layer
    @param x0: 特征embeddings
    @param xl: 前一层的输出结果
    """
    # 1.获取xl层的embedding size
    embed_dim = xl.shape[-1]
    # 2.初始化当前层的W和b
    w = tf.Variable(tf.random.truncated_normal(shape=(embed_dim,), stddev=0.01))
    b = tf.Variable(tf.zeros(shape=(embed_dim,)))
    # 3.计算feature crossing
    # 下面的reshape操作相当于将列向量转换为行向量
    x1_T = tf.reshape(xl, [-1, 1, embed_dim])
    # 行向量与列向量的乘积结果是一个标量
    x_lw = tf.tensordot(x1_T, w, axes=1)
    cross = x0 * x_lw 
    return cross + b + xl

In [12]:
DNN_HIDDEN_UNITS = '64,32'
NUM_CROSS_LAYERS = 3

In [13]:
# Build a Deep&Cross model.
def deep_and_cross_classifier(inputs, features, dnn_hidden_units, num_cross_layers):
    features = tf.keras.layers.DenseFeatures(features, name='inputs')(inputs)
    
    # Deep
    layers = [int(x) for x in dnn_hidden_units.split(',')]
    for layerno, numnodes in enumerate(layers):
        deep = tf.keras.layers.Dense(numnodes, activation='relu', name='dnn_{}'.format(layerno+1))(features)        

    # Cross
    # 初始化xl为x0
    cross = features
    for i in range(num_cross_layers):
        cross = cross_layer(features, cross)
    
    # Concatenate Deep and Cross
    both = tf.keras.layers.concatenate([deep, cross], name='both')

    output = tf.keras.layers.Dense(1, activation='sigmoid', name='pred')(both)
    model = tf.keras.Model(inputs, output)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [14]:
model = deep_and_cross_classifier(
    inputs,
    features = features.values(),
    dnn_hidden_units = DNN_HIDDEN_UNITS,
    num_cross_layers = NUM_CROSS_LAYERS
)

In [None]:
tf.keras.utils.plot_model(model, 'deep_and_cross.png', show_shapes=False, rankdir='LR')

In [15]:
# Clear any logs from previous runs
!rm -rf ./logs/ 

In [16]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

In [17]:
history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=10, 
                    steps_per_epoch=500,
                    callbacks = [tensorboard_callback]
                   )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [19]:
!tensorboard --logdir logs/fit

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.2.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C
