In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import feature_column
from tensorflow.keras.layers import Embedding, Input, Dot, Dense, Flatten, Multiply, Concatenate, DenseFeatures
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adagrad, Adam, SGD, RMSprop
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
import datetime

# Preprocess data

In [35]:
def preprocess_data(data_path):
    header_data = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(data_path, sep='\t', names=header_data)
    data = data.drop("timestamp", axis=1).astype(int)
    
    num_items = data["item_id"].max()
    num_users = data["user_id"].max()
    
    y = data["rating"]
    y = np.where(y==5, 1, 0)

    X = data.drop("rating", axis=1)
#     # Minus 1 so the index starts from 0
#     X = X-1
    X = X.astype('category')
    
    return X, y, num_items, num_users

In [36]:
# Read data
# User data
header_user = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
data_user = pd.read_csv('data/u.user', sep='|', names=header_user)
data_user = data_user.drop(['zip_code'], axis=1)

# Item data
header_item = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
        'Thriller', 'War', 'Western']
data_item = pd.read_csv('data/u.item', sep='|', names=header_item, encoding = "ISO-8859-1")
data_item = data_item.drop(['release_date', 'title', 'video_release_date', 'IMDb_URL'], axis=1)

# User Item interaction data
X_train, y_train, num_items, num_users = preprocess_data("data/ub.base")
X_test , y_test, _, _ = preprocess_data("data/ub.test")


In [37]:
# Data merge and label separation
X_train = X_train.merge(data_user, how="left", left_on="user_id", right_on="user_id", suffixes=(False, False))
X_train = X_train.merge(data_item, left_on="item_id", right_on="item_id")
X_test = X_test.merge(data_user, left_on="user_id", right_on="user_id")
X_test = X_test.merge(data_item, left_on="item_id", right_on="item_id")
# X_train = X_train.drop(["user_id", "item_id"], axis=1)
# X_test = X_test.drop(["user_id", "item_id"], axis=1)

In [38]:
X_train.head()

Unnamed: 0,user_id,item_id,age,gender,occupation,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,24,M,technician,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,1,53,F,other,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,5,1,33,F,other,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,6,1,42,M,executive,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,10,1,53,M,lawyer,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [39]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(90570, 24) (90570,) (9430, 24) (9430,)


# Feature Engineering

In [40]:
# Numeric based feature
numeric = {
    'age' : tf.feature_column.numeric_column("age"),
}

# Category based feature
# Binary category feature
category = {
        categorical_feature: tf.feature_column.categorical_column_with_identity(categorical_feature,num_buckets=2)
        for categorical_feature in ['unknown','Action', 'Adventure', 'Animation', 'Children', 
                                    'Comedy', 'Crime','Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                                    'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller', 
                                    'War', 'Western'] 
}
# Non-binary category feature
category.update({
        'occupation': tf.feature_column.categorical_column_with_vocabulary_list(
            'occupation', vocabulary_list=['technician', 'executive', 'administrator', 'programmer',
                                           'marketing', 'student', 'artist', 'engineer', 'librarian',
                                           'educator', 'other', 'scientist', 'homemaker', 'salesman',
                                           'healthcare', 'entertainment', 'retired', 'writer', 'none',
                                           'lawyer', 'doctor']),
        'gender': tf.feature_column.categorical_column_with_vocabulary_list('gender', vocabulary_list=['M', 'F']),
        'age_buckets': tf.feature_column.bucketized_column(numeric["age"], boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
})

inputs = {
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='int8') 
          for colname in numeric.keys()
}
inputs.update({
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='int8') 
          for colname in ['unknown','Action', 'Adventure', 'Animation', 'Children', 
                                    'Comedy', 'Crime','Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                                    'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller', 
                                    'War', 'Western']
})
inputs.update({
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='string') 
          for colname in ["occupation", "gender", "age_buckets"]
})

In [41]:
wide = {col_name: column for col_name, column in category.items()}
wide.update({
    "agebuckets_occupation": tf.feature_column.crossed_column([category["age_buckets"], category['occupation']], 10*21),
    "gender_occupation": tf.feature_column.crossed_column([category["gender"], category['occupation']], 2*21)
})
# one-hot encode the sparse columns
wide = {
    colname : tf.feature_column.indicator_column(col)
          for colname, col in wide.items()
}

In [42]:
wide.keys()

dict_keys(['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'occupation', 'gender', 'age_buckets', 'agebuckets_occupation', 'gender_occupation'])

In [43]:
deep = {
    f"embed_{colname}": tf.feature_column.embedding_column(col, 10) for colname, col in category.items()
}
deep["age"] = numeric["age"]

In [44]:
deep.keys()

dict_keys(['embed_unknown', 'embed_Action', 'embed_Adventure', 'embed_Animation', 'embed_Children', 'embed_Comedy', 'embed_Crime', 'embed_Documentary', 'embed_Drama', 'embed_Fantasy', 'embed_Film-Noir', 'embed_Horror', 'embed_Musical', 'embed_Mystery', 'embed_Romance', 'embed_Sci-Fi', 'embed_Thriller', 'embed_War', 'embed_Western', 'embed_occupation', 'embed_gender', 'embed_age_buckets', 'age'])

# Get the model

In [45]:
DNN_HIDDEN_UNITS = '64,32'

In [46]:
# Build a wide-and-deep model.
def wide_and_deep_classifier(inputs, wide_columns, deep_columns, dnn_hidden_units):
    deep = tf.keras.layers.DenseFeatures(deep_columns, name='deep_inputs')(inputs)
    layers = [int(x) for x in dnn_hidden_units.split(',')]
    for layerno, numnodes in enumerate(layers):
        deep = tf.keras.layers.Dense(numnodes, activation='relu', name='dnn_{}'.format(layerno+1))(deep)        
    wide = tf.keras.layers.DenseFeatures(wide_columns, name='wide_inputs')(inputs)
    both = tf.keras.layers.concatenate([deep, wide], name='both')
    output = tf.keras.layers.Dense(1, activation='sigmoid', name='pred')(both)
    model = tf.keras.Model(inputs, output)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [47]:
model = wide_and_deep_classifier(
    inputs,
    wide_columns = wide.values(),
    deep_columns = deep.values(),
    dnn_hidden_units = DNN_HIDDEN_UNITS)

In [48]:
history = model.fit(x=X_train,
                    y=y_train,
                    validation_data = (X_test, y_test),
                    epochs=50, 
                    steps_per_epoch=20
                   )

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).