In [48]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import feature_column
from tensorflow.keras.layers import Embedding, Input, Dot, Dense, Flatten, Multiply, Concatenate, DenseFeatures
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adagrad, Adam, SGD, RMSprop
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
import datetime

# Preprocess data

In [2]:
def preprocess_data(data_path):
    header_data = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(data_path, sep='\t', names=header_data)
    data = data.drop("timestamp", axis=1).astype(int)
    
    num_items = data["item_id"].max()
    num_users = data["user_id"].max()
    
    y = data["rating"]
    y = np.where(y==5, 1, 0)

    X = data.drop("rating", axis=1)
    # Minus 1 so the index starts from 0
    X = X-1
    X = X.astype('category')
    
    return X, y, num_items, num_users

In [37]:
# Read data
# User data
header_user = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
data_user = pd.read_csv('data/u.user', sep='|', names=header_user)
data_user = data_user.drop(['zip_code'], axis=1)

# Item data
header_item = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
        'Thriller', 'War', 'Western']
data_item = pd.read_csv('data/u.item', sep='|', names=header_item, encoding = "ISO-8859-1")
data_item = data_item.drop(['release_date', 'title', 'video_release_date', 'IMDb_URL'], axis=1)

# User Item interaction data
X_train, y_train, num_items, num_users = preprocess_data("data/ub.base")
X_test , y_test, _, _ = preprocess_data("data/ub.test")


In [38]:
# Data merge and label separation
X_train = X_train.merge(data_user, left_on="user_id", right_on="user_id", suffixes=(False, False))
X_train = X_train.merge(data_item, left_on="item_id", right_on="item_id")
X_test = X_test.merge(data_user, left_on="user_id", right_on="user_id")
X_test = X_test.merge(data_item, left_on="item_id", right_on="item_id")

In [65]:
X_train.head()

Unnamed: 0,user_id,item_id,age,gender,occupation,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,9,24,M,technician,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,9,42,M,executive,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,48,9,45,M,administrator,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,58,9,27,M,programmer,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,62,9,27,F,administrator,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
X_train["occupation"].unique()

array(['technician', 'executive', 'administrator', 'programmer',
       'marketing', 'student', 'artist', 'engineer', 'librarian',
       'educator', 'other', 'scientist', 'homemaker', 'salesman',
       'healthcare', 'entertainment', 'retired', 'writer', 'none',
       'lawyer', 'doctor'], dtype=object)

In [4]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(90570, 2) (90570,) (9430, 2) (9430,)


# Get model

In [83]:
real = {
    'age' : tf.feature_column.numeric_column(["age"]) 
}

sparse = {
        categorical_feature: tf.feature_column.categorical_column_with_identity(categorical_feature,num_buckets=2)
        for categorical_feature in ['unknown','Action', 'Adventure', 'Animation', 'Children', 
                                    'Comedy', 'Crime','Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                                    'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller', 
                                    'War', 'Western'] 
}
sparse.update({
        'occupation': tf.feature_column.categorical_column_with_vocabulary_list(
            'occupation', vocabulary_list=['technician', 'executive', 'administrator', 'programmer',
                                           'marketing', 'student', 'artist', 'engineer', 'librarian',
                                           'educator', 'other', 'scientist', 'homemaker', 'salesman',
                                           'healthcare', 'entertainment', 'retired', 'writer', 'none',
                                           'lawyer', 'doctor']),
        'gender': tf.feature_column.categorical_column_with_vocabulary_list(
            'gender', vocabulary_list=['M', 'F'])})

In [67]:
sparse

{'unknown': VocabularyListCategoricalColumn(key='unknown', vocabulary_list=(1, 0), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 'Action': VocabularyListCategoricalColumn(key='Action', vocabulary_list=(1, 0), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 'Adventure': VocabularyListCategoricalColumn(key='Adventure', vocabulary_list=(1, 0), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 'Animation': VocabularyListCategoricalColumn(key='Animation', vocabulary_list=(1, 0), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 'Children': VocabularyListCategoricalColumn(key='Children', vocabulary_list=(1, 0), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 'Comedy': VocabularyListCategoricalColumn(key='Comedy', vocabulary_list=(1, 0), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 'Crime': VocabularyListCategoricalColumn(key='Crime', vocabulary_list=(1, 0), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 'Documentary': VocabularyListCategoric