In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split


In [22]:
from collections import defaultdict
import csv

Preparing the data

In [2]:
# importing the data
item_train = pd.read_csv('content_item_train.csv')
user_train = pd.read_csv('content_user_train.csv')
y_train = pd.read_csv('content_y_train.csv')

num_user_features = user_train.shape[1] - 3  # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
print(f"Number of training vectors: {len(item_train)}")



Number of training vectors: 50883


In [3]:
print("num_user_features", num_user_features)

num_user_features 14


In [4]:
# scale training data
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

scalarItem = StandardScaler()
scalarItem.fit(item_train)
item_train = scalarItem.transform(item_train)

scalarUser = StandardScaler()
scalarUser.fit(user_train)
user_train = scalarUser.transform(user_train)

# Convert y_train to a NumPy array and reshape it
y_train_array = y_train.values.reshape(-1, 1)


scalarTarget = MinMaxScaler((-1,1))
scalarTarget.fit(y_train_array)
y_train= scalarTarget.transform(y_train_array)


print(np.allclose(item_train_unscaled, scalarItem.inverse_transform(item_train)))
print(np.allclose(user_train_unscaled, scalarUser.inverse_transform(user_train)))

True
True


In [5]:
item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (40706, 17)
movie/item test data shape: (10177, 17)


In [6]:
print("Shape of user_train slice:", user_train[:, u_s:].shape)
print("Shape of item_train slice:", item_train[:, i_s:].shape)

Shape of user_train slice: (40706, 14)
Shape of item_train slice: (40706, 16)


In [7]:
from tensorflow.keras import layers

num_outputs = 32

tf.random.set_seed(1)

class L2NormalizationLayer(layers.Layer):
    def call(self, inputs):
        return tf.linalg.l2_normalize(inputs, axis=1)

user_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(256,activation='relu'),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dense(num_outputs),
])

item_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(256,activation='relu'),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dense(num_outputs),
])

# create the user input and point to the base network

input_user = tf.keras.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = L2NormalizationLayer()(vu)

# create the item input and point to the base network

input_item = tf.keras.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = L2NormalizationLayer()(vm)

# compute the dot product of the two vectors vu and vm

output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model

model = tf.keras.Model(inputs=[input_user, input_item], outputs=output)

model.summary()





In [8]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.1)
model.compile(optimizer=opt,
              loss=cost_fn)

In [9]:
tf.random.set_seed(1)
model.fit([user_train[:, u_s:], item_train[:, i_s:]], y_train, epochs=50)

Epoch 1/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1316
Epoch 2/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1170
Epoch 3/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1135
Epoch 4/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1105
Epoch 5/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1078
Epoch 6/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1051
Epoch 7/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1011
Epoch 8/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0983
Epoch 9/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0960
Epoch 10/50
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x2b1ab214950>

In [10]:
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], y_test)

[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 850us/step - loss: 0.0861


0.08375869691371918

In [11]:
new_user_id = 5000
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_rating_count = 3

user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller]])

In [12]:

def gen_user_vecs(user_vec, num_items):
    """ given a user vector return:
        user predict maxtrix to match the size of item_vecs """
    user_vecs = np.tile(user_vec, (num_items, 1))
    return user_vecs

def print_pred_movies(y_p, item, movie_dict, maxcount=10):
    """ print results of prediction of a new user. inputs are expected to be in
        sorted order, unscaled. """
    count = 0
    disp = [["y_p", "movie id", "rating ave", "title", "genres"]]

    for i in range(0, y_p.shape[0]):
        if count == maxcount:
            break
        count += 1
        movie_id = item[i, 0].astype(int)
        disp.append([np.around(y_p[i, 0], 1), item[i, 0].astype(int), np.around(item[i, 2].astype(float), 1),
                     movie_dict[movie_id]['title'], movie_dict[movie_id]['genres']])

    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
    return table

In [14]:
item_vecs = pd.read_csv("content_item_vecs.csv")

In [23]:
movie_dict = defaultdict(dict)
count = 0
#    with open('./data/movies.csv', newline='') as csvfile:
with open('content_movie_list.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for line in reader:
        if count == 0:
            count += 1  #skip header
            #print(line) print
        else:
            count += 1
            movie_id = int(line[0])
            movie_dict[movie_id]["title"] = line[1]
            movie_dict[movie_id]["genres"] = line[2]

In [24]:
# generate and replicate the user vector to match the number movies in the data set.
user_vecs = gen_user_vecs(user_vec,len(item_vecs))

# scale our user and item vectors
suser_vecs = scalarUser.transform(user_vecs)
sitem_vecs = scalarItem.transform(item_vecs)

# make a prediction
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

# unscale y prediction 
y_pu = scalarTarget.inverse_transform(y_p)

# sort the results, highest prediction first
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display

print_pred_movies(sorted_ypu, sorted_items, movie_dict, maxcount = 10)



ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- 0.11
- 2.84375
- 2001
- 4054
Feature names seen at fit time, yet now missing:
- 1.2
- 2003
- 3.9618320610687023
- 6874
