In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import tensorflow as tf
from tensorflow import keras
import tensorflow_recommenders as tfrs
import os
import pyinputplus as pyip

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import logging
logger = tf.get_logger()
logger.setLevel(logging.ERROR)

2024-02-19 19:18:05.911289: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-19 19:18:06.034038: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-19 19:18:06.034107: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-19 19:18:06.039713: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-19 19:18:06.079801: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-19 19:18:06.081223: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [8]:
df = pd.read_csv('/home/ubuntu/recommender_system/data/processed/clean_data.csv')
for col in df.columns:
    if col not in ['rating','Age']:
        df[col] = df[col].astype(str)
    else:
        df[col] = df[col].astype(int)

In [3]:
# Convert df to dictionary
df_dict = {name: np.array(val) for name, val in df.items()}

# Convert dictionary to tensor slices
data = tf.data.Dataset.from_tensor_slices(df_dict)

In [9]:
# Get a dictionary of unique values

vocabularies = {}

for feature in df_dict:
    if feature != 'rating':
        vocab = np.unique(df_dict[feature])
        vocabularies[feature] = vocab

In [5]:
vocabularies

{'user': array(['1', '1002', '1003', ..., '991', '995', '999'], dtype=object),
 'Book-Title': array(["'48", "'N Sync", "'Salem's Lot", ..., 'wet sand, raven tracks',
        'Â¿QuiÃ©n se ha llevado mi queso?', 'Ã?Â?thique en toc'],
       dtype=object),
 'Book-Author': array(['A. A. Attanasio', 'A. A. Milne', 'A. Bry', ...,
        'jr., Richard Herman', 'padriac colum', 'stephen R Donaldson'],
       dtype=object),
 'Age': array([ 1,  2,  4,  9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
        25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
        59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 71, 72, 75, 79, 83, 90])}

In [6]:
# converting book-title to a tensorflow dataset
book_titles = tf.data.Dataset.from_tensor_slices(vocabularies['Book-Title'])
book_authors = df['Book-Author'].unique()
user_age = df['Age'].values

In [7]:
# Shuffl and split the dataset into train, validation and test
tf.random.set_seed(42)

shuffled = data.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(46_797)
validation = shuffled.skip(46_797).take(9_359)
test = shuffled.skip(56_156).take(6_240)

### Model Building

In [23]:
class UserModel(tf.keras.Model):
  
    def __init__(self):
        super().__init__()
        
        max_tokens = 10_000
        
        # 1. User ID
        self.user_id_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=vocabularies['user'],
                mask_token=None),
            tf.keras.layers.Embedding(len(vocabularies['user'])+1, 32)
        ])
             
        
        #2. Book Authors
        self.author_vectorizer = keras.layers.TextVectorization(max_tokens=max_tokens)
        self.author_vectorizer.adapt(book_authors)
        self.author_text_embedding = keras.Sequential([
            self.author_vectorizer,
            keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            keras.layers.GlobalAveragePooling1D()
        ])
        
        self.author_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=vocabularies['Book-Author'],
                mask_token=None),
            tf.keras.layers.Embedding(len(vocabularies['Book-Author'])+1, 32)
        ])
         
        
        # 3. User age
        self.normalized_age = keras.layers.Normalization()
        self.normalized_age.adapt(vocabularies['Age'].reshape(-1,1))
        
    # call method passes out input features to the embeddings above, excutes them and returns the output
    def call(self, inputs):
        
        return tf.concat([
            self.user_id_embedding(inputs['user']),
            self.author_embedding(inputs['Book-Author']),
            self.author_text_embedding(inputs['Book-Author']),
            tf.reshape(self.normalized_age(inputs['Age']), (-1,1))
        ], axis=1) 

In [24]:
class TitleModel(tf.keras.Model):
    
    def __init__(self,):
        super().__init__()
        
        max_tokens = 10_000
        
        #1. Book-Titles
        self.book_vectorizer = keras.layers.TextVectorization(max_tokens=max_tokens)
        self.book_vectorizer.adapt(book_titles)
        self.book_text_embedding = keras.Sequential([
            self.book_vectorizer,
            keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            keras.layers.GlobalAveragePooling1D()
        ])
        
        self.book_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=vocabularies['Book-Title'],
                mask_token=None),
            tf.keras.layers.Embedding(len(vocabularies['Book-Title'])+1, 32)
        ])
        
        
    # call method passes category to the embedding layer above, executes it and returns the output embeddings
    def call(self, inputs):
        
        return tf.concat([
            self.book_embedding(inputs),
            self.book_text_embedding(inputs),
        ], axis=1)

In [25]:
tf.random.set_seed(7)
np.random.seed(7)


class FullModel(tfrs.models.Model):
    
    def __init__(self,):
        super().__init__()
        
        # handles how much weight we want to assign to the rating and retrieval task when computing loss
        self.rating_weight = 0.5
        self.retrieval_weight = 0.5
        
        #User model
        self.user_model = tf.keras.Sequential([
            UserModel(),
            tf.keras.layers.Dense(32),
        ])
        
        # Category model
        self.title_model = tf.keras.Sequential([
            TitleModel(),
            tf.keras.layers.Dense(32)
        ])
        
        
        # Deep & Cross layer
        self._cross_layer = tfrs.layers.dcn.Cross(projection_dim=None, kernel_initializer='he_normal')
        
        # Dense layers with l2 regularization to prevent overfitting
        self._deep_layers = [
            keras.layers.Dense(512, activation='relu', kernel_regularizer='l2'),
            keras.layers.Dense(256, activation='relu', kernel_regularizer='l2'),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(128, activation='relu', kernel_regularizer='l2'),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(64, activation='relu', kernel_regularizer='l2'),
            keras.layers.Dense(32, activation='relu', kernel_regularizer='l2'),
        ]
        
        # output layer
        self._logit_layer = keras.layers.Dense(1)
    
        # Multi-task Retrieval & Ranking
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=book_titles.batch(128).map(self.title_model)
            )
        )
       
            
    def call(self, features) -> tf.Tensor:
        user_embeddings = self.user_model({
            'user': features['user'],
            'Book-Author': features['Book-Author'],
            'Age': features['Age'],
        })
        
        
        title_embeddings = self.title_model(
            features['Book-Title']
        )
        
        x = self._cross_layer(tf.concat([
                user_embeddings,
                title_embeddings], axis=1))
        
        for layer in self._deep_layers.layers:
            x = layer(x)
            
        
        return (
            user_embeddings, 
            title_embeddings,
            self._logit_layer(x)
        )
        
        
        

    def compute_loss(self, features, training=False) -> tf.Tensor:
        user_embeddings, title_embeddings, rating_predictions = self.call(features)
        # Retrieval loss
        retrieval_loss = self.retrieval_task(user_embeddings, title_embeddings)
        # Rating loss
        rating_loss = self.rating_task(
            labels=features['rating'],
            predictions=rating_predictions
        )
        
        # Combine two losses with hyper-parameters (to be tuned)
        return (self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss)

### Training and evaluating the Model

In [26]:
# Batch and cache datasets to improve performance
cached_train = train.shuffle(143_000).batch(2000).cache()
cached_validation = validation.shuffle(30_000).batch(2000).cache()
cached_test = test.batch(1000).cache()

In [27]:
# Train and fit the FullModel 

keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

# calling and training our model

model = FullModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(cached_train, validation_data=cached_validation, epochs=20)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f1185f3e610>

In [30]:
# Save model
filepath = "../model/model_weights/"
model.save_weights(filepath=filepath, save_format="tf")

In [12]:
def build_model():
    """Instantiates a model and compiles it."""
    keras.backend.clear_session()
    tf.random.set_seed(42)
    np.random.seed(42)

    # instantiating the model
    model = FullModel()
    model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

    return model

In [13]:
# loading the saved model weights
from tensorflow import keras

filepath = "../model/model_weights/"

# loading the model
model = build_model()

# load the weights back to the new model
model.load_weights(filepath)

NameError: name 'FullModel' is not defined

### Evaluating our model on our test dataset

In [33]:
scores = model.evaluate(cached_test, return_dict=True, verbose=False)

In [34]:
scores

{'root_mean_squared_error': 1.7476991415023804,
 'factorized_top_k/top_1_categorical_accuracy': 0.27192696928977966,
 'factorized_top_k/top_5_categorical_accuracy': 0.6801434755325317,
 'factorized_top_k/top_10_categorical_accuracy': 0.7714378833770752,
 'factorized_top_k/top_50_categorical_accuracy': 0.8751222491264343,
 'factorized_top_k/top_100_categorical_accuracy': 0.8937072157859802,
 'loss': 83.88762664794922,
 'regularization_loss': 6.503206729888916,
 'total_loss': 90.39083099365234}

### Create a function that will recommend Books for a user based on their User ID, Age, and Specific Author

In [35]:
# Create input validation functions
def validate_number(value):
    try:
        number = int(value)
        if number in range(0,100):
            return number
        else:
            raise ValueError("Invalid Age")
    except ValueError:
        raise ValueError("Invalid Age")


def validate_author(value):
    if value in vocabularies['Book-Author']:
        return value
    else:
        raise ValueError("Invalid Author Name")
    
    
def validate_user(value):
    if value in vocabularies['user']:
        return value
    else:
        raise ValueError("Invalid User-ID")


In [36]:
# Create recommendation functions
def Recommend():
    input_user = pyip.inputCustom(validate_user, prompt="Enter your User-ID: \n")
    input_author = pyip.inputCustom(validate_author, prompt="Enter an Author name: \n")
    input_age = pyip.inputCustom(validate_number, prompt="Enter your Age: \n")
    top_k = pyip.inputNum("Number of recommendations: \n")
        
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=top_k)
    index.index_from_dataset(
    tf.data.Dataset.zip((book_titles.batch(1000), book_titles.batch(1000).map(model.title_model)))
    )
    
    raw_input = {
        'Age': input_age,
        'Book-Author': input_author,
        'user': input_user
    }
    
    input_dict = {key: tf.constant(np.array([value])) for key, value in raw_input.items()}
    
    _, titles = index(input_dict)
    
    test_rating = {}
    for book in titles.numpy()[0]:
        raw_input['Book-Title'] = book

        input_dict = {key: tf.constant(np.array([value])) for key, value in raw_input.items()}

        trained_movie_embeddings, trained_user_embeddings, predicted_rating = model(input_dict)
        test_rating[book] = predicted_rating


    sorted_dict = sorted(test_rating.items(), key=lambda x: x[1], reverse=True)
    

    print(f"Top {top_k} recommendations:\n")
    for i, (k, v) in enumerate(sorted_dict):
        print(' '*2,'-',k,)



In [37]:
Recommend()

Enter your User-ID: 
Invalid User-ID
Enter your User-ID: 
Invalid User-ID
Enter your User-ID: 
Invalid User-ID
Enter your User-ID: 
Enter an Author name: 
Enter your Age: 
Number of recommendations: 

Getting your 5 book recommendations. Please be patient
   - b"The Vintage Bradbury: Ray Bradbury's Own Selection of His Best Stories"
   - b'Fahrenheit 451 and Related Readings'
   - b'Martian Chronicles'
   - b'Fahrenheit 451 - T.D. -'
   - b'The Halloween Tree'
