Importing Libraries

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import wandb
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_recall_curve, average_precision_score

ModuleNotFoundError: No module named 'tensorflow'

Initializing wandb.ai workspace

In [None]:
# Hyperparameters and run metadata that wandb stores with the run; later
# cells read learning_rate, epochs and batch_size back from wandb.config.
run_config = {
    "learning_rate": 0.01,
    "architecture": "RBM-Stacked Autoencoder",
    "dataset": "MovieLens 100K",
    "epochs": 10,
    "batch_size": 64,
}

# Open a new wandb run in the capstone project so this notebook is tracked.
wandb.init(project="IBM Capstone Project", config=run_config)

Importing Data

In [None]:
# Show the working directory, since the CSV paths below are relative to it.
working_directory = os.getcwd()
print(working_directory)

# All four CSV files are read with the same options.
csv_options = {'encoding': 'utf-8'}
ratings = pd.read_csv('ratings.csv', **csv_options)
movies = pd.read_csv('movies.csv', **csv_options)
genome_scores = pd.read_csv('genome-scores.csv', **csv_options)
genome_tags = pd.read_csv('genome-tags.csv', **csv_options)

# Report the shape and the first rows of every table as a quick sanity check.
print(f"Ratings data: {ratings.shape}\n", ratings.head())
print(f"Movies data: {movies.shape}\n", movies.head())
print(f"Genome scores data: {genome_scores.shape}\n", genome_scores.head())
print(f"Genome tags data: {genome_tags.shape}\n", genome_tags.head())

Preprocessing Data

In [None]:
# Merge movies and ratings
data = pd.merge(ratings, movies, on='movieId')

# Create user-movie rating matrix (rows: userId, columns: movieId);
# unrated movies are filled with 0.
rating_matrix = data.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Get number of users and movies
n_users = data.userId.nunique()
n_movies = data.movieId.nunique()

# Create movie-tag relevance matrix (rows: movieId, columns: tagId)
movie_tag_matrix = genome_scores.pivot(index='movieId', columns='tagId', values='relevance').fillna(0)

# Combine user-movie rating matrix with movie-tag relevance matrix.
# The resulting columns are the movie columns of rating_matrix followed by the
# tag columns of movie_tag_matrix.
# NOTE(review): axis=1 concatenation aligns rows on the index, but
# rating_matrix is indexed by userId and movie_tag_matrix by movieId. With
# join='inner' only rows whose userId happens to equal some movieId survive,
# and each row pairs a user's ratings with an unrelated movie's tag vector.
# Confirm this is the intended feature construction.
combined_matrix = pd.concat([rating_matrix, movie_tag_matrix], axis=1, join='inner').fillna(0)

# Convert to numpy array
training_data = combined_matrix.values

# Update number of visible units (one per column of the combined matrix)
n_visible = training_data.shape[1]

# Split data into training and test sets.
# The original split referenced `data_matrix`, which is never defined and
# raised a NameError; the matrix built above is `training_data`.
train_data, test_data = train_test_split(training_data, test_size=0.2, random_state=42)

Configuring Variables

In [None]:
# The model is built as a TF1-style static graph (placeholder + Session).
# TensorFlow 2 runs eagerly by default and no longer exposes tf.placeholder or
# tf.random_normal at the top level, so switch to graph mode when needed and
# use the tf.compat.v1 shim, which exists in late TF1 releases and in TF2.
if tf.executing_eagerly():
    tf.compat.v1.disable_eager_execution()

n_hidden = 500  # Number of hidden units

# Placeholder for input data: any number of rows, n_visible columns
X = tf.compat.v1.placeholder(tf.float32, [None, n_visible])

# Weights and biases: W starts as small Gaussian noise, both biases start at zero
W = tf.Variable(tf.compat.v1.random_normal([n_visible, n_hidden], mean=0.0, stddev=0.01))
b_visible = tf.Variable(tf.zeros([n_visible]))
b_hidden = tf.Variable(tf.zeros([n_hidden]))

Defining Core Functions 

In [None]:
# Activation functions
def sample_hidden(X):
    """Return hidden-unit activations for a batch of visible vectors.

    Despite the name, no random sampling is performed: the result is the
    sigmoid of ``X @ W + b_hidden``, i.e. values in (0, 1).

    Args:
        X: 2-D tensor whose second dimension matches the first dimension of W.

    Returns:
        Tensor of sigmoid activations with one column per hidden unit.
    """
    hidden_pre_activation = tf.matmul(X, W) + b_hidden
    return tf.nn.sigmoid(hidden_pre_activation)

def sample_visible(H):
    """Return visible-unit activations reconstructed from hidden activations.

    Uses the transpose of the shared weight matrix W (tied weights). Despite
    the name, no random sampling is performed: the result is the sigmoid of
    ``H @ W^T + b_visible``, i.e. values in (0, 1).

    Args:
        H: 2-D tensor whose second dimension matches the second dimension of W.

    Returns:
        Tensor of sigmoid activations with one column per visible unit.
    """
    weights_transposed = tf.transpose(W)
    visible_pre_activation = tf.matmul(H, weights_transposed) + b_visible
    return tf.nn.sigmoid(visible_pre_activation)

# Gibbs sampling
def gibbs_step(sample_k):
    """Run one visible -> hidden -> visible -> hidden pass.

    Args:
        sample_k: Batch of visible vectors to start from.

    Returns:
        Tuple ``(v_sample, h_sample)``: the reconstructed visible activations
        and the hidden activations computed from that reconstruction.
    """
    # Up-pass from the input, then down-pass to reconstruct the visible layer.
    reconstructed_visible = sample_visible(sample_hidden(sample_k))
    # Second up-pass, this time driven by the reconstruction.
    hidden_from_reconstruction = sample_hidden(reconstructed_visible)
    return reconstructed_visible, hidden_from_reconstruction

# Contrastive Divergence algorithm
def CD_k(X, k=1):
    """Chain ``k`` Gibbs steps starting from the input batch.

    Each step feeds the previous step's visible reconstruction back into
    ``gibbs_step``.

    Args:
        X: Batch of visible vectors that starts the chain.
        k: Number of Gibbs steps to run; must be at least 1.

    Returns:
        Tuple ``(X, h_sample, v_sample)``: the untouched input together with
        the hidden and visible activations produced by the last step.

    Raises:
        ValueError: If ``k`` is smaller than 1. Without this check the loop
            body never runs and the return statement fails on unbound locals.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    sample = X
    for _ in range(k):
        v_sample, h_sample = gibbs_step(sample)
        sample = v_sample
    return X, h_sample, v_sample

Metric Collection Function

In [None]:
def compute_metrics(true_ratings, predicted_ratings, k=10, relevance_threshold=3.5):
    """Compute regression and ranking-style metrics for predicted ratings.

    Args:
        true_ratings: Array of ground-truth values.
        predicted_ratings: Array of predicted scores with the same number of
            elements as ``true_ratings``.
        k: Currently unused; kept so existing callers keep working.
        relevance_threshold: Ground-truth values at or above this threshold
            count as relevant (positive class) for the precision/recall
            metrics. The default of 3.5 is a judgement call for a 0.5-5
            rating scale; adjust it to the data.

    Returns:
        Tuple ``(rmse, mae, precision, recall, map_score)`` where
        ``precision`` and ``recall`` are the arrays returned by
        ``precision_recall_curve`` and ``map_score`` is the average precision.
    """
    # Flatten arrays
    true_ratings_flat = np.asarray(true_ratings).flatten()
    predicted_ratings_flat = np.asarray(predicted_ratings).flatten()

    # Calculate RMSE and MAE on the raw values
    rmse = np.sqrt(mean_squared_error(true_ratings_flat, predicted_ratings_flat))
    mae = mean_absolute_error(true_ratings_flat, predicted_ratings_flat)

    # Precision@k, Recall@k, and NDCG are more complex in implementation.
    # Here we simplify by using precision and recall at different thresholds.
    # scikit-learn's precision/recall functions need binary ground truth and
    # raise ValueError on continuous ratings, so binarise them first.
    relevant = (true_ratings_flat >= relevance_threshold).astype(int)

    # Compute Precision-Recall curve
    precision, recall, _ = precision_recall_curve(relevant, predicted_ratings_flat)

    # Compute average precision score
    map_score = average_precision_score(relevant, predicted_ratings_flat)

    return rmse, mae, precision, recall, map_score

def log_metrics_to_wandb(true_ratings, predicted_ratings, epoch):
    """Compute evaluation metrics and log them to the active wandb run.

    Logs RMSE, MAE and MAP scalars, a precision-recall line chart and a
    histogram of the predicted values in a single ``wandb.log`` call.

    Args:
        true_ratings: Array of ground-truth values.
        predicted_ratings: NumPy array of predicted scores.
        epoch: Epoch number stored alongside the metrics.
    """
    rmse, mae, precision, recall, map_score = compute_metrics(true_ratings, predicted_ratings)

    # Build the precision-recall chart from the curve that was already
    # computed. The curve has one point per distinct score, so thin it to
    # roughly max_curve_points rows to keep the logged table small.
    max_curve_points = 1000
    stride = max(1, len(recall) // max_curve_points)
    pr_table = wandb.Table(
        data=[[float(r), float(p)] for r, p in zip(recall[::stride], precision[::stride])],
        columns=["recall", "precision"],
    )

    wandb.log({
        "epoch": epoch,
        "RMSE": rmse,
        "MAE": mae,
        "MAP": map_score,
        "precision-recall": wandb.plot.line(pr_table, "recall", "precision", title="Precision-Recall"),
        "predicted_ratings_histogram": wandb.Histogram(predicted_ratings.flatten()),
    })

Initializing and Running the Training Loop

In [None]:
# Input for reconstruction: one Gibbs step (k=1) from the placeholder
X_sample, h_sample, v_sample = CD_k(X)

# Loss function: mean squared error between the input and its reconstruction
loss = tf.reduce_mean(tf.square(X - v_sample))

# Optimizer. The parameters are updated by minimising the reconstruction
# loss with Adam. tf.compat.v1 is used because the TF1 optimizer, Session and
# initializer symbols are not available at the top level in TensorFlow 2.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=wandb.config.learning_rate)
train_op = optimizer.minimize(loss)

# Training
n_epochs = wandb.config.epochs
batch_size = wandb.config.batch_size
n_train_rows = len(train_data)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    for epoch in range(n_epochs):
        loss_sum = 0.0
        # Step through the data in strides of batch_size so that the final,
        # possibly shorter, batch is trained on as well.
        for start in range(0, n_train_rows, batch_size):
            batch = train_data[start:start + batch_size]
            _, batch_loss = sess.run([train_op, loss], feed_dict={X: batch})
            # Weight by batch length so a short last batch does not skew the mean.
            loss_sum += batch_loss * len(batch)
        avg_loss = loss_sum / max(n_train_rows, 1)
        print("Epoch:", '%04d' % (epoch + 1), "loss=", "{:.9f}".format(avg_loss))
        wandb.log({"training_loss": avg_loss})

    # Get the trained parameters before the session closes. The biases are
    # captured alongside the weights because all three are needed to rebuild
    # the trained model in a new session.
    trained_weights, trained_b_visible, trained_b_hidden = sess.run([W, b_visible, b_hidden])

Generating training reports

In [None]:
# Evaluate on test set
with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(W.assign(trained_weights))
    # NOTE(review): only W is restored here; the initializer above resets
    # b_visible and b_hidden to zeros. Restore the trained biases as well if
    # they are available.
    # Feed the test rows through the float32 placeholder. Passing the float64
    # NumPy array straight into gibbs_step clashes with the float32 variables
    # in tf.matmul and would embed the whole test set in the graph as a constant.
    reconstruction, _ = gibbs_step(X)
    predicted_ratings = sess.run(reconstruction, feed_dict={X: test_data})

# Log metrics for the test set
log_metrics_to_wandb(test_data, predicted_ratings, epoch=n_epochs)

# Close the run only after the last metrics have been logged; wandb.finish()
# ends the run, so it must come after the final wandb.log call.
wandb.finish()

Function to get movie recommendations

In [None]:
def get_movie_recommendations(new_user_movies, top_n=10):
    """Recommend movie titles for a new user from a few rated movies.

    The ratings are written into a visible vector, passed through one Gibbs
    step, and the highest-scoring unrated movie units are returned as titles.

    Assumes the first ``len(rating_matrix.columns)`` visible units are the
    movie columns of ``rating_matrix`` in the same order, which is how the
    preprocessing cell concatenates the rating and tag matrices.

    Args:
        new_user_movies: Iterable of ``(title, rating)`` pairs; titles must
            match the ``title`` column of ``movies`` exactly.
        top_n: Maximum number of titles to return.

    Returns:
        NumPy array of recommended titles, best first. Movies the user has
        already rated are never included.

    Raises:
        KeyError: If a title is not present in ``movies`` or the movie has no
            column in ``rating_matrix``.
    """
    # Column labels of rating_matrix are movieIds. A movieId is a label, not a
    # position, so every id is translated to its column position before it is
    # used to index the visible vector.
    movie_columns = rating_matrix.columns
    n_movie_units = len(movie_columns)

    # float32 to match the dtype of the placeholder and the weights.
    user_vector = np.zeros((1, n_visible), dtype=np.float32)
    rated_positions = []

    for movie_title, rating in new_user_movies:
        # Find the movie ID for the given title
        matching_ids = movies.loc[movies['title'] == movie_title, 'movieId'].values
        if len(matching_ids) == 0:
            raise KeyError(f"Unknown movie title: {movie_title!r}")
        position = movie_columns.get_indexer([matching_ids[0]])[0]
        if position < 0:
            raise KeyError(f"Movie has no column in the rating matrix: {movie_title!r}")
        user_vector[0, position] = rating
        rated_positions.append(position)

    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        sess.run(W.assign(trained_weights))
        # NOTE(review): only W is restored here; the initializer above resets
        # b_visible and b_hidden to zeros.
        reconstruction, _ = gibbs_step(X)
        v_sample = sess.run(reconstruction, feed_dict={X: user_vector})

    # Rank only the movie units; the remaining visible units are tag features.
    movie_scores = v_sample[0, :n_movie_units].astype(float)

    # Exclude already-rated movies before taking the top N, so filtering
    # cannot shrink the result below top_n.
    movie_scores[rated_positions] = -np.inf
    n_candidates = n_movie_units - len(set(rated_positions))
    n_wanted = max(0, min(top_n, n_candidates))
    ranked_positions = np.argsort(movie_scores)[::-1][:n_wanted]

    # Translate column positions back to movieIds, then to titles, keeping the
    # ranking order (a boolean isin() filter would return catalogue order).
    recommended_ids = movie_columns[ranked_positions]
    titles_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    return titles_by_id.reindex(recommended_ids).dropna().values

In [None]:
# Example usage: a new user who has rated two movies
new_user_movies = [
    ("Toy Story (1995)", 5.0),
    ("Jumanji (1995)", 4.0),
]
recommended_titles = get_movie_recommendations(new_user_movies)
print(recommended_titles)