# Reinforcement Learning for Movie Recommendations with Learned Embeddings

This notebook implements a content-based movie recommender system using reinforcement learning (multi-armed bandits), building on the ideas introduced in the paper: "Reinforcement Learning Approaches to Movies Recommendation" by Carpentier et al (2015).

This modernises the original approach by replacing PCA-engineered features with deep learned embeddings.

We train a neural autoencoder to generate embeddings of movies based on their genres and release year. These embeddings are used in a contextual bandit setting to simulate personalised movie recommendations for new users.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go


import plotly.io as pio
pio.renderers.default = 'colab'

import os
import random
import math
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from scipy.stats import spearmanr
import datetime

%matplotlib inline
from multiprocessing import Pool, cpu_count
from functools import partial


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import mean_squared_error


from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, MinMaxScaler
from sklearn.preprocessing import normalize

In [3]:
SELECTED_DATA_DIR = "/content/drive/My Drive/RL-movie-recommender-master/selected-data/"
MOVIES_FILE = "best_movie_ratings_features.csv"
USERS_FILE = "users_ratings.csv"

In [4]:
movies = pd.read_csv(SELECTED_DATA_DIR + MOVIES_FILE, index_col=0)
movies.head()

Unnamed: 0_level_0,aka,genres,year,votes,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The Shawshank Redemption (1994),"['Die Verurteilten (1995)', 'Die Verurteilten ...","['Crime', 'Drama']",[1994],1740070,9.3
The Dark Knight (2008),"['Batman Begins 2 (2005)', 'Batman: The Dark K...","['Action', 'Crime', 'Drama', 'Thriller']",[2008],1724382,9.0
Inception (2010),"['Inception: The IMAX Experience (2010)', ""Oli...","['Action', 'Adventure', 'Sci-Fi', 'Thriller']",[2010],1515276,8.8
Fight Club (1999),"['Fight Club (1999)', 'Fight Club (1999)', 'Fi...",['Drama'],[1999],1389487,8.8
Pulp Fiction (1994),"['Black Mask (1994)', 'Pulp Fiction (1994)', '...","['Crime', 'Drama']",[1994],1362518,8.9


In [5]:
users = pd.read_csv(SELECTED_DATA_DIR + USERS_FILE, index_col=0)
users.head()

Unnamed: 0_level_0,user,rating,link
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Pianist (2002),59957513,9,tt0253474
Schindler's List (1993),59957513,9,tt0108052
Reservoir Dogs (1992),59957513,7,tt0105236
Captain Phillips (2013),59957513,8,tt1535109
Goodfellas (1990),59957513,8,tt0099685


# Learning Dense Movie Representations with an Autoencoder

In [6]:
# Preprocess movie features for autoencoder
# Genre encoding
genre_mlb = MultiLabelBinarizer()
genre_features = genre_mlb.fit_transform(movies['genres'])

# Year encoding
year_encoder = OneHotEncoder(sparse_output=False)
year_features = year_encoder.fit_transform(movies[['year']])

# Final input
import numpy as np
X = np.hstack([genre_features, year_features])

X.shape

(1000, 71)

Feature Vector: 71-Dimensional Encoding of Genre and Year

In [7]:
len(genre_mlb.classes_)

35

In [8]:
len(year_encoder.categories_[0])

36

### Autoencoder Architecture

In [9]:
class MovieAutoencoder(nn.Module):
    def __init__(self, input_dim, embedding_dim=16):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 48),
            nn.ReLU(),
            nn.Linear(48, embedding_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 48),
            nn.ReLU(),
            nn.Linear(48, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z), z

### Train Autoencoder

In [10]:
X_tensor = torch.tensor(X, dtype=torch.float32)
dataset = TensorDataset(X_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

model = MovieAutoencoder(input_dim=X.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(50):
    for batch in dataloader:
        inputs = batch[0]
        outputs, _ = model(inputs)
        loss = criterion(outputs, inputs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 0.2304
Epoch 2, Loss: 0.1616
Epoch 3, Loss: 0.1005
Epoch 4, Loss: 0.0915
Epoch 5, Loss: 0.0910
Epoch 6, Loss: 0.0891
Epoch 7, Loss: 0.0883
Epoch 8, Loss: 0.0835
Epoch 9, Loss: 0.0817
Epoch 10, Loss: 0.0748
Epoch 11, Loss: 0.0639
Epoch 12, Loss: 0.0614
Epoch 13, Loss: 0.0574
Epoch 14, Loss: 0.0567
Epoch 15, Loss: 0.0555
Epoch 16, Loss: 0.0511
Epoch 17, Loss: 0.0447
Epoch 18, Loss: 0.0456
Epoch 19, Loss: 0.0403
Epoch 20, Loss: 0.0406
Epoch 21, Loss: 0.0447
Epoch 22, Loss: 0.0386
Epoch 23, Loss: 0.0361
Epoch 24, Loss: 0.0357
Epoch 25, Loss: 0.0372
Epoch 26, Loss: 0.0342
Epoch 27, Loss: 0.0413
Epoch 28, Loss: 0.0313
Epoch 29, Loss: 0.0311
Epoch 30, Loss: 0.0288
Epoch 31, Loss: 0.0276
Epoch 32, Loss: 0.0218
Epoch 33, Loss: 0.0262
Epoch 34, Loss: 0.0303
Epoch 35, Loss: 0.0320
Epoch 36, Loss: 0.0304
Epoch 37, Loss: 0.0271
Epoch 38, Loss: 0.0247
Epoch 39, Loss: 0.0247
Epoch 40, Loss: 0.0224
Epoch 41, Loss: 0.0254
Epoch 42, Loss: 0.0233
Epoch 43, Loss: 0.0240
Epoch 44, Loss: 0.02

### Save model

In [None]:
torch.save(model, "movie_autoencoder_full.pt")

### Generate embeddings for all movies

In [11]:
with torch.no_grad():
    _, embeddings = model(X_tensor)
    movie_embeddings = {
        title: embeddings[i].numpy()
        for i, title in enumerate(movies.index)
    }

In [12]:
movie_embeddings['The Shawshank Redemption (1994)']

array([ 0.65206486, -2.8543425 , -4.710934  , -0.6585612 , -1.3460212 ,
       -3.3591096 , -7.725502  , -5.72451   ,  3.4856524 , -5.4610915 ,
       -5.7343073 , -5.1769156 , -2.1738262 , -2.5363288 , -2.9946265 ,
        0.91461474], dtype=float32)

In [13]:
for title, vec in list(movie_embeddings.items())[:3]:
    print(f"{title}: L2 norm = {np.linalg.norm(vec):.4f}")

The Shawshank Redemption (1994): L2 norm = 16.1049
The Dark Knight (2008): L2 norm = 22.4713
Inception (2010): L2 norm = 20.8126


### Normalize movie embeddings (L2 norm)

In [14]:
embedding_matrix = np.array(list(movie_embeddings.values()))
embedding_matrix = normalize(embedding_matrix, axis=1)

normalized_movie_embeddings = {
    title: vec for title, vec in zip(movie_embeddings.keys(), embedding_matrix)
}

normalized_movie_embeddings['The Shawshank Redemption (1994)']

array([ 0.04048869, -0.1772348 , -0.29251623, -0.04089207, -0.08357855,
       -0.20857733, -0.4796999 , -0.35545224,  0.21643476, -0.33909577,
       -0.3560606 , -0.32145044, -0.13497949, -0.15748838, -0.18594547,
        0.05679121], dtype=float32)

In [15]:
for title, vec in list(normalized_movie_embeddings.items())[:3]:
    print(f"{title}: L2 norm = {np.linalg.norm(vec):.4f}")

The Shawshank Redemption (1994): L2 norm = 1.0000
The Dark Knight (2008): L2 norm = 1.0000
Inception (2010): L2 norm = 1.0000


In [16]:
normalized_movie_embeddings = pd.DataFrame.from_dict(
    normalized_movie_embeddings, orient='index',
    columns=[f'x{i}' for i in range(16)]
)
normalized_movie_embeddings

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15
The Shawshank Redemption (1994),0.040489,-0.177235,-0.292516,-0.040892,-0.083579,-0.208577,-0.479700,-0.355452,0.216435,-0.339096,-0.356061,-0.321450,-0.134979,-0.157488,-0.185945,0.056791
The Dark Knight (2008),0.265770,-0.279263,-0.260202,-0.006795,-0.204157,-0.153563,-0.418174,-0.331575,0.168750,-0.307356,-0.156308,-0.398280,0.072535,0.062098,-0.344125,-0.003356
Inception (2010),0.368499,-0.221994,-0.015522,0.095429,-0.341390,-0.145162,-0.212840,-0.253383,0.218281,-0.172051,0.184868,-0.340769,0.182579,0.292449,-0.443362,-0.124409
Fight Club (1999),-0.139781,-0.037254,-0.204438,-0.118792,-0.004362,-0.272287,-0.382073,-0.289278,0.290755,-0.362701,-0.373312,-0.354803,-0.246232,-0.168202,-0.116588,0.188397
Pulp Fiction (1994),0.040489,-0.177235,-0.292516,-0.040892,-0.083579,-0.208577,-0.479700,-0.355452,0.216435,-0.339096,-0.356061,-0.321450,-0.134979,-0.157488,-0.185945,0.056791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Scream 2 (1997),0.068635,-0.172656,0.163464,0.008318,-0.038212,-0.216996,-0.430129,-0.355004,0.145510,-0.109902,-0.241735,-0.289554,0.075948,-0.431999,-0.430142,-0.161743
Clueless (1995),0.073121,-0.087989,-0.149799,-0.269656,-0.245461,-0.361670,-0.352939,-0.401788,0.337821,-0.351246,-0.223135,-0.238889,-0.023045,-0.107833,-0.211172,-0.116594
The Lovely Bones (2009),0.238320,-0.302549,-0.105049,-0.002936,-0.155481,-0.173861,-0.536284,-0.208235,0.046686,-0.119763,-0.198429,-0.442071,-0.011685,-0.036567,-0.441288,-0.087901
27 Dresses (2008),0.063186,-0.071296,-0.147562,-0.272428,-0.237035,-0.362062,-0.353137,-0.405446,0.343219,-0.355161,-0.220062,-0.234302,-0.028533,-0.115031,-0.210772,-0.113277


# Content-Based Reinforcement Learning with Learned Movie Embeddings

In [17]:
# --- UTILITY FUNCTIONS ---
def compute_utility(user_features, movie_features, epoch, s):
    return user_features.dot(movie_features) * (1 - math.exp(-epoch / s))

def compute_UCB(epoch, Nt):
    return math.sqrt((2 * math.log2(epoch + 1)) / (Nt * epoch)) if Nt > 0 else 0

def get_movie_features(title):
    return normalized_movie_embeddings.loc[title].values.astype(np.float32)

def iterative_mean(old, new, t):
    return ((t - 1) / t) * old + (1 / t) * new

def update_features(user_features, movie_features, rating, t):
    return iterative_mean(user_features, movie_features * rating, t + 1)

# --- POLICY CLASSES ---
class Algorithm:
    def update_features(self, user_features, movie_features, rating, t):
        return update_features(user_features, movie_features, rating, t)

    def compute_utility(self, user_features, movie_features, epoch, s):
        return compute_utility(user_features, movie_features, epoch, s)

class RandomPolicy(Algorithm):
    def choice(self, user_features, movies, epoch, s):
        return movies.sample(1)

class GreedyPolicy(Algorithm):
    def choice(self, user_features, movies, epoch, s):
        return best_contentbased_recommendation(user_features, movies, epoch, s)

class EpsilonGreedyPolicy(Algorithm):
    def __init__(self, epsilon=0.3):
        self.epsilon = epsilon

    def choice(self, user_features, movies, epoch, s):
        if random.random() < self.epsilon:
            return movies.sample(1)
        else:
            return best_contentbased_recommendation(user_features, movies, epoch, s)

In [18]:
# --- RECOMMENDATION LOGIC ---
def best_contentbased_recommendation(user_features, movies, epoch, s):
    utilities = np.zeros(len(movies))
    for i, (title, movie) in enumerate(movies.iterrows()):
        features = get_movie_features(title)
        utilities[i] = compute_utility(user_features, features, epoch - movie.last_t, s)
        utilities[i] += compute_UCB(epoch, movie.Nt)
    return movies.iloc[[utilities.argmax()]]

In [19]:
# --- SIMULATION ---
def reinforcement_learning(user, movies, algorithm, s, steps):
    algo = algorithm() if callable(algorithm) else algorithm
    user_features = np.zeros(normalized_movie_embeddings.shape[1])

    movies_sim = movies.copy()
    movies_sim['last_t'] = 1
    movies_sim['t'] = range(len(movies_sim))
    movies_sim['rating'] = user['rating']
    movies_sim['Nt'] = 0

    cumregret, accuracy_rmse, avg_rating, timestamps = [0], [0], [0], []
    watched_titles = set()  # NEW: track watched movies

    for t in range(steps):
        start = datetime.datetime.now()

        rec = algo.choice(user_features, movies_sim, t + 1, s)
        rec_title = rec.index[0]
        rec_feat = get_movie_features(rec_title)

        actual_rating = user.loc[rec_title, 'rating']
        user_features = algo.update_features(user_features, rec_feat, actual_rating, t)
        utility = algo.compute_utility(user_features, rec_feat, t + 1, s)

        watched_titles.add(rec_title)

        unwatched = user[~user.index.isin(watched_titles)]
        if len(unwatched) > 0:
            best_possible_rating = unwatched['rating'].max()
        else:
            best_possible_rating = actual_rating

        true_regret = best_possible_rating - actual_rating

        cumregret.append(cumregret[-1] + true_regret)
        accuracy_rmse.append((actual_rating - utility) ** 2)
        avg_rating.append(actual_rating)

        movies_sim.loc[rec.index, 'last_t'] = t
        movies_sim.loc[rec.index, 'Nt'] += 1
        timestamps.append((datetime.datetime.now() - start).total_seconds())

    return {
        'cumregret': cumregret,
        'accuracy_rmse': accuracy_rmse,
        'avg_rating': avg_rating,
        'timediff': timestamps
    }

In [20]:
# --- MULTI-USER SIMULATION ---
def simulate_multiple_users(users_df, movies_df, algorithms, steps=20, n_users=10, s=200):
    results_all = []
    users_sample = users_df[users_df.user.isin(
        pd.Series(users_df.user.unique()).sample(n_users, random_state=42)
    )]

    movies_sample = movies_df.loc[users_sample.index.unique()]

    for algo in tqdm(algorithms):
        algo_results = []
        for user_id in users_sample.user.unique():
            user = users_sample[users_sample.user == user_id]
            movies_user = movies_sample.loc[movies_sample.index.isin(user.index)]
            res = reinforcement_learning(user, movies_user, algo, s, steps)
            algo_results.append(res)
        results_all.append(algo_results)

    return results_all

# Results

In [23]:
ALGOS = [GreedyPolicy, EpsilonGreedyPolicy, RandomPolicy]
ALGOS_NAME = ['Greedy', 'EpsilonGreedy', 'Random']
results = simulate_multiple_users(users, normalized_movie_embeddings, ALGOS, steps=500, n_users=5, s=200)

100%|██████████| 3/3 [00:55<00:00, 18.65s/it]


In [24]:
METRICS = ['cumregret', 'accuracy_rmse', 'avg_rating', 'timediff']
TITLE_GRAPH = [
    'Average cumulative regret for each algorithm',
    'Average RMSE for each algorithm',
    'Average rating for each algorithm',
    'Average running time per step'
]
X_AXIS = [
    'Cumulative Regret',
    'Accuracy (Root Mean Square Error)',
    'Rating',
    'Time (seconds)'
]

assert len(METRICS) == len(TITLE_GRAPH) == len(X_AXIS)


for metric, title, xlabel in zip(METRICS, TITLE_GRAPH, X_AXIS):
    data = []

    for i, algoname in enumerate(ALGOS_NAME):
        metric_matrix = np.array([user_result[metric] for user_result in results[i]])
        avg_metric = np.mean(metric_matrix, axis=0)[1:]

        data.append(go.Scatter(
            x=list(range(1, len(avg_metric) + 1)),
            y=avg_metric,
            mode='lines',
            name=algoname
        ))

    layout = go.Layout(
        title=title,
        xaxis=dict(title='Simulation Step'),
        yaxis=dict(title=xlabel)
    )

    fig = go.Figure(data=data, layout=layout)
    fig.show()