# Reinforcement Learning for Movie Recommendations with Learned Embeddings

This notebook implements a content-based movie recommender system using reinforcement learning (multi-armed bandits), building on the ideas introduced in the paper "Reinforcement Learning Approaches to Movies Recommendation" by Carpentier et al. (2015).

The notebook modernises the original approach by replacing PCA-engineered features with deep learned embeddings.

We train a neural autoencoder to generate embeddings of movies based on their genres and release year. These embeddings are used in a contextual bandit setting to simulate personalised movie recommendations for new users.

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go


import plotly.io as pio
pio.renderers.default = 'colab'

import os
import random
import math
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from scipy.stats import spearmanr
import datetime

%matplotlib inline
from multiprocessing import Pool, cpu_count
from functools import partial


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import mean_squared_error


from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, MinMaxScaler
from sklearn.preprocessing import normalize

In [None]:
# Location of the input CSV files. The trailing "/" matters: the loading cells
# build each path by plain string concatenation with the file names below.
SELECTED_DATA_DIR = "../RL-movie-recommender-master/selected-data/"
# Movie catalogue (loaded with its first column as the index).
MOVIES_FILE = "best_movie_ratings_features.csv"
# Per-user ratings (loaded with its first column as the index).
USERS_FILE = "users_ratings.csv"

In [None]:
# Load the movie catalogue; the first CSV column becomes the index.
movies_path = f"{SELECTED_DATA_DIR}{MOVIES_FILE}"
movies = pd.read_csv(movies_path, index_col=0)
movies.head()

Unnamed: 0_level_0,aka,genres,year,votes,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The Shawshank Redemption (1994),"['Die Verurteilten (1995)', 'Die Verurteilten ...","['Crime', 'Drama']",[1994],1740070,9.3
The Dark Knight (2008),"['Batman Begins 2 (2005)', 'Batman: The Dark K...","['Action', 'Crime', 'Drama', 'Thriller']",[2008],1724382,9.0
Inception (2010),"['Inception: The IMAX Experience (2010)', ""Oli...","['Action', 'Adventure', 'Sci-Fi', 'Thriller']",[2010],1515276,8.8
Fight Club (1999),"['Fight Club (1999)', 'Fight Club (1999)', 'Fi...",['Drama'],[1999],1389487,8.8
Pulp Fiction (1994),"['Black Mask (1994)', 'Pulp Fiction (1994)', '...","['Crime', 'Drama']",[1994],1362518,8.9


In [None]:
# Load the user ratings; the first CSV column becomes the index.
users_path = f"{SELECTED_DATA_DIR}{USERS_FILE}"
users = pd.read_csv(users_path, index_col=0)
users.head()

Unnamed: 0_level_0,user,rating,link
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Pianist (2002),59957513,9,tt0253474
Schindler's List (1993),59957513,9,tt0108052
Reservoir Dogs (1992),59957513,7,tt0105236
Captain Phillips (2013),59957513,8,tt1535109
Goodfellas (1990),59957513,8,tt0099685


# Learning Dense Movie Representations with an Autoencoder

In [None]:
# Preprocess movie features for autoencoder
import ast

import numpy as np


def _parse_genres(cell):
    """Return the genre labels stored in one `genres` cell as a list of strings."""
    if isinstance(cell, str):
        # read_csv yields the string repr of the list, e.g. "['Crime', 'Drama']".
        return list(ast.literal_eval(cell))
    if isinstance(cell, (list, tuple, set)):
        return list(cell)
    return []  # missing value: no genres


# Genre encoding
# MultiLabelBinarizer expects one iterable of labels per row. Feeding it the raw
# strings makes it iterate character by character and learn one "genre" per
# character (letters, brackets, quotes, ...), so the cells are parsed first.
# NOTE: this changes the number of genre columns compared with previously
# recorded outputs of this notebook.
genre_lists = movies['genres'].map(_parse_genres)
genre_mlb = MultiLabelBinarizer()
genre_features = genre_mlb.fit_transform(genre_lists)

# Year encoding
# Each distinct value of the `year` column is treated as its own category.
year_encoder = OneHotEncoder(sparse_output=False)
year_features = year_encoder.fit_transform(movies[['year']])

# Final input: genre indicator columns followed by one-hot year columns.
X = np.hstack([genre_features, year_features])

X.shape

(1000, 71)

Feature Vector: 71-Dimensional Encoding of Genre and Year

In [None]:
# Number of classes learned by the genre binarizer (width of the genre part of X).
len(genre_mlb.classes_)

35

In [None]:
# Number of categories learned by the year encoder (width of the year part of X).
len(year_encoder.categories_[0])

36

### Autoencoder Architecture

In [None]:
class MovieAutoencoder(nn.Module):
    """Symmetric one-hidden-layer autoencoder for movie feature vectors.

    The encoder maps an input vector to an `embedding_dim`-sized code and the
    decoder maps the code back to the input size, ending with a sigmoid so the
    reconstruction lies in [0, 1].

    Args:
        input_dim: Size of each input (and reconstructed) vector.
        embedding_dim: Size of the bottleneck embedding.
        hidden_dim: Width of the hidden layer used in both encoder and decoder.
            Defaults to 48, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, embedding_dim=16, hidden_dim=48):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return `(reconstruction, embedding)` for the batch `x`."""
        z = self.encoder(x)
        return self.decoder(z), z

### Train Autoencoder

In [None]:
# Seed torch so weight initialisation and batch shuffling are repeatable
# across "Restart & Run All".
AUTOENCODER_SEED = 42
torch.manual_seed(AUTOENCODER_SEED)

X_tensor = torch.tensor(X, dtype=torch.float32)
dataset = TensorDataset(X_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

model = MovieAutoencoder(input_dim=X.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(50):
    running_loss = 0.0
    for batch in dataloader:
        inputs = batch[0]
        outputs, _ = model(inputs)
        loss = criterion(outputs, inputs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight by batch size: the last batch of an epoch can be smaller.
        running_loss += loss.item() * inputs.size(0)
    # Report the mean reconstruction loss over the whole epoch. Logging only the
    # final mini-batch gives a noisy curve that depends on the shuffle order.
    epoch_loss = running_loss / len(dataset)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

Epoch 1, Loss: 0.2208
Epoch 2, Loss: 0.1387
Epoch 3, Loss: 0.0937
Epoch 4, Loss: 0.0928
Epoch 5, Loss: 0.0927
Epoch 6, Loss: 0.0899
Epoch 7, Loss: 0.0831
Epoch 8, Loss: 0.0799
Epoch 9, Loss: 0.0658
Epoch 10, Loss: 0.0605
Epoch 11, Loss: 0.0617
Epoch 12, Loss: 0.0530
Epoch 13, Loss: 0.0545
Epoch 14, Loss: 0.0522
Epoch 15, Loss: 0.0534
Epoch 16, Loss: 0.0480
Epoch 17, Loss: 0.0515
Epoch 18, Loss: 0.0396
Epoch 19, Loss: 0.0485
Epoch 20, Loss: 0.0481
Epoch 21, Loss: 0.0428
Epoch 22, Loss: 0.0339
Epoch 23, Loss: 0.0352
Epoch 24, Loss: 0.0354
Epoch 25, Loss: 0.0301
Epoch 26, Loss: 0.0322
Epoch 27, Loss: 0.0339
Epoch 28, Loss: 0.0260
Epoch 29, Loss: 0.0258
Epoch 30, Loss: 0.0298
Epoch 31, Loss: 0.0295
Epoch 32, Loss: 0.0329
Epoch 33, Loss: 0.0254
Epoch 34, Loss: 0.0224
Epoch 35, Loss: 0.0246
Epoch 36, Loss: 0.0224
Epoch 37, Loss: 0.0217
Epoch 38, Loss: 0.0205
Epoch 39, Loss: 0.0257
Epoch 40, Loss: 0.0187
Epoch 41, Loss: 0.0182
Epoch 42, Loss: 0.0178
Epoch 43, Loss: 0.0249
Epoch 44, Loss: 0.02

### Save model

In [None]:
# Persist the trained autoencoder. Passing the module itself (rather than its
# state_dict) pickles the whole object, so loading this file later requires the
# MovieAutoencoder class definition to be available.
torch.save(model, "movie_autoencoder_full.pt")

### Generate embeddings for all movies

In [None]:
# Encode every movie once (no gradients needed) and key each embedding row by
# the movie's index label.
with torch.no_grad():
    _, embeddings = model(X_tensor)
    embedding_rows = embeddings.numpy()
    movie_embeddings = dict(zip(movies.index, embedding_rows))

In [None]:
# Inspect the embedding produced by the encoder for one movie.
movie_embeddings['The Shawshank Redemption (1994)']

array([ 1.4138584, -5.534877 , -3.739904 ,  4.167938 ,  0.5939298,
        1.6964242,  5.350576 ,  6.4477005, -3.753176 ,  1.8134108,
        3.6917515, -4.9319935, -0.996657 ,  4.0917816,  5.465759 ,
        7.631513 ], dtype=float32)

In [None]:
# Sanity check: L2 norms of the first three embeddings before normalisation.
for title in list(movie_embeddings)[:3]:
    vec = movie_embeddings[title]
    print(f"{title}: L2 norm = {np.linalg.norm(vec):.4f}")

The Shawshank Redemption (1994): L2 norm = 17.2801
The Dark Knight (2008): L2 norm = 24.6561
Inception (2010): L2 norm = 22.8288


### Normalize movie embeddings (L2 norm)

In [None]:
# Stack the embeddings row-wise (dict order) and rescale every row to unit length.
embedding_matrix = normalize(np.array(list(movie_embeddings.values())), axis=1)

# Re-key the normalised rows by title; rows follow the same order as the dict keys.
normalized_movie_embeddings = dict(zip(movie_embeddings.keys(), embedding_matrix))

normalized_movie_embeddings['The Shawshank Redemption (1994)']

array([ 0.08181982, -0.32030267, -0.2164278 ,  0.2411981 ,  0.03437065,
        0.09817187,  0.3096372 ,  0.37312767, -0.21719585,  0.10494187,
        0.21364123, -0.2854139 , -0.05767642,  0.23679093,  0.3163028 ,
        0.44163477], dtype=float32)

In [None]:
# Sanity check: L2 norms of the first three embeddings after normalisation.
for title in list(normalized_movie_embeddings)[:3]:
    vec = normalized_movie_embeddings[title]
    print(f"{title}: L2 norm = {np.linalg.norm(vec):.4f}")

The Shawshank Redemption (1994): L2 norm = 1.0000
The Dark Knight (2008): L2 norm = 1.0000
Inception (2010): L2 norm = 1.0000


In [None]:
# Turn the normalised embeddings into a title-indexed DataFrame with columns
# x0..x{d-1}. The table is built from `embedding_matrix` and the titles in
# `movie_embeddings` (same row order) instead of from the dict this name was
# previously bound to, so re-running the cell is safe. The number of columns
# follows the actual embedding width rather than a hard-coded 16.
embedding_columns = [f'x{i}' for i in range(embedding_matrix.shape[1])]
normalized_movie_embeddings = pd.DataFrame(
    embedding_matrix,
    index=list(movie_embeddings.keys()),
    columns=embedding_columns,
)
normalized_movie_embeddings

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15
The Shawshank Redemption (1994),0.081820,-0.320303,-0.216428,0.241198,0.034371,0.098172,0.309637,0.373128,-0.217196,0.104942,0.213641,-0.285414,-0.057676,0.236791,0.316303,0.441635
The Dark Knight (2008),0.179039,-0.168844,-0.176339,0.371089,0.206871,-0.039613,0.302651,0.400184,-0.285295,0.181870,-0.084015,-0.209560,0.034364,0.279645,0.299243,0.374369
Inception (2010),0.256869,0.004948,-0.109741,0.377101,0.344873,-0.250197,0.220187,0.208559,-0.275073,0.195272,-0.259139,-0.163222,0.067478,0.425267,0.271437,0.198888
Fight Club (1999),0.039597,-0.227087,-0.211103,0.209057,0.032425,0.070399,0.235621,0.369699,-0.148829,0.001086,0.441507,-0.210567,-0.223000,0.256122,0.286702,0.448830
Pulp Fiction (1994),0.081820,-0.320303,-0.216428,0.241198,0.034371,0.098172,0.309637,0.373128,-0.217196,0.104942,0.213641,-0.285414,-0.057676,0.236791,0.316303,0.441635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Scream 2 (1997),0.065793,-0.317954,-0.022499,0.110041,0.014626,-0.300343,0.287032,0.166928,-0.269440,-0.176478,0.123221,-0.236143,0.029926,0.454975,0.356453,0.414298
Clueless (1995),0.048251,-0.348433,-0.332352,0.154147,0.051598,-0.133678,0.258895,0.172889,-0.401790,0.264432,0.251239,-0.481368,0.096725,0.197358,0.118474,0.189895
The Lovely Bones (2009),0.155496,-0.192841,-0.010846,0.406018,0.142323,-0.130992,0.327844,0.368898,-0.143443,0.040418,0.010051,-0.189218,0.107508,0.332590,0.325338,0.454477
27 Dresses (2008),0.050276,-0.345543,-0.338669,0.149476,0.040133,-0.134041,0.259490,0.170497,-0.410339,0.272010,0.255639,-0.478186,0.093664,0.190233,0.103648,0.181564


# Content-Based Reinforcement Learning with Learned Movie Embeddings

In [None]:
# --- UTILITY FUNCTIONS ---
def compute_utility(user_features, movie_features, epoch, s):
    """Score a movie for a user.

    The score is the dot product of the two feature vectors scaled by
    `1 - exp(-epoch / s)`, a factor that is 0 at `epoch == 0` and approaches 1
    as `epoch` grows relative to `s`.
    """
    similarity = user_features.dot(movie_features)
    time_weight = 1 - math.exp(-epoch / s)
    return similarity * time_weight

def compute_UCB(epoch, Nt):
    """Exploration bonus for an item selected `Nt` times so far.

    Returns `sqrt(2 * log2(epoch + 1) / (Nt * epoch))` when `Nt > 0`.
    Items that have never been selected (`Nt` not greater than 0) get a bonus of 0.
    """
    if Nt > 0:
        numerator = 2 * math.log2(epoch + 1)
        return math.sqrt(numerator / (Nt * epoch))
    return 0

def get_movie_features(title):
    """Look up `title` in `normalized_movie_embeddings` and return its row as a float32 array."""
    embedding_row = normalized_movie_embeddings.loc[title]
    return embedding_row.values.astype(np.float32)

def iterative_mean(old, new, t):
    """Fold the `t`-th observation `new` into the running mean `old` of the previous `t - 1`."""
    weight_old = (t - 1) / t
    weight_new = 1 / t
    return weight_old * old + weight_new * new

def update_features(user_features, movie_features, rating, t):
    """Update the user profile with one rated movie.

    The movie vector is scaled by `rating` and averaged into `user_features`
    as observation number `t + 1` (so `t` is a 0-based step counter).
    """
    rated_movie = movie_features * rating
    observation_number = t + 1
    return iterative_mean(user_features, rated_movie, observation_number)

# --- POLICY CLASSES ---
class Algorithm:
    """Base class for recommendation policies.

    Provides the profile-update and utility computations shared by all
    policies by delegating to the module-level functions of the same names.
    Subclasses in this notebook add a `choice(user_features, movies, epoch, s)`
    method that picks the movie to recommend.
    """

    def update_features(self, user_features, movie_features, rating, t):
        """Delegate to the module-level `update_features` (not a recursive call)."""
        return update_features(user_features, movie_features, rating, t)

    def compute_utility(self, user_features, movie_features, epoch, s):
        """Delegate to the module-level `compute_utility` (not a recursive call)."""
        return compute_utility(user_features, movie_features, epoch, s)

class RandomPolicy(Algorithm):
    """Baseline policy: recommend one movie drawn at random, ignoring the user profile."""

    def choice(self, user_features, movies, epoch, s):
        """Return a one-row DataFrame sampled from `movies`."""
        random_pick = movies.sample(n=1)
        return random_pick

class GreedyPolicy(Algorithm):
    """Policy that always recommends the highest-scoring movie."""

    def choice(self, user_features, movies, epoch, s):
        """Return the one-row DataFrame chosen by `best_contentbased_recommendation`."""
        top_pick = best_contentbased_recommendation(user_features, movies, epoch, s)
        return top_pick

class EpsilonGreedyPolicy(Algorithm):
    """Policy that explores at random with probability `epsilon`, otherwise acts greedily."""

    def __init__(self, epsilon=0.3):
        # Probability of recommending a random movie at each step.
        self.epsilon = epsilon

    def choice(self, user_features, movies, epoch, s):
        """Return a one-row DataFrame: a random movie when exploring, else the best-scoring one."""
        exploring = random.random() < self.epsilon
        if not exploring:
            return best_contentbased_recommendation(user_features, movies, epoch, s)
        return movies.sample(1)

In [None]:
# --- RECOMMENDATION LOGIC ---
def best_contentbased_recommendation(user_features, movies, epoch, s):
    """Return the row of `movies` with the highest score, as a one-row DataFrame.

    Each movie's score is its content utility, evaluated on the time elapsed
    since the movie's `last_t`, plus the UCB bonus for its `Nt` count.
    `movies` must be indexed by title and carry `last_t` and `Nt` columns.
    Ties go to the earliest row.
    """
    scores = np.zeros(len(movies))
    for position, (title, row) in enumerate(movies.iterrows()):
        elapsed = epoch - row.last_t
        movie_vector = get_movie_features(title)
        scores[position] = compute_utility(user_features, movie_vector, elapsed, s)
        scores[position] += compute_UCB(epoch, row.Nt)
    best_position = scores.argmax()
    return movies.iloc[[best_position]]

In [None]:
# --- SIMULATION ---
def reinforcement_learning(user, movies, algorithm, s, steps):
    """Simulate `steps` sequential recommendations for a single user.

    Args:
        user: DataFrame indexed by movie title with a 'rating' column holding
            this user's ratings.
        movies: DataFrame indexed by movie title: the candidate pool. It is
            copied, so the caller's frame is not modified.
        algorithm: A policy class (instantiated with its defaults) or a policy
            instance exposing `choice`, `update_features` and `compute_utility`.
        s: Time-scale parameter forwarded to the utility computations.
        steps: Number of recommendations to simulate.

    Returns:
        dict with
        - 'cumregret': cumulative regret, with a leading 0 placeholder;
        - 'accuracy_rmse': per-step *squared* error between the actual rating
          and the computed utility (no square root is taken here), with a
          leading 0 placeholder;
        - 'avg_rating': rating obtained at each step, with a leading 0 placeholder;
        - 'timediff': wall-clock seconds per step (no placeholder).
    """
    algo = algorithm() if callable(algorithm) else algorithm
    user_features = np.zeros(normalized_movie_embeddings.shape[1])

    movies_sim = movies.copy()
    movies_sim['last_t'] = 1
    movies_sim['t'] = range(len(movies_sim))
    movies_sim['rating'] = user['rating']
    movies_sim['Nt'] = 0

    cumregret, accuracy_rmse, avg_rating, timestamps = [0], [0], [0], []
    watched_titles = set()  # titles recommended so far

    for t in range(steps):
        # Policies and utilities work with a 1-based epoch; `t` is 0-based.
        epoch = t + 1
        start = datetime.datetime.now()

        rec = algo.choice(user_features, movies_sim, epoch, s)
        rec_title = rec.index[0]
        rec_feat = get_movie_features(rec_title)

        actual_rating = user.loc[rec_title, 'rating']
        # NOTE(review): the utility below is computed after the profile has
        # already absorbed this rating, so it is not a pure prediction.
        user_features = algo.update_features(user_features, rec_feat, actual_rating, t)
        utility = algo.compute_utility(user_features, rec_feat, epoch, s)

        # Oracle: best rating among the movies not yet watched *before* this
        # step. The chosen title must still count as available here, otherwise
        # picking the best remaining movie would yield a negative regret. The
        # max() with the actual rating covers re-recommendations of an already
        # watched title, keeping regret non-negative.
        unwatched = user[~user.index.isin(watched_titles)]
        if len(unwatched) > 0:
            best_possible_rating = max(unwatched['rating'].max(), actual_rating)
        else:
            best_possible_rating = actual_rating
        watched_titles.add(rec_title)

        true_regret = best_possible_rating - actual_rating

        cumregret.append(cumregret[-1] + true_regret)
        accuracy_rmse.append((actual_rating - utility) ** 2)
        avg_rating.append(actual_rating)

        # Store the recommendation time in the same 1-based units the policy
        # subtracts it from (`epoch - last_t`). Storing the 0-based `t` made a
        # just-recommended movie look one step older than it is.
        movies_sim.loc[rec.index, 'last_t'] = epoch
        movies_sim.loc[rec.index, 'Nt'] += 1
        timestamps.append((datetime.datetime.now() - start).total_seconds())

    return {
        'cumregret': cumregret,
        'accuracy_rmse': accuracy_rmse,
        'avg_rating': avg_rating,
        'timediff': timestamps
    }

In [None]:
# --- MULTI-USER SIMULATION ---
def simulate_multiple_users(users_df, movies_df, algorithms, steps=20, n_users=10, s=200):
    """Run every algorithm on the same sample of users.

    Args:
        users_df: Ratings DataFrame indexed by movie title with a `user` column.
        movies_df: DataFrame indexed by movie title (the candidate pool).
        algorithms: Iterable of policy classes or instances.
        steps: Number of recommendations simulated per user.
        n_users: Number of distinct users to sample (fixed `random_state=42`).
        s: Time-scale parameter forwarded to the simulation.

    Returns:
        A list with one entry per algorithm (in input order); each entry is a
        list holding one `reinforcement_learning` result dict per sampled user.
    """
    all_user_ids = pd.Series(users_df.user.unique())
    chosen_ids = all_user_ids.sample(n_users, random_state=42)
    users_sample = users_df[users_df.user.isin(chosen_ids)]

    # Restrict the pool to titles rated by at least one sampled user.
    movies_sample = movies_df.loc[users_sample.index.unique()]
    sampled_user_ids = users_sample.user.unique()

    results_all = []
    for algo in tqdm(algorithms):
        per_user_results = []
        for user_id in sampled_user_ids:
            user = users_sample[users_sample.user == user_id]
            # Each user is only offered the movies they have actually rated.
            rated_mask = movies_sample.index.isin(user.index)
            movies_user = movies_sample.loc[rated_mask]
            per_user_results.append(
                reinforcement_learning(user, movies_user, algo, s, steps)
            )
        results_all.append(per_user_results)

    return results_all

# Results

In [None]:
# Policies to compare; ALGOS_NAME holds the matching display labels in the same order.
ALGOS = [GreedyPolicy, EpsilonGreedyPolicy, RandomPolicy]
ALGOS_NAME = ['Greedy', 'EpsilonGreedy', 'Random']
results = simulate_multiple_users(
    users_df=users,
    movies_df=normalized_movie_embeddings,
    algorithms=ALGOS,
    steps=500,
    n_users=5,
    s=200,
)

100%|██████████| 3/3 [00:48<00:00, 16.24s/it]


In [None]:
METRICS = ['cumregret', 'accuracy_rmse', 'avg_rating', 'timediff']
TITLE_GRAPH = [
    'Average cumulative regret for each algorithm',
    'Average RMSE for each algorithm',
    'Average rating for each algorithm',
    'Average running time per step'
]
# NOTE: despite the name, these strings are used as y-axis titles below.
X_AXIS = [
    'Cumulative Regret',
    'Accuracy (Root Mean Square Error)',
    'Rating',
    'Time (seconds)'
]

assert len(METRICS) == len(TITLE_GRAPH) == len(X_AXIS)

# Series that reinforcement_learning starts with a placeholder 0 at position 0.
# 'timediff' has no placeholder, so slicing it would drop the first real step.
METRICS_WITH_PLACEHOLDER = {'cumregret', 'accuracy_rmse', 'avg_rating'}

for metric, title, ylabel in zip(METRICS, TITLE_GRAPH, X_AXIS):
    data = []

    for i, algoname in enumerate(ALGOS_NAME):
        # One row per simulated user, one column per step.
        metric_matrix = np.array([user_result[metric] for user_result in results[i]])
        avg_metric = np.mean(metric_matrix, axis=0)
        if metric in METRICS_WITH_PLACEHOLDER:
            avg_metric = avg_metric[1:]
        if metric == 'accuracy_rmse':
            # The stored values are squared errors; take the root of their
            # per-step mean so the curve matches its "RMSE" title and axis label.
            avg_metric = np.sqrt(avg_metric)

        data.append(go.Scatter(
            x=list(range(1, len(avg_metric) + 1)),
            y=avg_metric,
            mode='lines',
            name=algoname
        ))

    layout = go.Layout(
        title=title,
        xaxis=dict(title='Simulation Step'),
        yaxis=dict(title=ylabel)
    )

    fig = go.Figure(data=data, layout=layout)
    fig.show()