# A Behavior Sequence Transformer For Next Movie Recommendation

**Author:** [Nelson Lin](https://www.linkedin.com/in/nelson-lin-842564164/)<br>
**Date created:** 2024/07/09<br>
**Last modified:** 2024/07/09<br>
**Description:** Rating prediction using the Behavior Sequence Transformer (BST) model on the MovieLens 1M dataset.


## (1) Notebook Settings


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import numpy as np
from zipfile import ZipFile
from urllib.request import urlretrieve
from sklearn.preprocessing import MinMaxScaler
import random

# Switch the working directory to the parent folder; the relative paths used
# below ("ml-1m/...", "artifacts/...") and the `src` import are resolved from there.
# NOTE(review): the path is relative, so executing this cell a second time in
# the same kernel moves up one more directory level.
os.chdir("./../")

In [3]:
from src import utils

In [4]:
# Set random seed for reproducibility
# NumPy's global generator is the one consumed by the np.random.rand call
# that performs the train/test split further down.
np.random.seed(0)
# Seed Python's built-in `random` module as well.
random.seed(0)

## (2) Data preparation


### Load Data


In [5]:
# Download the MovieLens 1M archive into the current working directory.
urlretrieve(
    "http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
# Extract inside a context manager so the archive's file handle is closed
# as soon as extraction finishes (the bare ZipFile(...) call never closed it).
with ZipFile("movielens.zip", "r") as movielens_archive:
    movielens_archive.extractall()

In [6]:
def _read_movielens_dat(path, column_names, **extra_options):
    """Read one "::"-delimited MovieLens .dat file into a DataFrame.

    The files carry no header row, so the column names are supplied by the
    caller. The multi-character separator requires the "python" parser engine.
    """
    return pd.read_csv(
        path,
        sep="::",
        names=column_names,
        engine="python",
        **extra_options,
    )


users = _read_movielens_dat(
    "ml-1m/users.dat",
    ["user_id", "sex", "age_group", "occupation", "zip_code"],
)

ratings = _read_movielens_dat(
    "ml-1m/ratings.dat",
    ["user_id", "movie_id", "rating", "unix_timestamp"],
)

# Only the movies file is read with an explicit (ISO-8859-1) encoding.
movies = _read_movielens_dat(
    "ml-1m/movies.dat",
    ["movie_id", "title", "genres"],
    encoding="ISO-8859-1",
)

### Remap ID to index


In [7]:
# remap the id to index
def generate_remap_id_dict(df, col):
    """Map the distinct values of ``df[col]`` to contiguous integer indices.

    The non-null distinct values are sorted and numbered from 1; index 0 is
    reserved for the key ``"UNK"``. As a side effect a new column
    ``f"{col}_index"`` is written into ``df`` (missing values are mapped
    to the ``"UNK"`` index).

    Args:
        df: DataFrame holding the column; modified in place.
        col: Name of the column to index.

    Returns:
        dict: value -> index mapping, including the ``"UNK"`` entry.
    """
    has_value = df[col].notnull()
    known_ids = df.loc[has_value, col].unique().tolist()
    known_ids.sort()

    # Indices start at 1 so that 0 stays free for the unknown bucket.
    id_map_dict = dict(zip(known_ids, range(1, len(known_ids) + 1)))
    id_map_dict["UNK"] = 0

    filled = df[col].fillna("UNK")
    df[f"{col}_index"] = filled.map(id_map_dict)

    return id_map_dict

In [8]:
# Sex
# Encode sex as a float: "M" -> 0.0, "F" -> 1.0, with 0.5 reserved for "UNK".
sex_id_map_dict = {"M": 0.0, "F": 1.0, "UNK": 0.5}
# Persist the mapping under artifacts/.
# NOTE(review): presumably so the same encoding can be reused outside this
# notebook — confirm against utils.save_object and its consumers.
utils.save_object("artifacts/sex_id_map_dict.pkl", sex_id_map_dict)

# Overwrites the raw column in place. Series.map yields NaN for any value that
# is not a key of the dict, so re-running this cell on the already-encoded
# floats would turn the whole column into NaN.
users["sex"] = users["sex"].map(sex_id_map_dict)

In [9]:
# Age
# Index the distinct age groups (1..N, 0 for "UNK"); this also adds the
# "age_group_index" column to `users`.
age_group_id_map_dict = generate_remap_id_dict(users, col="age_group")
# Persist the mapping under artifacts/.
utils.save_object("artifacts/age_group_id_map_dict.pkl", age_group_id_map_dict)

In [10]:
# Rating
# Fit a MinMaxScaler on the full rating column and store the rescaled value
# as "norm_rating".
min_max_scaler = MinMaxScaler()
# The scaler expects a 2-D array, hence the reshape to a single feature column;
# [:, 0] flattens the result back to one value per row.
ratings["norm_rating"] = min_max_scaler.fit_transform(
    ratings["rating"].values.reshape(-1, 1)
)[:, 0]
# Persist the fitted scaler under artifacts/.
utils.save_object("artifacts/rating_min_max_scaler.pkl", min_max_scaler)

In [11]:
# Movie
# Index the distinct movie ids (1..N, 0 for "UNK"); this also adds the
# "movie_id_index" column to `movies`.
movie_id_map_dict = generate_remap_id_dict(movies, col="movie_id")
# Persist the mapping under artifacts/.
utils.save_object("artifacts/movie_id_map_dict.pkl", movie_id_map_dict)

In [12]:
# Genres
# Collects every distinct genre label encountered while splitting the
# "genres" column below.
genres_set = set()


def get_genres_set(genres):
    """Split a "|"-delimited genre string and record its labels.

    Every label is added to the module-level ``genres_set`` as a side effect.

    Args:
        genres: Genre string such as ``"Comedy|Drama"``.

    Returns:
        list: The individual genre labels, in their original order.
    """
    labels = genres.split("|")
    for label in labels:
        genres_set.add(label)
    return labels


# Replace each genre string with its list of labels (this also fills genres_set).
movies["genres"] = movies["genres"].apply(get_genres_set)

# Alphabetically sorted labels are numbered from 1; 0 is reserved for "UNK".
genres_map_dict = dict(zip(sorted(genres_set), range(1, len(genres_set) + 1)))
genres_map_dict["UNK"] = 0
utils.save_object("artifacts/genres_map_dict.pkl", genres_map_dict)

In [13]:
movies.head()

Unnamed: 0,movie_id,title,genres,movie_id_index
0,1,Toy Story (1995),"[Animation, Children's, Comedy]",1
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]",2
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",3
3,4,Waiting to Exhale (1995),"[Comedy, Drama]",4
4,5,Father of the Bride Part II (1995),[Comedy],5


In [14]:
# Translate every movie's genre labels into their integer ids.
movies["genres_ids"] = movies["genres"].apply(
    lambda labels: list(map(genres_map_dict.__getitem__, labels))
)

In [15]:
# Fixed number of genre slots kept per movie.
max_genres_length = 4
# Keep at most `max_genres_length` ids per movie (extra genres are cut off)
# and fill any remaining slots with the "UNK" id.
movies["genres_ids"] = movies["genres_ids"].apply(
    lambda ids: ids[:max_genres_length]
    + [genres_map_dict["UNK"]] * (max_genres_length - len(ids))
)

In [16]:
# Lookup table: movie index -> padded list of genre ids.
genre_ids_by_movie_index = movies.set_index("movie_id_index")["genres_ids"]
movies_to_genres_dict = genre_ids_by_movie_index.to_dict()
utils.save_object(
    "./artifacts/movies_to_genres_dict.pkl",
    movies_to_genres_dict,
)

In [17]:
# Attach the movie index to every rating row. Series.map yields NaN for any
# movie_id that is not a key of movie_id_map_dict (the "UNK" entry is keyed by
# a string, so it is never matched by a numeric id here).
ratings["movie_id_index"] = ratings["movie_id"].map(movie_id_map_dict)

In [18]:
# Join each rating with the genre ids of the rated movie. This is an inner
# join, so ratings whose movie index has no match in `movies` are dropped.
rating_columns = ["user_id", "movie_id_index", "norm_rating", "unix_timestamp"]
movie_columns = ["movie_id_index", "genres_ids"]
df_user_views = pd.merge(
    ratings[rating_columns],
    movies[movie_columns],
    how="inner",
    on="movie_id_index",
)

In [19]:
df_user_views.head()

Unnamed: 0,user_id,movie_id_index,norm_rating,unix_timestamp,genres_ids
0,1,1177,1.0,978300760,"[8, 0, 0, 0]"
1,2,1177,1.0,978298413,"[8, 0, 0, 0]"
2,12,1177,0.75,978220179,"[8, 0, 0, 0]"
3,15,1177,0.75,978199279,"[8, 0, 0, 0]"
4,17,1177,1.0,978158471,"[8, 0, 0, 0]"


### Prepare the Sequence


In [20]:
# Sort all views chronologically first, then group by user: groupby keeps the
# row order inside each group, so every user's rows stay in timestamp order.
df_agg = df_user_views.sort_values(by=["unix_timestamp"]).groupby("user_id")

In [21]:
# One row per user, holding that user's chronological lists of movie indices,
# genre-id lists and normalised ratings.
# Each grouped apply returns a Series indexed by user_id. Building the frame
# from those Series aligns the columns on that shared index, and
# reset_index() turns the index into the "user_id" column, so ids and
# sequences cannot drift apart (previously the ids came from groups.keys()
# and were matched to the sequences purely by position).
sequences = pd.DataFrame(
    {
        "movie_sequence": df_agg["movie_id_index"].apply(list),
        "genres_ids_sequence": df_agg["genres_ids"].apply(list),
        "rating_sequence": df_agg["norm_rating"].apply(list),
    }
).reset_index()

In [22]:
sequences.head()

Unnamed: 0,user_id,movie_sequence,genres_ids_sequence,rating_sequence
0,1,"[3118, 1010, 1673, 1251, 2272, 1769, 3340, 119...","[[8, 0, 0, 0], [3, 4, 12, 0], [8, 14, 0, 0], [...","[0.75, 1.0, 0.75, 1.0, 0.5, 1.0, 0.75, 0.75, 1..."
1,2,"[1181, 1200, 1193, 2649, 1274, 2875, 1208, 117...","[[1, 2, 0, 0], [8, 17, 0, 0], [1, 2, 14, 15], ...","[0.75, 0.5, 0.75, 0.5, 1.0, 0.75, 1.0, 1.0, 1...."
2,3,"[590, 2790, 3466, 1900, 1893, 1408, 1247, 3603...","[[8, 16, 0, 0], [5, 8, 0, 0], [5, 0, 0, 0], [5...","[0.5, 0.75, 0.5, 0.75, 0.75, 0.5, 1.0, 1.0, 1...."
3,4,"[1193, 1082, 477, 3459, 3400, 1179, 258, 1181,...","[[1, 2, 14, 15], [4, 8, 9, 15], [1, 2, 15, 0],...","[0.5, 0.75, 0.75, 0.0, 1.0, 0.25, 1.0, 1.0, 1...."
4,5,"[897, 2649, 908, 353, 1231, 1112, 2790, 2120, ...","[[8, 16, 0, 0], [5, 11, 0, 0], [2, 4, 8, 12], ...","[0.75, 0.0, 0.75, 0.0, 1.0, 0.0, 0.75, 0.0, 0...."


In [23]:
def create_sequences(values, window_size, step_size):
    """Cut ``values`` into full-length sliding windows.

    Windows start at offsets 0, ``step_size``, 2 * ``step_size``, ... and each
    contains exactly ``window_size`` consecutive items. A trailing remainder
    shorter than ``window_size`` is discarded.

    Args:
        values: Sliceable sequence (e.g. a list) to window over.
        window_size: Number of items per window; must be >= 1.
        step_size: Offset between consecutive window starts; must be >= 1.

    Returns:
        list: The windows, in order. Empty when ``values`` holds fewer than
        ``window_size`` items.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is smaller than 1
            (the previous loop-based version never terminated in that case).
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if step_size < 1:
        raise ValueError(f"step_size must be >= 1, got {step_size}")

    # Last offset at which a complete window still fits; negative when the
    # input is too short, which leaves the range below empty.
    last_start = len(values) - window_size
    return [
        values[start:start + window_size]
        for start in range(0, last_start + 1, step_size)
    ]

In [24]:
def generate_sequence_data(input_sequences_data, sequence_length=2, step_size=1):
    """Replace each per-user sequence with its list of sliding windows.

    The columns ``movie_sequence``, ``genres_ids_sequence`` and
    ``rating_sequence`` are windowed with the same ``sequence_length`` and
    ``step_size``, so the i-th window of each column covers the same
    positions of the user's history.

    Args:
        input_sequences_data: DataFrame containing the three sequence columns
            (one list per row). It is copied, not modified.
        sequence_length: Number of items in every window.
        step_size: Offset between consecutive window starts (defaults to 1,
            the value that used to be hard-coded).

    Returns:
        DataFrame: Copy of the input in which each of the three columns holds
        a list of windows per row.

    Raises:
        KeyError: If one of the three sequence columns is missing.
    """
    sequence_columns = ("movie_sequence", "genres_ids_sequence", "rating_sequence")
    output_sequences_data = input_sequences_data.copy()

    # Bracket indexing is used on purpose: attribute-style assignment on a
    # DataFrame sets a plain attribute instead of a column when the name is
    # not already a column.
    for column in sequence_columns:
        output_sequences_data[column] = output_sequences_data[column].apply(
            lambda values: create_sequences(values, sequence_length, step_size)
        )

    return output_sequences_data

In [25]:
# Longest window length to generate.
sequence_length = 6
# Every window length from 2 up to and including `sequence_length`.
sequence_lengths = range(2, sequence_length+1)

In [26]:
# Build one windowed copy of `sequences` per window length and stack them.
# The stacked frame keeps the per-copy row labels, so its index repeats.
df_list = []
for window_length in sequence_lengths:
    df_list.append(generate_sequence_data(sequences, window_length))
multi_sequence = pd.concat(df_list)
multi_sequence.head()

Unnamed: 0,user_id,movie_sequence,genres_ids_sequence,rating_sequence
0,1,"[[3118, 1010], [1010, 1673], [1673, 1251], [12...","[[[8, 0, 0, 0], [3, 4, 12, 0]], [[3, 4, 12, 0]...","[[0.75, 1.0], [1.0, 0.75], [0.75, 1.0], [1.0, ..."
1,2,"[[1181, 1200], [1200, 1193], [1193, 2649], [26...","[[[1, 2, 0, 0], [8, 17, 0, 0]], [[8, 17, 0, 0]...","[[0.75, 0.5], [0.5, 0.75], [0.75, 0.5], [0.5, ..."
2,3,"[[590, 2790], [2790, 3466], [3466, 1900], [190...","[[[8, 16, 0, 0], [5, 8, 0, 0]], [[5, 8, 0, 0],...","[[0.5, 0.75], [0.75, 0.5], [0.5, 0.75], [0.75,..."
3,4,"[[1193, 1082], [1082, 477], [477, 3459], [3459...","[[[1, 2, 14, 15], [4, 8, 9, 15]], [[4, 8, 9, 1...","[[0.5, 0.75], [0.75, 0.75], [0.75, 0.0], [0.0,..."
4,5,"[[897, 2649], [2649, 908], [908, 353], [353, 1...","[[[8, 16, 0, 0], [5, 11, 0, 0]], [[5, 11, 0, 0...","[[0.75, 0.0], [0.0, 0.75], [0.75, 0.0], [0.0, ..."


In [27]:
def _explode_windows(columns, window_column):
    """Select ``columns`` from ``multi_sequence`` and emit one row per window.

    ``ignore_index=True`` gives the result a fresh 0..n-1 index, which is
    what lets the three exploded frames be placed side by side below.
    """
    return multi_sequence[columns].explode(window_column, ignore_index=True)


multi_sequence_movies = _explode_windows(
    ["user_id", "movie_sequence"], "movie_sequence"
)
multi_sequence_rating = _explode_windows(["rating_sequence"], "rating_sequence")
multi_sequence_genres = _explode_windows(
    ["genres_ids_sequence"], "genres_ids_sequence"
)

# Column-wise concatenation: rows are matched on the fresh positional index.
multi_sequence_transformed = pd.concat(
    [multi_sequence_movies, multi_sequence_rating, multi_sequence_genres], axis=1
)

In [28]:
multi_sequence_transformed.head()

Unnamed: 0,user_id,movie_sequence,rating_sequence,genres_ids_sequence
0,1,"[3118, 1010]","[0.75, 1.0]","[[8, 0, 0, 0], [3, 4, 12, 0]]"
1,1,"[1010, 1673]","[1.0, 0.75]","[[3, 4, 12, 0], [8, 14, 0, 0]]"
2,1,"[1673, 1251]","[0.75, 1.0]","[[8, 14, 0, 0], [5, 15, 0, 0]]"
3,1,"[1251, 2272]","[1.0, 0.5]","[[5, 15, 0, 0], [14, 0, 0, 0]]"
4,1,"[2272, 1769]","[0.5, 1.0]","[[14, 0, 0, 0], [8, 0, 0, 0]]"


In [29]:
multi_sequence_transformed.tail()

Unnamed: 0,user_id,movie_sequence,rating_sequence,genres_ids_sequence
4910440,6040,"[2657, 1628, 454, 3603, 230, 2849]","[0.75, 0.75, 0.75, 0.75, 1.0, 0.75]","[[8, 0, 0, 0], [8, 0, 0, 0], [1, 16, 0, 0], [5..."
4910441,6040,"[1628, 454, 3603, 230, 2849, 1853]","[0.75, 0.75, 0.75, 1.0, 0.75, 0.75]","[[8, 0, 0, 0], [1, 16, 0, 0], [5, 18, 0, 0], [..."
4910442,6040,"[454, 3603, 230, 2849, 1853, 1727]","[0.75, 0.75, 1.0, 0.75, 0.75, 0.5]","[[1, 16, 0, 0], [5, 18, 0, 0], [5, 8, 0, 0], [..."
4910443,6040,"[3603, 230, 2849, 1853, 1727, 160]","[0.75, 1.0, 0.75, 0.75, 0.5, 0.5]","[[5, 18, 0, 0], [5, 8, 0, 0], [6, 16, 0, 0], [..."
4910444,6040,"[230, 2849, 1853, 1727, 160, 1204]","[1.0, 0.75, 0.75, 0.5, 0.5, 0.75]","[[5, 8, 0, 0], [6, 16, 0, 0], [15, 16, 0, 0], ..."


In [30]:
# Exploding an empty window list leaves a null placeholder row; keep only the
# rows that actually carry a window.
has_window = multi_sequence_transformed["movie_sequence"].notnull()
multi_sequence_transformed = multi_sequence_transformed.loc[has_window]

In [31]:
# User-level columns attached to every window: the join key plus the encoded
# sex and age-group features.
user_columns = ["user_id", "sex", "age_group_index"]

In [32]:
# Attach the user features to every window row (inner join on user_id).
multi_sequence_transformed = pd.merge(
    multi_sequence_transformed,
    users[user_columns],
    how="inner",
    on="user_id",
)

In [33]:
# Make the dtype of the encoded sex feature explicitly float.
multi_sequence_transformed["sex"] = multi_sequence_transformed["sex"].astype(
    float)

In [34]:
multi_sequence_transformed.head()

Unnamed: 0,user_id,movie_sequence,rating_sequence,genres_ids_sequence,sex,age_group_index
0,1,"[3118, 1010]","[0.75, 1.0]","[[8, 0, 0, 0], [3, 4, 12, 0]]",1.0,1
1,1,"[1010, 1673]","[1.0, 0.75]","[[3, 4, 12, 0], [8, 14, 0, 0]]",1.0,1
2,1,"[1673, 1251]","[0.75, 1.0]","[[8, 14, 0, 0], [5, 15, 0, 0]]",1.0,1
3,1,"[1251, 2272]","[1.0, 0.5]","[[5, 15, 0, 0], [14, 0, 0, 0]]",1.0,1
4,1,"[2272, 1769]","[0.5, 1.0]","[[14, 0, 0, 0], [8, 0, 0, 0]]",1.0,1


### Assign Rating


In [35]:
def _last_item(window):
    """Return the final element of a window."""
    return window[-1]


# The prediction target is the last position of each window: the movie at
# that position and the (normalised) rating it received.
multi_sequence_transformed["target_movie"] = (
    multi_sequence_transformed["movie_sequence"].apply(_last_item)
)
multi_sequence_transformed["target_rating"] = (
    multi_sequence_transformed["rating_sequence"].apply(_last_item)
)

In [36]:
# Ratings of the history are assumed to be unavailable at inference time, so
# the rating windows are not kept as a model input.
multi_sequence_transformed = multi_sequence_transformed.drop(
    columns=["rating_sequence"]
)

In [37]:
multi_sequence_transformed.head()

Unnamed: 0,user_id,movie_sequence,genres_ids_sequence,sex,age_group_index,target_movie,target_rating
0,1,"[3118, 1010]","[[8, 0, 0, 0], [3, 4, 12, 0]]",1.0,1,1010,1.0
1,1,"[1010, 1673]","[[3, 4, 12, 0], [8, 14, 0, 0]]",1.0,1,1673,0.75
2,1,"[1673, 1251]","[[8, 14, 0, 0], [5, 15, 0, 0]]",1.0,1,1251,1.0
3,1,"[1251, 2272]","[[5, 15, 0, 0], [14, 0, 0, 0]]",1.0,1,2272,0.5
4,1,"[2272, 1769]","[[14, 0, 0, 0], [8, 0, 0, 0]]",1.0,1,1769,1.0


### Padding Sequence


In [38]:
# Length of the longest window; every sequence is padded up to this size.
max_length = max(map(len, multi_sequence_transformed["movie_sequence"]))

In [39]:
# Right-pad every movie window with the "UNK" index, then cut it to max_length.
movie_padding = [movie_id_map_dict["UNK"]] * max_length
multi_sequence_transformed["movie_sequence"] = multi_sequence_transformed[
    "movie_sequence"
].apply(lambda ids: (ids + movie_padding)[:max_length])

In [40]:
def padding_genres_id(genres_ids_sequence, target_length=None, genre_width=None,
                      pad_id=None):
    """Pad (or truncate) a window of genre-id lists to a fixed length.

    The input list is left untouched and every padding row is a separate list
    object. (The previous version appended to the caller's list in place and
    reused one shared padding list for all added rows.)

    Args:
        genres_ids_sequence: List of per-movie genre-id lists.
        target_length: Number of rows in the result. Defaults to the
            notebook-level ``max_length``.
        genre_width: Number of ids in each padding row. Defaults to the
            notebook-level ``max_genres_length``.
        pad_id: Id used to fill padding rows. Defaults to
            ``genres_map_dict["UNK"]``.

    Returns:
        list: A new list with exactly ``target_length`` rows: the first rows
        of the input followed by as many padding rows as needed.
    """
    # Fall back to the notebook globals only for arguments that were omitted,
    # so existing single-argument calls behave as before.
    if target_length is None:
        target_length = max_length
    if genre_width is None:
        genre_width = max_genres_length
    if pad_id is None:
        pad_id = genres_map_dict["UNK"]

    padded = list(genres_ids_sequence[:target_length])
    missing_rows = target_length - len(padded)
    # range() of a non-positive count is empty, so nothing is added when the
    # input is already long enough.
    padded.extend([pad_id] * genre_width for _ in range(missing_rows))
    return padded

In [41]:
# Bring every genre window to the same number of rows as the movie windows.
multi_sequence_transformed["genres_ids_sequence"] = (
    multi_sequence_transformed["genres_ids_sequence"].apply(padding_genres_id)
)

In [42]:
multi_sequence_transformed.head()

Unnamed: 0,user_id,movie_sequence,genres_ids_sequence,sex,age_group_index,target_movie,target_rating
0,1,"[3118, 1010, 0, 0, 0, 0]","[[8, 0, 0, 0], [3, 4, 12, 0], [0, 0, 0, 0], [0...",1.0,1,1010,1.0
1,1,"[1010, 1673, 0, 0, 0, 0]","[[3, 4, 12, 0], [8, 14, 0, 0], [0, 0, 0, 0], [...",1.0,1,1673,0.75
2,1,"[1673, 1251, 0, 0, 0, 0]","[[8, 14, 0, 0], [5, 15, 0, 0], [0, 0, 0, 0], [...",1.0,1,1251,1.0
3,1,"[1251, 2272, 0, 0, 0, 0]","[[5, 15, 0, 0], [14, 0, 0, 0], [0, 0, 0, 0], [...",1.0,1,2272,0.5
4,1,"[2272, 1769, 0, 0, 0, 0]","[[14, 0, 0, 0], [8, 0, 0, 0], [0, 0, 0, 0], [0...",1.0,1,1769,1.0


In [43]:
# Row-wise random split: each window lands in the training set with
# probability 0.85, otherwise in the test set. The draw uses NumPy's global
# generator. Rows are assigned independently, so overlapping windows of the
# same user can end up on both sides of the split.
random_selection = np.random.rand(len(multi_sequence_transformed)) <= 0.85
train_data = multi_sequence_transformed.loc[random_selection]
test_data = multi_sequence_transformed.loc[~random_selection]

## (3) Save


In [44]:
# Users are assumed to be anonymous at inference time, so the identifier is
# removed from both splits.
train_data = train_data.drop(columns=["user_id"])
test_data = test_data.drop(columns=["user_id"])

In [45]:
# Write both splits to the artifacts folder as Parquet files.
train_data.to_parquet("artifacts/train_data.parquet")
test_data.to_parquet("artifacts/test_data.parquet")