# A Transformer-based recommendation system


In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import os
import math
from zipfile import ZipFile
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
from src import utils
from sklearn.preprocessing import MinMaxScaler

## (1) Load data

In [3]:
# Fetch and unpack MovieLens-1M only when it is not already on disk, so that a
# fresh "Restart Kernel -> Run All" works without re-downloading on every run.
if not os.path.isdir("ml-1m"):
    if not os.path.exists("movielens.zip"):
        urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
    # The context manager closes the archive handle once extraction is done.
    with ZipFile("movielens.zip", "r") as archive:
        archive.extractall()

Then, we load the data into pandas DataFrames with their proper column names.

In [4]:
# Fields in the .dat files are separated by "::"; a multi-character separator
# needs pandas' "python" parser engine. Column names are supplied explicitly,
# so the first line of every file is read as data (header=None).
users = pd.read_csv(
    "ml-1m/users.dat",
    sep="::",
    header=None,
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    engine="python",
)

ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    header=None,
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    engine="python",
)

# Only the movies file is read with an explicit ISO-8859-1 encoding.
movies = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    header=None,
    names=["movie_id", "title", "genres"],
    engine="python",
    encoding="ISO-8859-1",
)

## (2) Remap ID

#### User

In [5]:
# zip_code is not used as a feature. errors="ignore" keeps the cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
users = users.drop(columns="zip_code", errors="ignore")
users.head()

Unnamed: 0,user_id,sex,age_group,occupation
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15
3,4,M,45,7
4,5,M,25,20


In [6]:
def generate_remap_id_dict(df, col):
    """Build a value -> integer index mapping for ``df[col]`` and add it to ``df``.

    The distinct non-null values of the column are sorted and numbered from 1.
    Index 0 is reserved for the ``"UNK"`` key, which null entries are mapped to.

    Side effect: writes a new column named ``f"{col}_index"`` into ``df``.

    Args:
        df: DataFrame holding the column to encode (modified in place).
        col: Name of the column to encode.

    Returns:
        dict mapping every distinct non-null value to its index, plus ``"UNK": 0``.
    """
    distinct_values = sorted(df[col].dropna().unique().tolist())
    id_map_dict = dict(zip(distinct_values, range(1, len(distinct_values) + 1)))
    id_map_dict["UNK"] = 0

    # Null entries are relabelled "UNK" first so that they pick up index 0.
    encoded = df[col].fillna("UNK").map(id_map_dict)
    df[f"{col}_index"] = encoded

    return id_map_dict

In [7]:
# Adds users["user_id_index"] and returns the id -> index lookup (0 = "UNK").
user_id_map_dict = generate_remap_id_dict(df=users, col="user_id")
# Hand the lookup to the project helper together with its target path
# (presumably pickled by utils.save_object - confirm in src/utils).
utils.save_object("./artifacts/user_id_map_dict.pkl", user_id_map_dict)

In [8]:
users.head()

Unnamed: 0,user_id,sex,age_group,occupation,user_id_index
0,1,F,1,10,1
1,2,M,56,16,2
2,3,M,25,15,3
3,4,M,45,7,4
4,5,M,25,20,5


In [9]:
# Encode sex as a float feature: 'M' -> 0.0, 'F' -> 1.0, with 0.5 reserved for 'UNK'.
sex_id_map_dict = {'M':0.0,'F':1.0,'UNK':0.5}
utils.save_object("./artifacts/sex_id_map_dict.pkl",sex_id_map_dict)
# Missing values are relabelled 'UNK' before the lookup so that they receive the
# reserved 0.5 code instead of staying NaN (same convention as generate_remap_id_dict).
users['sex'] = users['sex'].fillna('UNK').map(sex_id_map_dict)

In [10]:
# Adds users["occupation_index"] and returns the occupation -> index lookup (0 = "UNK").
occupation_id_map_dict = generate_remap_id_dict(df=users, col="occupation")
utils.save_object("./artifacts/occupation_id_map_dict.pkl", occupation_id_map_dict)

In [11]:
# Adds users["age_group_index"] and returns the age group -> index lookup (0 = "UNK").
age_group_id_map_dict = generate_remap_id_dict(df=users, col="age_group")
utils.save_object("./artifacts/age_group_id_map_dict.pkl", age_group_id_map_dict)
# Display the resulting lookup table.
age_group_id_map_dict

{1: 1, 18: 2, 25: 3, 35: 4, 45: 5, 50: 6, 56: 7, 'UNK': 0}

In [12]:
users.head()

Unnamed: 0,user_id,sex,age_group,occupation,user_id_index,occupation_index,age_group_index
0,1,1.0,1,10,1,11,1
1,2,0.0,56,16,2,17,7
2,3,0.0,25,15,3,16,3
3,4,0.0,45,7,4,8,5
4,5,0.0,25,20,5,21,3


#### Rating

In [13]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [14]:
# Scaler with scikit-learn's default settings; it is fitted on the ratings in the next cell.
min_max_scaler = MinMaxScaler()

In [15]:
# The scaler expects a 2-D array, so the ratings are reshaped into a single
# column, fitted and transformed, then flattened back into a 1-D column.
rating_matrix = ratings["rating"].to_numpy().reshape(-1, 1)
ratings["norm_rating"] = min_max_scaler.fit_transform(rating_matrix)[:, 0]

In [16]:
# Pass the fitted scaler and its target path to the project helper
# (presumably so the same scaling can be reused later - confirm in src/utils).
utils.save_object("./artifacts/rating_min_max_scaler.pkl",min_max_scaler)

#### Movie

In [17]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
# Adds movies["movie_id_index"] and returns the movie id -> index lookup (0 = "UNK").
movie_id_id_map_dict = generate_remap_id_dict(df=movies, col="movie_id")
utils.save_object("./artifacts/movie_id_map_dict.pkl", movie_id_id_map_dict)

In [19]:
# Collects every distinct genre name passed through get_genres_set.
genres_set = set()


def get_genres_set(genres):
    """Split a "|"-delimited genre string and record its genres in ``genres_set``.

    Args:
        genres: String of genre names joined by "|".

    Returns:
        list of the individual genre names, in their original order.
    """
    # The module-level set is mutated (never rebound), so no `global` is required.
    names = genres.split("|")
    for name in names:
        genres_set.add(name)
    return names

In [20]:
# Replace each "|"-joined genre string with a list of names; get_genres_set also
# records the names in genres_set. Not re-runnable as is: the lists produced here
# have no .split, so a second pass over the column would fail.
movies['genres'] = movies['genres'].apply(get_genres_set)

In [21]:
# Number the genres in sorted order starting at 1; 0 is kept for the "UNK" placeholder.
genres_map_dict = {genre: idx for idx, genre in enumerate(sorted(genres_set), start=1)}
genres_map_dict['UNK'] = 0

In [22]:
# Pass the genre -> index lookup and its target path to the project helper.
utils.save_object("./artifacts/genres_map_dict.pkl",genres_map_dict)

In [23]:
# Translate every movie's list of genre names into the matching list of genre indices.
movies['genres_ids'] = movies['genres'].apply(
    lambda names: [genres_map_dict[name] for name in names]
)

In [24]:
# Attach the remapped movie and user indices to every rating row.
for raw_col, id_map in (("movie_id", movie_id_id_map_dict), ("user_id", user_id_map_dict)):
    ratings[f"{raw_col}_index"] = ratings[raw_col].map(id_map)

In [25]:
# Attach each movie's genre indices to every rating of that movie (inner join on
# the movie index). User features are not joined here; they are merged in later,
# after the ratings have been cut into windows.
df_user_views = pd.merge(
    left=ratings[["movie_id_index", "user_id_index", "unix_timestamp", "norm_rating"]],
    right=movies[["movie_id_index", "genres_ids"]],
    how="inner",
    on="movie_id_index",
)

In [26]:
df_user_views.head()

Unnamed: 0,movie_id_index,user_id_index,unix_timestamp,norm_rating,genres_ids
0,1177,1,978300760,1.0,[8]
1,1177,2,978298413,1.0,[8]
2,1177,12,978220179,0.75,[8]
3,1177,15,978199279,0.75,[8]
4,1177,17,978158471,1.0,[8]


## (3) Transform into sequences

In [27]:
# Order all views by timestamp first, then group them per user; rows keep that
# order inside each group.
df_agg_ratings = (
    df_user_views
    .sort_values(by="unix_timestamp")
    .groupby(by="user_id_index")
)

In [28]:
# One row per user: the group key plus that user's movies, normalised ratings
# and timestamps, each collected into a list in the grouped frame's row order.
user_ids = list(df_agg_ratings.groups)
movie_lists = df_agg_ratings["movie_id_index"].apply(list).tolist()
rating_lists = df_agg_ratings["norm_rating"].apply(list).tolist()
timestamp_lists = df_agg_ratings["unix_timestamp"].apply(list).tolist()

ratings_data = pd.DataFrame(
    data={
        "user_id_index": user_ids,
        "movie_sequence": movie_lists,
        "rating_sequence": rating_lists,
        "timestamps": timestamp_lists,
    }
)


In [29]:
ratings_data.head()

Unnamed: 0,user_id_index,movie_sequence,rating_sequence,timestamps
0,1,"[3118, 1010, 1673, 1251, 2272, 1769, 3340, 119...","[0.75, 1.0, 0.75, 1.0, 0.5, 1.0, 0.75, 0.75, 1...","[978300019, 978300055, 978300055, 978300055, 9..."
1,2,"[1181, 1200, 1193, 2649, 1274, 2875, 1208, 117...","[0.75, 0.5, 0.75, 0.5, 1.0, 0.75, 1.0, 1.0, 1....","[978298124, 978298151, 978298151, 978298196, 9..."
2,3,"[590, 2790, 3466, 1900, 1893, 1408, 1247, 3603...","[0.5, 0.75, 0.5, 0.75, 0.75, 0.5, 1.0, 1.0, 1....","[978297018, 978297039, 978297068, 978297068, 9..."
3,4,"[1193, 1082, 477, 3459, 3400, 1179, 258, 1181,...","[0.5, 0.75, 0.75, 0.0, 1.0, 0.25, 1.0, 1.0, 1....","[978293924, 978293964, 978294008, 978294008, 9..."
4,5,"[897, 2649, 908, 353, 1231, 1112, 2790, 2120, ...","[0.75, 0.0, 0.75, 0.0, 1.0, 0.0, 0.75, 0.0, 0....","[978241072, 978241072, 978241072, 978241112, 9..."


In [30]:
# Number of consecutive items in each window produced by create_sequences.
sequence_length = 4
# Offset between the start positions of successive windows; because it is
# smaller than sequence_length, neighbouring windows overlap.
step_size = 2


def create_sequences(values, window_size, step_size):
    """Cut ``values`` into fixed-length windows taken every ``step_size`` items.

    Windows start at positions 0, step_size, 2 * step_size, ... as long as a
    full window fits. If the last of those windows does not reach the end of
    ``values``, one extra window covering the final ``window_size`` items is
    appended so that the tail is not lost. That tail window is added only when
    it differs from the last regular window, so no window is emitted twice.

    Args:
        values: Sliceable sequence (e.g. a list) to cut into windows.
        window_size: Length of every window; must be positive.
        step_size: Distance between consecutive window starts; must be positive.

    Returns:
        list of slices of ``values``, each of length ``window_size``. Empty when
        ``values`` holds fewer than ``window_size`` items.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive (such
            values would otherwise never terminate).
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    last_start = len(values) - window_size
    if last_start < 0:
        # Not enough items for even a single full window.
        return []

    starts = list(range(0, last_start + 1, step_size))
    # Cover the tail only when the regular stride did not already land on it.
    if starts[-1] != last_start:
        starts.append(last_start)

    return [values[start:start + window_size] for start in starts]


# Cut both per-user lists with the same window settings so that the movie
# windows and the rating windows stay position-aligned.
for seq_col in ("movie_sequence", "rating_sequence"):
    ratings_data[seq_col] = ratings_data[seq_col].apply(
        create_sequences, args=(sequence_length, step_size)
    )

# The timestamps column is removed before the frame is reshaped further.
del ratings_data["timestamps"]

In [31]:
ratings_data.head()

Unnamed: 0,user_id_index,movie_sequence,rating_sequence
0,1,"[[3118, 1010, 1673, 1251], [1673, 1251, 2272, ...","[[0.75, 1.0, 0.75, 1.0], [0.75, 1.0, 0.5, 1.0]..."
1,2,"[[1181, 1200, 1193, 2649], [1193, 2649, 1274, ...","[[0.75, 0.5, 0.75, 0.5], [0.75, 0.5, 1.0, 0.75..."
2,3,"[[590, 2790, 3466, 1900], [3466, 1900, 1893, 1...","[[0.5, 0.75, 0.5, 0.75], [0.5, 0.75, 0.75, 0.5..."
3,4,"[[1193, 1082, 477, 3459], [477, 3459, 3400, 11...","[[0.5, 0.75, 0.75, 0.0], [0.75, 0.0, 1.0, 0.25..."
4,5,"[[897, 2649, 908, 353], [908, 353, 1231, 1112]...","[[0.75, 0.0, 0.75, 0.0], [0.75, 0.0, 1.0, 0.0]..."


In [32]:
# Flatten the per-user lists of windows so that every row holds exactly one window.
# Both frames are re-indexed from 0 by explode, which lets the column-wise concat
# pair each movie window with the rating window at the same position.
ratings_data_movies = ratings_data.loc[:, ["user_id_index", "movie_sequence"]].explode(
    column="movie_sequence", ignore_index=True
)
ratings_data_rating = ratings_data.loc[:, ["rating_sequence"]].explode(
    column="rating_sequence", ignore_index=True
)
ratings_data_transformed = pd.concat(
    objs=[ratings_data_movies, ratings_data_rating], axis="columns"
)

In [33]:
ratings_data_transformed.head()

Unnamed: 0,user_id_index,movie_sequence,rating_sequence
0,1,"[3118, 1010, 1673, 1251]","[0.75, 1.0, 0.75, 1.0]"
1,1,"[1673, 1251, 2272, 1769]","[0.75, 1.0, 0.5, 1.0]"
2,1,"[2272, 1769, 3340, 1190]","[0.5, 1.0, 0.75, 0.75]"
3,1,"[3340, 1190, 2736, 258]","[0.75, 0.75, 1.0, 0.75]"
4,1,"[2736, 258, 1177, 712]","[1.0, 0.75, 1.0, 0.5]"


In [34]:
# Columns of `users` that are joined onto every window: the join key plus the user features.
user_columns = [
    'user_id_index',
    'sex',
    'occupation_index',
    'age_group_index',
]

In [35]:
# Attach the selected user features to every window of that user (inner join on the user index).
ratings_data_transformed = pd.merge(
    left=ratings_data_transformed,
    right=users[user_columns],
    how="inner",
    on="user_id_index",
)

In [36]:
# Make sure the sex feature is stored with a float dtype.
ratings_data_transformed = ratings_data_transformed.astype({'sex': float})

In [37]:
# Seeded generator so that the train/test assignment is identical on every run.
split_rng = np.random.default_rng(42)
# Each row lands in the training set with probability 0.85.
random_selection = split_rng.random(len(ratings_data_transformed.index)) <= 0.85
train_data = ratings_data_transformed[random_selection]
test_data = ratings_data_transformed[~random_selection]
# NOTE(review): rows are windows that overlap whenever step_size < sequence_length,
# so a row-level split can place overlapping windows of one user in both sets -
# confirm this is acceptable for evaluation.

In [38]:
# Write both splits to the artifacts directory as parquet files.
for split_frame, parquet_path in (
    (train_data, "./artifacts/train_data.parquet"),
    (test_data, "./artifacts/test_data.parquet"),
):
    split_frame.to_parquet(parquet_path)