In [1]:
%load_ext autoreload
%autoreload 2

## (1) Load Data

In [2]:
import wget
from zipfile import ZipFile
import pandas as pd

import utils

In [3]:
# movielens_data_file_url = (
#     "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
# )
# data_dir = "./data"

In [4]:
# data_file_path = wget.download(movielens_data_file_url,out=data_dir)
# ZipFile(data_file_path).extractall(data_dir)

In [5]:
df_movie = pd.read_csv("./data/ml-latest-small/movies.csv")

In [6]:
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
df_rating = pd.read_csv("./data/ml-latest-small/ratings.csv")

In [8]:
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
df_links = pd.read_csv("./data/ml-latest-small/links.csv")

In [10]:
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [11]:
df_tag = pd.read_csv("./data/ml-latest-small/tags.csv")

In [12]:
df_tag.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## (2) Preprocess Data

#### (1) Remap ID

In [13]:
user_ids = df_rating["userId"].unique().tolist()
unk_user_id = min(user_ids)-1
print("UNK User ID",unk_user_id)

# -1 for unknown user
user_id_map_dict = {x: i+1 for i, x in enumerate(user_ids)}
user_id_map_dict["UNK"]=unk_user_id
user_id_unmap_dict = {i: x for i, x in user_id_map_dict.items()}


movie_ids = df_rating["movieId"].unique().tolist()
unk_movie_id = min(movie_ids)-1
print("UNK Movie ID",unk_movie_id)
movie_id_map_dict = {x: i+1 for i, x in enumerate(movie_ids)}
movie_id_map_dict["UNK"]=unk_movie_id
movie_id_unmap_dict = {i: x for i, x in movie_id_map_dict.items()}

UNK User ID 0
UNK Movie ID 0


In [14]:
utils.save_object("./artifacts/user_id_map_dict.pkl",user_id_map_dict)

utils.save_object("./artifacts/movie_id_map_dict.pkl",movie_id_map_dict)

### (2) Map ID

In [15]:
from sklearn.preprocessing import MinMaxScaler

In [16]:
df_rating["user_embed_id"] = df_rating["userId"].map(user_id_map_dict)
df_rating["movie_embed_id"] = df_rating["movieId"].map(movie_id_map_dict)

In [17]:
num_users = len(user_id_map_dict)
num_movies = len(movie_id_map_dict)

min_rating = min(df_rating["rating"])
max_rating = max(df_rating["rating"])
print(f"min_rating:{min_rating}")
print(f"max_rating:{max_rating}")

min_rating:0.5
max_rating:5.0


In [18]:
min_max_scaler = MinMaxScaler()

In [19]:
df_rating["rating"] = min_max_scaler.fit_transform(
    df_rating["rating"].values.reshape(-1, 1))[:, 0]

In [20]:
utils.save_object("./artifacts/rating_min_max_scaler.pkl",min_max_scaler)

In [21]:
print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 611, Number of Movies: 9725, Min rating: 0.5, Max rating: 5.0


In [22]:
df_rating['rating'] = df_rating['rating']/5

In [23]:
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,user_embed_id,movie_embed_id
0,1,1,0.155556,964982703,1,1
1,1,3,0.155556,964981247,1,2
2,1,6,0.155556,964982224,1,3
3,1,47,0.2,964983815,1,4
4,1,50,0.2,964982931,1,5


In [24]:
df_movie = df_movie[df_movie['movieId'].isin(set(df_rating['movieId']))]

In [25]:
genres_set = set()
def get_genres_set(genres):
    global genres_set
    genres_split = genres.split("|")
    genres_set.update(genres_split)
    return genres_split

In [26]:
df_movie['genres'] = df_movie['genres'].apply(lambda x:get_genres_set(x))

In [27]:
genres_map_dict ={x: i+1 for i, x in enumerate(sorted(genres_set))}
unk_genres_id = 0
genres_map_dict['UNK']=unk_genres_id
print("UNK Genre ID",unk_genres_id)
genres_unmap_dict ={i: x for i, x in genres_map_dict.items()}

UNK Genre ID 0


In [28]:
utils.save_object("./artifacts/genres_map_dict.pkl",genres_map_dict)

In [29]:
df_movie['genres_embed_ids']= df_movie['genres'].apply(lambda x: [genres_map_dict[g] for g in x])

In [30]:
df_movie

Unnamed: 0,movieId,title,genres,genres_embed_ids
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[3, 4, 5, 6, 10]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[3, 5, 10]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[6, 16]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[6, 9, 16]"
4,5,Father of the Bride Part II (1995),[Comedy],[6]
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]","[2, 4, 6, 10]"
9738,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]","[4, 6, 10]"
9739,193585,Flint (2017),[Drama],[9]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]","[2, 4]"


In [31]:
df_rating = df_rating.merge(df_movie[['movieId','genres_embed_ids']],on=['movieId'])

In [32]:
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,user_embed_id,movie_embed_id,genres_embed_ids
0,1,1,0.155556,964982703,1,1,"[3, 4, 5, 6, 10]"
1,5,1,0.155556,847434962,5,1,"[3, 4, 5, 6, 10]"
2,7,1,0.177778,1106635946,7,1,"[3, 4, 5, 6, 10]"
3,15,1,0.088889,1510577970,15,1,"[3, 4, 5, 6, 10]"
4,17,1,0.177778,1305696483,17,1,"[3, 4, 5, 6, 10]"


In [33]:
df_processed  = df_rating[['user_embed_id','movie_embed_id','genres_embed_ids','rating']].copy()

In [34]:
df_processed.head()

Unnamed: 0,user_embed_id,movie_embed_id,genres_embed_ids,rating
0,1,1,"[3, 4, 5, 6, 10]",0.155556
1,5,1,"[3, 4, 5, 6, 10]",0.155556
2,7,1,"[3, 4, 5, 6, 10]",0.177778
3,15,1,"[3, 4, 5, 6, 10]",0.088889
4,17,1,"[3, 4, 5, 6, 10]",0.177778


In [35]:
df_processed.to_parquet("./data/processed.parquet")