# Imports

In [11]:
from sentence_transformers import SentenceTransformer
import tensorflow_datasets as tfds

# Load Df

In [12]:
df = tfds.load('movielens/100k-ratings', split='train')
df = tfds.as_dataframe(df)

df = df.head(10000)
df.head()

2025-07-27 21:31:04.658416: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,raw_user_age,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,45.0,[7],b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",46.0,879024327,True,b'138',4,b'doctor',4.0,b'53211'
1,25.0,"[4, 14]",b'709',b'Strictly Ballroom (1992)',32.0,875654590,True,b'92',5,b'entertainment',2.0,b'80525'
2,18.0,[4],b'412',"b'Very Brady Sequel, A (1996)'",24.0,882075110,True,b'301',17,b'student',4.0,b'55439'
3,50.0,"[5, 7]",b'56',b'Pulp Fiction (1994)',50.0,883326919,True,b'60',4,b'healthcare',4.0,b'06472'
4,50.0,"[10, 16]",b'895',b'Scream 2 (1997)',55.0,891409199,True,b'197',18,b'technician',3.0,b'75094'


## Processing

In [13]:
df.drop(['bucketized_user_age','movie_id','raw_user_age','user_occupation_text','timestamp','user_gender','user_occupation_label','user_zip_code'], axis=1,inplace=True)

df['user_id'] = df['user_id'].apply(lambda x: x.decode('utf-8')) # retirar o b'' das string
df['user_rating'] = df['user_rating'] / 10 # normalizar entre 0 e 1


In [14]:
genre_dict = {
    0: 'Action',
    1: 'Drama',
    2: 'Comedy',
    3: 'Thriller',
    4: 'Romance',
    5: 'Sci-Fi',
    6: 'Fantasy',
    7: 'Documentary',
    8: 'Horror',
    9: 'Adventure',
    10: 'Animation',
    11: 'Biography',
    12: 'Crime',
    13: 'Family',
    14: 'History',
    15: 'Music',
    16: 'Mystery',
    17: 'Sport',
    18: 'War',
    19: 'Western'
}

df['movie_genres'] = df['movie_genres'].apply(lambda x: [genre_dict[i] for i in x])
df.head(10)


Unnamed: 0,movie_genres,movie_title,user_id,user_rating
0,[Documentary],"b""One Flew Over the Cuckoo's Nest (1975)""",138,0.4
1,"[Romance, History]",b'Strictly Ballroom (1992)',92,0.2
2,[Romance],"b'Very Brady Sequel, A (1996)'",301,0.4
3,"[Sci-Fi, Documentary]",b'Pulp Fiction (1994)',60,0.4
4,"[Animation, Mystery]",b'Scream 2 (1997)',197,0.3
5,"[Documentary, Mystery]",b'Crash (1996)',601,0.4
6,"[Comedy, Thriller, Romance, Crime]",b'Aladdin (1992)',710,0.3
7,"[Action, Sci-Fi, History]",b'True Romance (1993)',833,0.2
8,[Romance],b'Bob Roberts (1992)',916,0.5
9,"[Action, Drama, Music, War]",b'Starship Troopers (1997)',940,0.2


In [15]:
def safe_decode(value):
    if isinstance(value, bytes): 
        return value.decode('utf-8')
    return value  

# Combining columns to create more meaningful embeddings
df['title_and_genres'] = df.apply(lambda row: safe_decode(row['movie_title']) + " - " + ', '.join(row['movie_genres']), axis=1)
df.head()

Unnamed: 0,movie_genres,movie_title,user_id,user_rating,title_and_genres
0,[Documentary],"b""One Flew Over the Cuckoo's Nest (1975)""",138,0.4,One Flew Over the Cuckoo's Nest (1975) - Docum...
1,"[Romance, History]",b'Strictly Ballroom (1992)',92,0.2,"Strictly Ballroom (1992) - Romance, History"
2,[Romance],"b'Very Brady Sequel, A (1996)'",301,0.4,"Very Brady Sequel, A (1996) - Romance"
3,"[Sci-Fi, Documentary]",b'Pulp Fiction (1994)',60,0.4,"Pulp Fiction (1994) - Sci-Fi, Documentary"
4,"[Animation, Mystery]",b'Scream 2 (1997)',197,0.3,"Scream 2 (1997) - Animation, Mystery"


In [16]:
df['user_id'] = df['user_id'].astype(int)
df.head()

Unnamed: 0,movie_genres,movie_title,user_id,user_rating,title_and_genres
0,[Documentary],"b""One Flew Over the Cuckoo's Nest (1975)""",138,0.4,One Flew Over the Cuckoo's Nest (1975) - Docum...
1,"[Romance, History]",b'Strictly Ballroom (1992)',92,0.2,"Strictly Ballroom (1992) - Romance, History"
2,[Romance],"b'Very Brady Sequel, A (1996)'",301,0.4,"Very Brady Sequel, A (1996) - Romance"
3,"[Sci-Fi, Documentary]",b'Pulp Fiction (1994)',60,0.4,"Pulp Fiction (1994) - Sci-Fi, Documentary"
4,"[Animation, Mystery]",b'Scream 2 (1997)',197,0.3,"Scream 2 (1997) - Animation, Mystery"


# Model

In [17]:
model = SentenceTransformer("all-MiniLM-L6-v2")

## Embedding generator

In [18]:
df['embeddings'] = list(model.encode(df['title_and_genres']))

In [19]:
df.head(10)

Unnamed: 0,movie_genres,movie_title,user_id,user_rating,title_and_genres,embeddings
0,[Documentary],"b""One Flew Over the Cuckoo's Nest (1975)""",138,0.4,One Flew Over the Cuckoo's Nest (1975) - Docum...,"[-0.031615485, 0.016313665, -0.019859074, 0.05..."
1,"[Romance, History]",b'Strictly Ballroom (1992)',92,0.2,"Strictly Ballroom (1992) - Romance, History","[0.005666763, -0.0066977255, -0.030158766, -0...."
2,[Romance],"b'Very Brady Sequel, A (1996)'",301,0.4,"Very Brady Sequel, A (1996) - Romance","[-0.02939043, -0.0384848, -0.02393693, -0.0078..."
3,"[Sci-Fi, Documentary]",b'Pulp Fiction (1994)',60,0.4,"Pulp Fiction (1994) - Sci-Fi, Documentary","[-0.05960273, -0.036336366, -0.06602257, -0.01..."
4,"[Animation, Mystery]",b'Scream 2 (1997)',197,0.3,"Scream 2 (1997) - Animation, Mystery","[-0.0460111, -0.01660724, -0.008522717, 0.0194..."
5,"[Documentary, Mystery]",b'Crash (1996)',601,0.4,"Crash (1996) - Documentary, Mystery","[-0.007400912, 0.021577645, 0.0022665262, 0.02..."
6,"[Comedy, Thriller, Romance, Crime]",b'Aladdin (1992)',710,0.3,"Aladdin (1992) - Comedy, Thriller, Romance, Crime","[-0.044823356, 0.039115388, -0.024226058, -0.0..."
7,"[Action, Sci-Fi, History]",b'True Romance (1993)',833,0.2,"True Romance (1993) - Action, Sci-Fi, History","[-0.10908171, -0.043817002, 0.022998724, 0.077..."
8,[Romance],b'Bob Roberts (1992)',916,0.5,Bob Roberts (1992) - Romance,"[-0.025987634, 0.017029107, -0.07255637, -0.02..."
9,"[Action, Drama, Music, War]",b'Starship Troopers (1997)',940,0.2,"Starship Troopers (1997) - Action, Drama, Musi...","[-0.059337005, -0.075234905, -0.032056466, -0...."
