# Import libraries and load cleaned data

In [16]:
import joblib
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

# Load cleaned data
# NOTE(review): unpickling can execute arbitrary code, so 'movies_df.pkl' must
# come from a trusted source (presumably an earlier cleaning step - confirm).
movies = pd.read_pickle('movies_df.pkl')

# Fail fast: 'combined_info' is the text that gets embedded and 'movieId' is
# written out with the embeddings. Checking here avoids discovering a missing
# column only after the slow embedding pass has already run.
required_columns = {'movieId', 'combined_info'}
missing_columns = required_columns - set(movies.columns)
assert not missing_columns, f"movies_df.pkl is missing columns: {sorted(missing_columns)}"

# BERT model

In [17]:
# Load the pretrained 'bert-base-uncased' checkpoint: the encoder itself and
# the tokenizer that turns raw text into inputs for it.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    """Embed text with BERT by mean-pooling the final hidden states.

    Args:
        text: A string (or a list of strings) to pass to the module-level
            ``tokenizer``. Inputs are truncated to at most 512 tokens.

    Returns:
        numpy.ndarray: The pooled embedding with singleton dimensions
        squeezed out, i.e. a single vector when one string is given.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Average over real tokens only. With padding=True a batch of texts is
    # padded to a common length, and a plain mean over dim=1 would mix the
    # padding positions into every shorter text's embedding. For a single
    # string the mask is all ones, so this equals the ordinary mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()

# Generate BERT embeddings
# tqdm.pandas() registers `progress_apply` on pandas objects so the slow
# row-by-row embedding pass reports progress.
tqdm.pandas(desc="Generating BERT embeddings")
# Pass the function directly; wrapping it in a lambda adds nothing.
movies['embedding'] = movies['combined_info'].progress_apply(get_bert_embeddings)

Generating BERT embeddings: 100%|██████████████████████████████████████████████████| 1682/1682 [02:21<00:00, 11.86it/s]


# Save BERT embeddings

In [18]:
# Persist only the id and embedding columns; the rest of `movies` is not saved.
joblib.dump(
    value=movies[['movieId', 'embedding']],
    filename='movies_embeddings.pkl',
)

['movies_embeddings.pkl']