In [1]:
# Install dependencies if not already installed
!pip install -qU pandas llama-index-embeddings-gemini google-genai tqdm


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index-readers-file 0.5.6 requires pandas<3,>=2.0.0, but you have pandas 3.0.0 which is incompatible.[0m[31m
[0m

In [1]:
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger('llama_index').setLevel(logging.ERROR)

import pandas as pd
import ast
import os
import time
import getpass
from tqdm import tqdm
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from google.genai.types import EmbedContentConfig


In [2]:
# Setup Google API Key
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API Key: ")


In [17]:
def load_and_process_data(data_path):
    print('Loading movies_metadata.csv...')
    movies = pd.read_csv(os.path.join(data_path, "movies_metadata.csv"), low_memory=False)
    # Clean IDs
    movies = movies[pd.to_numeric(movies['id'], errors='coerce').notnull()]
    movies['id'] = movies['id'].astype(int)
    
    print('Loading credits.csv...')
    credits = pd.read_csv(os.path.join(data_path, "credits.csv"))
    credits['id'] = credits['id'].astype(int)
    
    print('Merging data...')
    data = movies.merge(credits, on='id')
    return data

data_path = "./data"
df_raw = load_and_process_data(data_path)
print(f"Total raw rows: {len(df_raw)}")


Loading movies_metadata.csv...
Loading credits.csv...
Merging data...
Total raw rows: 45538


In [18]:
def parse_json_col(x, key):
    try:
        items = ast.literal_eval(x)
        if isinstance(items, list):
            return [i[key] for i in items if isinstance(i, dict) and key in i]
        return []
    except:
        return []

print('Preprocessing columns (this may take a moment)...')
# Process necessary columns
df_raw['genres_list'] = df_raw['genres'].apply(lambda x: parse_json_col(x, 'name'))
df_raw['cast_list'] = df_raw['cast'].apply(lambda x: parse_json_col(x, 'name')[:5]) # Top 5 cast
df_raw['languages_list'] = df_raw['spoken_languages'].apply(lambda x: parse_json_col(x, 'name'))
df_raw['fullplot'] = df_raw['overview'].fillna('')
df_raw['title'] = df_raw['title'].fillna('')
df_raw['rating'] = pd.to_numeric(df_raw['vote_average'], errors='coerce').fillna(0)


Preprocessing columns (this may take a moment)...


In [19]:
# Create the text to embed
def create_combined_text(row):
    return (
        f"Title: {row['title']}\n"
        f"Plot: {row['fullplot']}\n"
        f"Cast: {', '.join(row['cast_list'])}\n"
        f"Genres: {', '.join(row['genres_list'])}\n"
        f"Languages: {', '.join(row['languages_list'])}\n"
        f"Rating: {row['rating']}"
    )

df_raw['combined_text'] = df_raw.apply(create_combined_text, axis=1)
print("Combined text created.")
print("Sample:")
print(df_raw['combined_text'].iloc[0])


Combined text created.
Sample:
Title: Toy Story
Plot: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
Cast: Tom Hanks, Tim Allen, Don Rickles, Jim Varney, Wallace Shawn
Genres: Animation, Comedy, Family
Languages: English
Rating: 7.7


In [26]:
# Initialize Google GenAI Embedding
model_name = "gemini-embedding-001"
embed_model = GoogleGenAIEmbedding(
    model_name=model_name,
    embedding_config=EmbedContentConfig(output_dimensionality=3072)
)
# Testing with one example
try:
    test_embed = embed_model.get_text_embedding("Hello World")
    print(f"Embedding successful. Dimension: {len(test_embed)}")
except Exception as e:
    print(f"Error initializing embedding model: {e}")


Embedding successful. Dimension: 3072


In [27]:
# --- SANITY CHECK ---
# Let's verify embeddings working on the actual data before running the full job
print("Running sanity check on top 3 rows...")
sample_texts = df_raw['combined_text'].head(3).tolist()
try:
    sample_embeddings = embed_model.get_text_embedding_batch(sample_texts)
    print(f"Successfully generated {len(sample_embeddings)} embeddings.")
    print(f"Embedding dimensions: {len(sample_embeddings[0])}")
    print("First 5 values of the first embedding:", sample_embeddings[0][:5])
    print("Sanity check PASSED. You can proceed to the full batch run below.")
except Exception as e:
    print(f"Sanity check FAILED: {e}")
    raise e

Running sanity check on top 3 rows...
Successfully generated 3 embeddings.
Embedding dimensions: 3072
First 5 values of the first embedding: [-0.027750747, -0.0032961427, 0.00066539174, -0.062563874, -0.009756115]
Sanity check PASSED. You can proceed to the full batch run below.


In [28]:
# Generate Embeddings with Batching
import math

# Configuration
BATCH_SIZE = 50 # Safe batch size
SAVE_INTERVAL = 1000 # Save every 1000 rows to a temp file
OUTPUT_FILE = os.path.join(data_path, "movies_with_embeddings.csv")

# work on a copy
df_processed = df_raw.copy()
embeddings = []
texts = df_processed['combined_text'].tolist()
total_batches = math.ceil(len(texts) / BATCH_SIZE)

print(f"Starting embedding generation for {len(texts)} items in {total_batches} batches...")

for i in tqdm(range(0, len(texts), BATCH_SIZE)):
    batch_texts = texts[i : i + BATCH_SIZE]
    try:
        # LlamaIndex helper for batch embeddings
        batch_embeddings = embed_model.get_text_embedding_batch(batch_texts)
        embeddings.extend(batch_embeddings)
        
        # Rate limit handling (Free tier is 15 RPM, we are doing batches)
        # A batch of 50 counts as ? If it counts as 1 request, we can go faster.
        # If it counts as 50 requests, we must sleep.
        # Usually batch endpoints count as 1 HTTP request but quota might be token based.
        # We'll sleep a little just to be safe and avoid 429s.
        time.sleep(2) 
        
    except Exception as e:
        print(f"Error at batch {i}: {e}")
        # Append None or zeros? Better to handle gracefully.
        # For now, we'll try to append empty lists to keep alignment
        embeddings.extend([[] for _ in range(len(batch_texts))])
        time.sleep(10) # Longer sleep on error

# Add embeddings to dataframe
df_processed['embedding'] = embeddings

# Filter out failed embeddings
df_final = df_processed[df_processed['embedding'].apply(len) > 0]
print(f"Finished. Successful embeddings: {len(df_final)} / {len(df_processed)}")


Starting embedding generation for 45538 items in 911 batches...


100%|██████████| 911/911 [1:08:42<00:00,  4.53s/it]


Finished. Successful embeddings: 45538 / 45538


In [None]:
# Save to CSV
print(f"Saving to {OUTPUT_FILE}...")
df_final.to_csv(OUTPUT_FILE, index=False)
print("Done.")


Saving to ./data/movies_with_embeddings.csv...
