In [20]:
import pandas as pd

# Read the scraped Letterboxd film list that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer="./letterboxd_films.csv")

# Bare last expression: the notebook renders the frame as the cell's output.
df

Unnamed: 0,name,rating,genres,avg_user_rating
0,mickey-17,★★★★,"Comedy, Adventure, Science Fiction",3.7
1,companion-2025,★★★★,"Thriller, Horror, Science Fiction, Comedy",3.5
2,nosferatu-2024,★★★½,"Fantasy, Horror",3.7
3,gladiator-ii,★★★★,"Drama, Action, Adventure",3.3
4,the-wild-robot,★★★★½,"Family, Science Fiction, Animation",4.2
...,...,...,...,...
464,grease,★★★,"Comedy, Romance",3.5
465,star-wars,★★★★,"Adventure, Action, Science Fiction",4.2
466,rocky,★★★,Drama,4.1
467,the-exorcist,★★★★,Horror,4.0


In [None]:
import pandas as pd
import requests
import re
import time
from tqdm import tqdm
import multiprocessing


# Function from letterboxd2imdb.py
def get_imdb_id(letterboxd_uri, timeout=10):
    """Fetch a Letterboxd film page and return the IMDb ID it links to.

    Parameters
    ----------
    letterboxd_uri : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a single stalled connection would block the
        caller indefinitely.

    Returns
    -------
    str or None
        The ``tt``-prefixed ID from the first ``.../title/<id>/maindetails``
        link on the page. ``None`` when the request fails, the response
        status is not 200, or no such link is present.
    """
    try:
        resp = requests.get(letterboxd_uri, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort lookup: report the failure and let the caller carry on.
        print(f"Error fetching {letterboxd_uri}: {e}")
        return None

    if resp.status_code != 200:
        return None

    # [^"]+ keeps the match inside a single href attribute. A greedy ".+"
    # would run on to the last matching link on the same line and could
    # return the wrong ID.
    match = re.search(r'href="[^"]+title/(tt\d+)/maindetails"', resp.text)
    return match.group(1) if match else None

# Load the scraped Letterboxd list and the MovieLens ID crosswalk.
letterboxd_df = pd.read_csv('letterboxd_films.csv')
links_df = pd.read_csv('data/links.csv')

# links.csv holds IMDb IDs as bare integers; rebuild the zero-padded
# 'tt0000000' form so they compare equal to the IDs scraped from Letterboxd.
links_df['imdb_id'] = links_df['imdbId'].map("tt{:07d}".format)

# Scrape each film page for its IMDb ID. This is the slow part of the cell:
# one HTTP request per film plus a fixed pause between requests.
print("Looking up IMDb IDs from Letterboxd...")
imdb_ids = []
for name in tqdm(letterboxd_df['name']):
    letterboxd_uri = f"https://letterboxd.com/film/{name}/"
    imdb_ids.append(get_imdb_id(letterboxd_uri))
    time.sleep(0.5)  # Add a delay to avoid hitting rate limits

letterboxd_df['imdb_id'] = imdb_ids

# Attach movieId and tmdbId in one left join on the IMDb ID. Films without
# a match keep their row and get NaN in both columns. (A second join on
# movieId would compare a NaN-filled float column against an int column
# just to fetch a value already sitting on the same links.csv row.)
print("Matching with MovieLens IDs...")
letterboxd_df = letterboxd_df.merge(
    links_df[['imdb_id', 'movieId', 'tmdbId']],
    on='imdb_id',
    how='left',
)

# Create Letterboxd URL using TMDb ID where available
letterboxd_df['letterboxd_url'] = letterboxd_df['tmdbId'].apply(
    lambda x: f"https://letterboxd.com/tmdb/{int(x)}/" if pd.notna(x) else None
)

# Save results
letterboxd_df.to_csv('letterboxd_films_with_ids.csv', index=False)

# Print statistics (guarding the percentage against an empty input file).
total = len(letterboxd_df)
matched = letterboxd_df['movieId'].notna().sum()
match_pct = matched / total * 100 if total else 0.0
print(f"Successfully matched {matched} out of {total} movies ({match_pct:.1f}%)")

# Print some examples
print("\nSample matches:")
sample = letterboxd_df.sample(min(10, total))
for _, row in sample.iterrows():
    # Missing values can surface as None or NaN after the merge; NaN is
    # truthy, so test with pd.notna rather than relying on `or`.
    imdb_label = row['imdb_id'] if pd.notna(row['imdb_id']) else 'Not found'
    # movieId is float once NaNs are present; cast back to int for display.
    movie_label = int(row['movieId']) if pd.notna(row['movieId']) else 'Not found'
    print(f"{row['name']} → IMDb ID: {imdb_label} → MovieId: {movie_label}")

Looking up IMDb IDs from Letterboxd...


 11%|█▏        | 53/469 [00:48<06:19,  1.10it/s]


KeyboardInterrupt: 