## Enviroment setup

In [None]:
%pip install pandas numpy requests Pillow dotenv

## Imports

In [37]:
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
import os
import re
from typing import Optional
import time
from dotenv import load_dotenv

## Visualizing data

In [27]:
movies_train = pd.read_csv('data/movies_train.csv')
movies_test = pd.read_csv('data/movies_test.csv')

print("Movies Train:\n", movies_train.head(), '\n', movies_train.columns)
print("\nMovies Test:\n", movies_test.head(), '\n', movies_test.columns)

Movies Train:
    movieId                             title          genres  \
0      619                         Ed (1996)          Comedy   
1    33826                Saint Ralph (2004)    Comedy|Drama   
2     1298       Pink Floyd: The Wall (1982)   Drama|Musical   
3   140289              Men & Chicken (2015)    Comedy|Drama   
4     3064  Poison Ivy: New Seduction (1997)  Drama|Thriller   

   (no genres listed)  Action  Adventure  Animation  Children  Comedy  Crime  \
0                   0       0          0          0         0       1      0   
1                   0       0          0          0         0       1      0   
2                   0       0          0          0         0       0      0   
3                   0       0          0          0         0       1      0   
4                   0       0          0          0         0       0      0   

   ...  Film-Noir  Horror  IMAX  Musical  Mystery  Romance  Sci-Fi  Thriller  \
0  ...          0       0     0        

In [28]:
movielens_movies = pd.read_csv('ml-25m/movies.csv')
movielens_links = pd.read_csv('ml-25m/links.csv')

print("Movielens Movies:\n", movielens_movies.head(), '\n', movielens_movies.columns)
print("\nMovielens Links:\n", movielens_links.head(), '\n', movielens_links.columns)

Movielens Movies:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy   
 Index(['movieId', 'title', 'genres'], dtype='object')

Movielens Links:
    movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0 
 Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')


## Merge movies_train/test with MovieLens

In [29]:
train_merged = movies_train[['movieId']].merge(
    movielens_movies,
    on='movieId',
    how='left'
)

test_merged = movies_test[['movieId']].merge(
    movielens_movies,
    on='movieId',
    how='left'
)

print("Merged Train:\n", train_merged.head())
print("\nMerged Test:\n", test_merged.head())

Merged Train:
    movieId                             title          genres
0      619                         Ed (1996)          Comedy
1    33826                Saint Ralph (2004)    Comedy|Drama
2     1298       Pink Floyd: The Wall (1982)   Drama|Musical
3   140289              Men & Chicken (2015)    Comedy|Drama
4     3064  Poison Ivy: New Seduction (1997)  Drama|Thriller

Merged Test:
    movieId                                   title  \
0    45635       Notorious Bettie Page, The (2005)   
1     1373  Star Trek V: The Final Frontier (1989)   
2     7325                  Starsky & Hutch (2004)   
3      389              Colonel Chabert, Le (1994)   
4     8920                Country Girl, The (1954)   

                         genres  
0                         Drama  
1                 Action|Sci-Fi  
2  Action|Comedy|Crime|Thriller  
3             Drama|Romance|War  
4                         Drama  


## Merge con Links

In [30]:
train_complete = train_merged.merge(
    movielens_links,
    on='movieId',
    how='inner'
)

test_complete = test_merged.merge(
    movielens_links,
    on='movieId',
    how='inner'
)

print("Complete Train:\n", train_complete.head())
print("\nComplete Test:\n", test_complete.head())

Complete Train:
    movieId                             title          genres   imdbId  \
0      619                         Ed (1996)          Comedy   116165   
1    33826                Saint Ralph (2004)    Comedy|Drama   384488   
2     1298       Pink Floyd: The Wall (1982)   Drama|Musical    84503   
3   140289              Men & Chicken (2015)    Comedy|Drama  3877674   
4     3064  Poison Ivy: New Seduction (1997)  Drama|Thriller   119908   

     tmdbId  
0   32308.0  
1   25248.0  
2   12104.0  
3  296313.0  
4   18222.0  

Complete Test:
    movieId                                   title  \
0    45635       Notorious Bettie Page, The (2005)   
1     1373  Star Trek V: The Final Frontier (1989)   
2     7325                  Starsky & Hutch (2004)   
3      389              Colonel Chabert, Le (1994)   
4     8920                Country Girl, The (1954)   

                         genres  imdbId   tmdbId  
0                         Drama  404802  15402.0  
1               

## Extract years from titles

In [31]:
def extract_year(title: str) -> Optional[int]:
    if pd.isna(title):
        return None
    match = re.search(r'\((\d{4})\)', title)
    return int(match.group(1)) if match else None

train_complete['year'] = train_complete['title'].apply(extract_year)
test_complete['year'] = test_complete['title'].apply(extract_year)

## Find and fix missing values

In [34]:
train_missing = train_complete[train_complete['year'].isna()]
test_missing = test_complete[test_complete['year'].isna()]

print(f"Train missing years: {len(train_missing)}")
display(train_missing)

print(f"Test missing years: {len(test_missing)}")
display(test_missing)

Train missing years: 11


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,year
832,156605,Paterson,(no genres listed),5247022,370755.0,
2482,171749,Death Note: Desu nôto (2006–2007),(no genres listed),877057,419787.0,
3745,171631,Maria Bamford: Old Baby,(no genres listed),6264596,455601.0,
3752,140956,Ready Player One,Action|Sci-Fi|Thriller,1677720,333339.0,
4269,167570,The OA,(no genres listed),4635282,432192.0,
4415,171495,Cosmos,(no genres listed),81846,409926.0,
4459,171891,Generation Iron 2,(no genres listed),6263642,447818.0,
4627,176601,Black Mirror,(no genres listed),2492564,452830.0,
5379,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),229922,127605.0,
5819,143410,Hyena Road,(no genres listed),4034452,316042.0,


Test missing years: 1


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,year
507,162414,Moonlight,Drama,4975722,376867.0,


In [38]:
load_dotenv()
TMDB_API_KEY = os.getenv('TMDB_API_KEY')
BASE_URL = 'https://api.themoviedb.org/3'

def get_year_from_tmdb(row):
    if pd.notna(row['year']):
        return row['year']
    
    if pd.isna(row['tmdbId']):
        return None
    
    try:
        url = f"{BASE_URL}/movie/{int(row['tmdbId'])}"
        response = requests.get(url, params={'api_key': TMDB_API_KEY}, timeout=10)
        
        if response.status_code == 200:
            release_date = response.json().get('release_date', '')
            if release_date:
                time.sleep(0.25)
                return int(release_date.split('-')[0])
        
        time.sleep(0.25)
        return None
    except:
        return None
    
train_complete['year'] = train_complete.apply(get_year_from_tmdb, axis=1) # type: ignore
test_complete['year'] = test_complete.apply(get_year_from_tmdb, axis=1) # type: ignore

print("Missing years after TMDB lookup:", train_complete['year'].isna().sum(), test_complete['year'].isna().sum())

Missing years after TMDB lookup: 4 0
