In [14]:
import pandas as pd
import gzip
import requests
import shutil

NAME_BASIC_URL = "https://datasets.imdbws.com/name.basics.tsv.gz"

# Stream the archive straight to disk. The context manager releases the
# connection when done, the timeout keeps a stalled server from hanging the
# kernel, and raise_for_status() stops an HTTP error page from being saved as
# the archive (which would only fail later, inside gzip, with a confusing error).
with requests.get(NAME_BASIC_URL, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open("name.basics.tsv.gz", "wb") as f:
        shutil.copyfileobj(response.raw, f)

# Decompress the archive into a plain TSV file next to it.
with gzip.open("name.basics.tsv.gz", "rb") as f_in:
    with open("name.basics.tsv", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# NOTE: missing values arrive as the literal string "\N" (see deathYear in the
# displayed sample); they are kept as-is here, not converted to NaN.
df = pd.read_csv("name.basics.tsv", sep="\t", low_memory=False)

df.head(20)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924,2014,"actress,miscellaneous,soundtrack","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"
5,nm0000006,Ingrid Bergman,1915,1982,"actress,producer,soundtrack","tt0034583,tt0038109,tt0036855,tt0038787"
6,nm0000007,Humphrey Bogart,1899,1957,"actor,producer,miscellaneous","tt0034583,tt0043265,tt0037382,tt0033870"
7,nm0000008,Marlon Brando,1924,2004,"actor,director,writer","tt0078788,tt0068646,tt0047296,tt0070849"
8,nm0000009,Richard Burton,1925,1984,"actor,producer,director","tt0061184,tt0087803,tt0057877,tt0059749"
9,nm0000010,James Cagney,1899,1986,"actor,director,producer","tt0029870,tt0031867,tt0042041,tt0034236"


## Study
    - `Bruce Lee was born in 1940 and he was actor and producer.`

After checking the dataframe sample, I assume that for the proposed problem I should take the data from the columns `nconst`, `primaryName`, `birthYear` and `primaryProfession`.

In [15]:
TITLE_BASIC_URL = "https://datasets.imdbws.com/title.basics.tsv.gz"

# Stream the archive straight to disk. The context manager releases the
# connection when done, the timeout keeps a stalled server from hanging the
# kernel, and raise_for_status() stops an HTTP error page from being saved as
# the archive (which would only fail later, inside gzip, with a confusing error).
with requests.get(TITLE_BASIC_URL, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open("title.basics.tsv.gz", "wb") as f:
        shutil.copyfileobj(response.raw, f)

# Decompress the archive into a plain TSV file next to it.
with gzip.open("title.basics.tsv.gz", "rb") as f_in:
    with open("title.basics.tsv", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# NOTE: missing values arrive as the literal string "\N" (see endYear in the
# displayed sample); they are kept as-is here, not converted to NaN.
df = pd.read_csv("title.basics.tsv", sep="\t", low_memory=False)

df.head(20)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,\N,1,"Documentary,Short"
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,\N,1,"Documentary,Short"


## Study
    - `Blacksmith Scene, originally titled 'Les forgerons', is a documentary.`

After checking the dataframe sample, I assume that for the proposed problem I should take the data from the columns `tconst`, `primaryTitle`, `originalTitle` and `genres`.

## Check how to know if the reload should be done

This section checks whether it is necessary to download the file again and re-insert the data into the database.
The first approach is to check the modification date of the file at the URL and, if it has changed, download and load it again. Logging configuration is also added here.

In [16]:
import requests
import os
import logging
from datetime import datetime
from dateutil.parser import parse as parsedate

# Emit INFO-and-above records, each prefixed with a timestamp and its level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Dataset archive names handled by the download helper in this cell.
IMDB_DATASETS = ["name.basics.tsv.gz"]

def download_if_changed(filename):
    """Download an IMDb dataset archive only if the remote copy is newer.

    The remote ``Last-Modified`` header is compared with the modification
    time of the local file of the same name in the working directory.

    Args:
        filename: Archive name under https://datasets.imdbws.com/; it is also
            used as the local path.

    Returns:
        True if the file was downloaded, False if the local copy is current.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status from the HEAD or GET request.
        KeyError: If the server response has no ``Last-Modified`` header.
    """
    url = f"https://datasets.imdbws.com/{filename}"
    local_path = filename

    # A HEAD request exposes Last-Modified without transferring the archive.
    resp = requests.head(url, timeout=30)
    resp.raise_for_status()
    remote_date = parsedate(resp.headers['Last-Modified'])

    if os.path.exists(local_path):
        # getmtime() is a naive local timestamp; astimezone() makes it
        # timezone-aware so it can be compared with the parsed header date.
        local_date = datetime.fromtimestamp(
            os.path.getmtime(local_path)
        ).astimezone()

        if remote_date <= local_date:
            logging.info("Sin cambios")
            return False

    logging.info("%s: Descargando...", filename)

    # Download into a temporary sibling file and swap it in only once it is
    # complete, so a failed or interrupted transfer never leaves a truncated
    # file that would look up to date on the next check.
    tmp_path = f"{local_path}.part"
    try:
        with requests.get(url, stream=True, timeout=30) as r:
            # Never store an HTTP error page as if it were the dataset.
            r.raise_for_status()
            with open(tmp_path, 'wb') as f:
                # Chunked writes avoid holding the whole archive in memory.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(tmp_path, local_path)
    finally:
        # Only present if the download failed before the replace above.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    # Stamp the file with the server's modification time so later checks
    # compare server time with server time, independent of local clock skew.
    remote_ts = remote_date.timestamp()
    os.utime(local_path, (remote_ts, remote_ts))

    logging.info("%s: Actualizado", filename)
    return True

# Check the first configured dataset; as the cell's last expression, the
# returned bool (True = downloaded, False = unchanged) is shown as output.
download_if_changed(IMDB_DATASETS[0])

2025-12-06 15:00:09,290 - INFO - Sin cambios


False