
# IMDb Quick Peek Notebook
Mục tiêu: **tải các bộ dữ liệu IMDb chính thức** (TSV.GZ), giải nén và **xem nhanh** vài dòng dữ liệu để làm quen với schema; sau đó bổ sung thông tin từ TMDb cho 100 phim có nhiều lượt bình chọn nhất.

> Nguồn chính thức: `https://datasets.imdbws.com/` (cập nhật hằng ngày, dùng cho mục đích phi thương mại).  


In [3]:
import os, gzip, shutil, requests
from pathlib import Path
from datetime import datetime
import pandas as pd
from tqdm import tqdm

# Root URL that every dataset file name in FILES is appended to.
IMDB_BASE = "https://datasets.imdbws.com"

# Downloaded archives live in ./data under the current working directory;
# the directory is created here if it does not exist yet.
DATA_DIR = Path.cwd().joinpath("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Names of the gzip-compressed TSV archives to fetch.
FILES = [
    "title.basics.tsv.gz",
    "title.ratings.tsv.gz",
    "title.crew.tsv.gz",
    "title.principals.tsv.gz",
    "name.basics.tsv.gz",
]

print('Data dir =', DATA_DIR)

Data dir = d:\Data Science\IMDb\data


In [4]:
def download_file(url: str, out_path: Path, chunk_size=1<<20):
    """Stream ``url`` to ``out_path`` while showing a tqdm progress bar.

    The body is first written to a sibling ``<name>.part`` file and renamed
    onto ``out_path`` only once the whole response has been received. An
    interrupted or failed download therefore never leaves a truncated file
    under the final name, where a later "already downloaded" check would
    mistake it for a complete archive.

    Args:
        url: Address of the file to download.
        out_path: Destination path; missing parent directories are created.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        ``out_path``, once the file is completely written.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    part_path = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            # Without a Content-Length header the size is unknown; pass None
            # so tqdm shows plain counters instead of a bar with a 0-byte total.
            total = int(r.headers.get("content-length", 0)) or None
            with open(part_path, "wb") as f, tqdm(total=total, unit='B', unit_scale=True, desc=out_path.name) as bar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        bar.update(len(chunk))
        # Publish the file under its final name only after a full download.
        part_path.replace(out_path)
    except BaseException:
        # Also covers KeyboardInterrupt (e.g. interrupting the notebook cell):
        # drop the partial file, then let the original exception propagate.
        if part_path.exists():
            part_path.unlink()
        raise
    return out_path

def gunzip_file(gz_path: Path, out_path: Path):
    """Decompress the gzip archive ``gz_path`` into ``out_path``.

    Data is extracted into a sibling ``<name>.part`` file that is renamed
    onto ``out_path`` only after the whole archive has been decompressed.
    A corrupt archive or an interrupted run therefore leaves no file under
    the final name, so a later "exists and is non-empty" check cannot mistake
    a half-written file for a finished extraction.

    Args:
        gz_path: Path of the ``.gz`` archive to read.
        out_path: Path of the decompressed file to create or overwrite.

    Returns:
        ``out_path`` as passed in.

    Raises:
        OSError: If the archive cannot be read or is not valid gzip data.
    """
    target = Path(out_path)
    part_path = target.with_name(target.name + ".part")
    try:
        with gzip.open(gz_path, "rb") as fin, open(part_path, "wb") as fout:
            shutil.copyfileobj(fin, fout)
        # Publish the result only once decompression finished without error.
        part_path.replace(target)
    except BaseException:
        # Remove the partial output, then re-raise the original exception.
        if part_path.exists():
            part_path.unlink()
        raise
    return out_path

In [5]:
# Download every archive listed in FILES that is not already on disk.
for fname in FILES:
    url = f"{IMDB_BASE}/{fname}"
    gz_path = DATA_DIR / fname
    # Skip only non-empty files: a zero-byte archive is a leftover of a failed
    # download and must be fetched again. This mirrors the exists-and-non-empty
    # test the extraction step applies to the .tsv files.
    if gz_path.exists() and gz_path.stat().st_size > 0:
        print(f"[SKIP] Đã có {gz_path.name}")
    else:
        print(f"[GET] {url}")
        download_file(url, gz_path)
print("Done downloading.")

[SKIP] Đã có title.basics.tsv.gz
[SKIP] Đã có title.ratings.tsv.gz
[SKIP] Đã có title.crew.tsv.gz
[SKIP] Đã có title.principals.tsv.gz
[SKIP] Đã có name.basics.tsv.gz
Done downloading.


In [6]:
# Decompress each .gz archive into a .tsv file next to it, reusing any
# non-empty .tsv that is already there.
tsv_paths = []
for fname in FILES:
    gz_path = DATA_DIR / fname
    tsv_path = gz_path.with_suffix("")  # strips only the trailing ".gz"
    needs_extract = not (tsv_path.exists() and tsv_path.stat().st_size > 0)
    if needs_extract:
        print(f"[EXTRACT] {gz_path.name} -> {tsv_path.name}")
        gunzip_file(gz_path, tsv_path)
    else:
        print(f"[SKIP] Đã có {tsv_path.name}")
    tsv_paths.append(tsv_path)
tsv_paths

[SKIP] Đã có title.basics.tsv
[SKIP] Đã có title.ratings.tsv
[SKIP] Đã có title.crew.tsv
[SKIP] Đã có title.principals.tsv
[SKIP] Đã có name.basics.tsv


[WindowsPath('d:/Data Science/IMDb/data/title.basics.tsv'),
 WindowsPath('d:/Data Science/IMDb/data/title.ratings.tsv'),
 WindowsPath('d:/Data Science/IMDb/data/title.crew.tsv'),
 WindowsPath('d:/Data Science/IMDb/data/title.principals.tsv'),
 WindowsPath('d:/Data Science/IMDb/data/name.basics.tsv')]

In [7]:
# Extracted TSV files to inspect, all located directly in DATA_DIR.
TSV_FILES = [
    DATA_DIR / name
    for name in (
        "title.basics.tsv",
        "title.ratings.tsv",
        "title.crew.tsv",
        "title.principals.tsv",
        "name.basics.tsv",
    )
]

def file_size_mb(p: Path) -> float:
    """Return the size of the file at ``p`` in units of 1024 * 1024 bytes."""
    bytes_per_mb = 1024 * 1024
    return os.stat(p).st_size / bytes_per_mb

def count_rows_fast(p: Path) -> int:
    """Count the data rows of a text file, i.e. its lines minus one header line.

    The file is read in binary mode and only its lines are counted; nothing
    is parsed. A final line without a trailing newline still counts.

    Args:
        p: Path of the file to scan.

    Returns:
        Number of lines after the header. An empty file yields 0 rather than
        -1, since there is no header to subtract.
    """
    with open(p, "rb") as f:
        n_lines = sum(1 for _ in f)
    return max(n_lines - 1, 0)

def tsv_schema(p: Path):
    """Return ``(column_names, column_count)`` for a tab-separated file.

    Only the header is parsed (``nrows=0``), so no data rows are loaded.
    """
    header_only = pd.read_csv(p, sep="\t", nrows=0)
    names = header_only.columns.tolist()
    return names, len(names)

def tsv_head(p: Path, n=5):
    """Read the first ``n`` data rows of a tab-separated file as a DataFrame.

    The literal ``\\N`` marker is parsed as a missing value. Quote handling is
    switched off: a field that merely starts with a double-quote character
    (titles can contain them) is otherwise read as the opening of a quoted
    field and can swallow the following columns and rows.

    Args:
        p: Path of the TSV file.
        n: Maximum number of data rows to read.

    Returns:
        A DataFrame holding at most ``n`` rows.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    return pd.read_csv(
        p,
        sep="\t",
        nrows=n,
        na_values="\\N",
        quoting=csv.QUOTE_NONE,
    )

# Collect one stats record per file (size, row and column counts, column
# names) and keep a five-row preview of each file, keyed by file name.
summary = []
samples = {}

for path in TSV_FILES:
    cols, n_cols = tsv_schema(path)
    n_rows = count_rows_fast(path)
    size_mb = file_size_mb(path)
    samples[path.name] = tsv_head(path, n=5)

    summary.append(
        dict(
            file=path.name,
            size_mb=round(size_mb, 2),
            n_rows=n_rows,
            n_cols=n_cols,
            columns=", ".join(cols),
        )
    )

# Order the overview alphabetically by file name for display.
summary_df = pd.DataFrame(summary)
summary_df = summary_df.sort_values("file")
summary_df

Unnamed: 0,file,size_mb,n_rows,n_cols,columns
4,name.basics.tsv,866.51,14783561,6,"nconst, primaryName, birthYear, deathYear, pri..."
0,title.basics.tsv,988.58,11981307,9,"tconst, titleType, primaryTitle, originalTitle..."
2,title.crew.tsv,377.02,11981307,3,"tconst, directors, writers"
3,title.principals.tsv,4053.76,95337828,6,"tconst, ordering, nconst, category, job, chara..."
1,title.ratings.tsv,27.03,1627391,3,"tconst, averageRating, numVotes"


In [8]:
# Show the stored preview of every file under a header naming the file.
for fname, df in samples.items():
    header = f"\n=== {fname} : sample ==="
    print(header)
    display(df)


=== title.basics.tsv : sample ===


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,Short



=== title.ratings.tsv : sample ===


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2182
1,tt0000002,5.5,302
2,tt0000003,6.4,2256
3,tt0000004,5.2,194
4,tt0000005,6.2,2994



=== title.crew.tsv : sample ===


Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,nm0721526
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,



=== title.principals.tsv : sample ===


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0005690,producer,producer,
3,tt0000001,4,nm0374658,cinematographer,director of photography,
4,tt0000002,1,nm0721526,director,,



=== name.basics.tsv : sample ===


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987.0,"actor,miscellaneous,producer","tt0050419,tt0072308,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924,2014.0,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982.0,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007.0,"writer,director,actor","tt0050986,tt0069467,tt0083922,tt0050976"


In [9]:
from dotenv import load_dotenv
# Load settings from a .env file; the TMDB_API_KEY check further down expects
# the key to have been provided that way.
load_dotenv()

True

In [10]:
# TMDb API key, read from the TMDB_API_KEY environment variable.
TMDB_KEY = os.getenv("TMDB_API_KEY")
# Validate with an explicit raise instead of `assert`: assertions are removed
# when Python runs with -O, which would let a missing key slip through to the
# first API request.
if not TMDB_KEY:
    raise RuntimeError("Bạn cần TMDB_API_KEY trong .env")

In [11]:
import csv

# Load the complete basics and ratings tables (no row limit is applied), so
# the filter below sees every title.
basics_path = DATA_DIR / "title.basics.tsv"
ratings_path = DATA_DIR / "title.ratings.tsv"

# QUOTE_NONE: treat double quotes as ordinary characters. With pandas' default
# quoting, a title that starts with a double quote is read as the opening of a
# quoted field and can swallow the following columns and rows.
dfb = pd.read_csv(
    basics_path,
    sep="\t",
    na_values="\\N",
    dtype=str,
    quoting=csv.QUOTE_NONE,
)
dfr = pd.read_csv(
    ratings_path,
    sep="\t",
    na_values="\\N",
    dtype={"tconst":str,"averageRating":float,"numVotes":int},
    quoting=csv.QUOTE_NONE,
)

# Everything was read as text; make startYear numeric (unparseable -> NaN)
# so it can be compared against a year.
dfb["startYear"] = pd.to_numeric(dfb["startYear"], errors="coerce")
mask = (dfb["titleType"]=="movie") & (dfb["startYear"]>=2000)
# Left join keeps movies without a rating row; their rating columns are NaN.
movies = dfb.loc[mask, ["tconst","primaryTitle","startYear"]].merge(dfr, on="tconst", how="left")

# Seed set for enrichment: the 100 rated movies with the most votes.
seed = movies.dropna(subset=["numVotes"]).sort_values("numVotes", ascending=False).head(100).reset_index(drop=True)
seed[["tconst","primaryTitle","startYear","averageRating","numVotes"]].head()

Unnamed: 0,tconst,primaryTitle,startYear,averageRating,numVotes
0,tt0468569,The Dark Knight,2008.0,9.1,3085284.0
1,tt1375666,Inception,2010.0,8.8,2740322.0
2,tt0816692,Interstellar,2014.0,8.7,2418261.0
3,tt0120737,The Lord of the Rings: The Fellowship of the Ring,2001.0,8.9,2147712.0
4,tt0167260,The Lord of the Rings: The Return of the King,2003.0,9.0,2113076.0


In [12]:
import math, time

def tmdb_get(path, params=None):
    """GET a TMDb v3 API endpoint and return the decoded JSON body.

    The API key is sent as the ``api_key`` query parameter together with
    ``params``. A 429 (rate limited) response is retried after waiting for the
    number of seconds given in the ``Retry-After`` header; at most three
    requests are made in total.

    Args:
        path: Endpoint path appended to the API base URL, e.g. ``"/movie/155"``.
        params: Optional extra query parameters. Defaults to none.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: For any error status other than 429
            (raised by ``raise_for_status``).
        RuntimeError: If all three attempts were answered with 429.
    """
    base = "https://api.themoviedb.org/3"
    # `params or {}` avoids a shared mutable default argument.
    p = {"api_key": TMDB_KEY, **(params or {})}
    for _ in range(3):
        r = requests.get(f"{base}{path}", params=p, timeout=20)
        if r.status_code == 429:
            # Rate limited: wait as instructed. Fall back to 2 seconds when the
            # header is missing or is not a plain integer.
            try:
                wait = max(int(r.headers.get("Retry-After", "2")), 0)
            except ValueError:
                wait = 2
            time.sleep(wait)
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError(f"TMDb error for {path}")

def tmdb_find_by_imdb(ttid):
    """Look up an IMDb id via ``/find`` and return the first matching TMDb id.

    Movie results are preferred; TV results are used only when there are no
    movie results. Returns ``None`` when neither list has an entry.

    NOTE(review): the returned id may therefore belong to a TV entry, while the
    enrichment loop in this file passes it to a ``/movie/`` endpoint - confirm
    that this fallback is intended.
    """
    payload = tmdb_get(f"/find/{ttid}", params={"external_source":"imdb_id"})
    matches = payload.get("movie_results", [])
    if not matches:
        matches = payload.get("tv_results", [])
    if not matches:
        return None
    return matches[0]["id"]

def tmdb_movie_full(tmdb_id):
    """Fetch ``/movie/{tmdb_id}`` with images, videos and keywords appended."""
    extras = {"append_to_response":"images,videos,keywords"}
    return tmdb_get(f"/movie/{tmdb_id}", params=extras)

# Enrich every seed title with TMDb data. Titles without a TMDb match are
# recorded with only their IMDb id and an empty tmdb_id.
enriched = []
for ttid in seed["tconst"]:
    tmdb_id = tmdb_find_by_imdb(ttid)
    if not tmdb_id:
        enriched.append({"tconst": ttid, "tmdb_id": None})
        continue
    info = tmdb_movie_full(tmdb_id)
    # Keyword names sit in a nested {"keywords": [...]} object; tolerate the
    # outer entry being absent or null.
    keyword_block = info.get("keywords", {}) or {}
    kws = [k["name"] for k in keyword_block.get("keywords", [])]
    # Keep only the commonly used fields.
    enriched.append({
        "tconst": ttid,
        "tmdb_id": tmdb_id,
        "poster_path": info.get("poster_path"),
        "budget_tmdb": info.get("budget"),
        "revenue_tmdb": info.get("revenue"),
        "keywords": ";".join(kws) if kws else None,
    })
    time.sleep(0.15)  # short pause between titles to go easy on the API quota

df_tmdb = pd.DataFrame(enriched)
df_tmdb.head()

Unnamed: 0,tconst,tmdb_id,poster_path,budget_tmdb,revenue_tmdb,keywords
0,tt0468569,155,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,185000000,1004558444,sadism;chaos;secret identity;crime fighter;sup...
1,tt1375666,27205,/ljsZTbVsrQSqZgWeep2B1QiDKuh.jpg,160000000,839030630,"rescue;mission;dreams;airplane;paris, france;v..."
2,tt0816692,157336,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,165000000,746606706,rescue;future;spacecraft;race against time;art...
3,tt0120737,120,/6oom5QYQ2yQTMJIbnvbkBL9cHo6.jpg,93000000,871368364,based on novel or book;elves;dwarf;magic;obses...
4,tt0167260,122,/rCzpDGLbOoPwLjy3OAm5NUPOTrC.jpg,94000000,1118888979,army;based on novel or book;elves;dwarf;magic;...


In [13]:
# Display the enrichment table as the cell output.
df_tmdb

Unnamed: 0,tconst,tmdb_id,poster_path,budget_tmdb,revenue_tmdb,keywords
0,tt0468569,155,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,185000000,1004558444,sadism;chaos;secret identity;crime fighter;sup...
1,tt1375666,27205,/ljsZTbVsrQSqZgWeep2B1QiDKuh.jpg,160000000,839030630,"rescue;mission;dreams;airplane;paris, france;v..."
2,tt0816692,157336,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,165000000,746606706,rescue;future;spacecraft;race against time;art...
3,tt0120737,120,/6oom5QYQ2yQTMJIbnvbkBL9cHo6.jpg,93000000,871368364,based on novel or book;elves;dwarf;magic;obses...
4,tt0167260,122,/rCzpDGLbOoPwLjy3OAm5NUPOTrC.jpg,94000000,1118888979,army;based on novel or book;elves;dwarf;magic;...
...,...,...,...,...,...,...
95,tt0121765,1894,/oZNPzxqM2s5DyVWab09NTQScDQt.jpg,120000000,649398328,army;laser gun;senate;investigation;cult figur...
96,tt2911666,245891,/fZPSd91yGE9fCcCe6OoQr6E3Bev.jpg,20000000,88761661,hitman;bratva (russian mafia);gangster;secret ...
97,tt0144084,1359,/9uGHEgsiUXjCNq8wdq4r49YL8A1.jpg,7000000,34300000,new york city;based on novel or book;businessm...
98,tt1285016,37799,/n0ybibhJtQ5icDqTp8eRytcIHJx.jpg,40000000,224920315,"hacker;based on novel or book;boston, massachu..."


In [14]:
# Write the enrichment table to DATA_DIR as CSV, without the index column,
# and show the path that was written.
out = DATA_DIR.joinpath("enrich_tmdb_top100.csv")
df_tmdb.to_csv(path_or_buf=out, index=False)
out

WindowsPath('d:/Data Science/IMDb/data/enrich_tmdb_top100.csv')