# Laying out the Key Metric
## In the context of a movie studio investing in a major title, a hit movie typically means:

### High audience approval (quality)

### Strong audience engagement (reach)

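### Significant commercial impact (ROI or buzz)

Note: the weighted score computed below is built only from `averageRating` (quality) and `numVotes` (reach); commercial impact is not measured directly.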

In [2]:
import pandas as pd
import numpy as np

# Load ratings first (small size).
# `tconst` is read as a plain string rather than a categorical: it is the join
# key (an identifier column, presumably one row per title - confirm), so a
# categorical would store every label anyway plus an integer code per row.
# The key in `basics` is built from a different, filtered set of values, so
# pandas would fall back to object dtype for the merge key regardless.
ratings = pd.read_csv(
    "title.ratings.tsv.gz",
    sep="\t",
    compression="gzip",
    dtype={"tconst": str, "averageRating": "float32", "numVotes": "int32"},
)

# Accumulates the filtered rows of each chunk
filtered_basics = []

# Stream the basics file in chunks so the full table is never held in memory;
# only the rows that survive the filter below are kept.
# NOTE(review): with the parser defaults, `"` is treated as a quote character
# and strings such as "NA" are read as missing. If titles in this file can
# contain those, consider quoting=csv.QUOTE_NONE and keep_default_na=False -
# confirm against the data before changing.
chunksize = 100_000
for chunk in pd.read_csv(
    "title.basics.tsv.gz",
    sep="\t",
    compression="gzip",
    usecols=["tconst", "titleType", "primaryTitle", "startYear", "genres"],
    dtype=str,
    na_values="\\N",
    chunksize=chunksize,
):
    # Keep movies only. `.copy()` gives an independent frame, so the column
    # assignment below does not write into a slice of the reader's chunk
    # (which triggers SettingWithCopyWarning).
    chunk = chunk[chunk["titleType"] == "movie"].copy()

    # Every column is read as str; years that are not numeric become NaN.
    chunk["startYear"] = pd.to_numeric(chunk["startYear"], errors="coerce")

    # Filter for movies from 2019 to 2024 (inclusive). NaN years fail
    # `between`, so they are removed here as well.
    chunk = chunk[chunk["startYear"].between(2019, 2024)]
    chunk = chunk.dropna(subset=["genres", "startYear"])
    filtered_basics.append(chunk)

# Combine all chunks into one DataFrame
basics = pd.concat(filtered_basics)

# Downcast the year to save memory: rows were filtered to 2019-2024 and NaN
# years dropped above, so int16 holds every remaining value.
# `tconst` is deliberately left as a string. It is an identifier used as the
# join key, so a categorical would keep every label plus a code per row and
# would not reduce memory.
basics["startYear"] = basics["startYear"].astype("int16")

# Merge with ratings (inner join: titles without a ratings row are dropped).
# `validate` makes pandas raise MergeError if `tconst` is duplicated on either
# side, instead of silently multiplying rows.
merged = pd.merge(basics, ratings, on="tconst", how="inner", validate="one_to_one")

# Compute Bayesian average: shrink each title's rating toward the overall mean,
#   bayesianRating = v / (v + m) * R + m / (v + m) * C
# with R = averageRating and v = numVotes.
C = merged["averageRating"].mean()  # prior mean: average rating over the merged titles
m = 1000  # prior weight in votes: at v == m the result is halfway between R and C

merged["bayesianRating"] = (
    (merged["numVotes"] / (merged["numVotes"] + m)) * merged["averageRating"]
    + (m / (merged["numVotes"] + m)) * C
)

# WSS: Weighted Score = (bayesianRating / 10) * log10(numVotes)
# Zero vote counts are mapped to NaN first so log10 never sees 0 (-inf);
# those rows are then dropped.
merged["WSS"] = (merged["bayesianRating"] / 10) * np.log10(
    merged["numVotes"].replace(0, np.nan)
)
merged = merged.dropna(subset=["WSS"])

# Top 10 hit movies by WSS
top_hits = (
    merged[["primaryTitle", "startYear", "averageRating", "numVotes", "bayesianRating", "WSS"]]
    .sort_values(by="WSS", ascending=False)
    .head(10)
)

top_hits


Unnamed: 0,primaryTitle,startYear,averageRating,numVotes,bayesianRating,WSS
51608,Avengers: Endgame,2019,8.4,1376326,8.39846,5.155581
54615,Joker,2019,8.3,1627103,8.298759,5.154704
53835,Parasite,2019,8.5,1083272,8.497952,5.128291
22682,Oppenheimer,2023,8.3,916593,8.297799,4.947294
21946,Dune: Part Two,2024,8.5,651601,8.496598,4.939907
4257,Spider-Man: No Way Home,2021,8.2,966005,8.198014,4.906495
4040,The Kashmir Files,2022,8.5,576610,8.496156,4.894536
26563,Top Gun: Maverick,2022,8.2,793121,8.197582,4.836032
59701,Spider-Man: Across the Spider-Verse,2023,8.5,469884,8.495285,4.818518
57335,1917,2019,8.2,736809,8.197397,4.809704


In [3]:
import plotly.express as px

# `top_hits` comes from the previous cell.
# Plot the release year as a string so Plotly Express treats it as a discrete
# category (one legend entry per year). A numeric colour column produces a
# continuous colour bar instead, which `legend_title` does not label.
plot_data = top_hits.assign(startYear=top_hits["startYear"].astype(str))

fig = px.bar(
    plot_data,
    x="primaryTitle",
    y="WSS",
    color="startYear",
    hover_data=["averageRating", "numVotes", "bayesianRating"],
    title="Top Hit Movies (2019–2024) by Weighted Score (Bayesian)",
    labels={
        "primaryTitle": "Movie Title",
        "WSS": "Weighted Score",
        "startYear": "Release Year",
    },
    # Discrete colours draw one trace per year, which would otherwise regroup
    # the bars by year. Pin the x axis to the ranking order of `top_hits` and
    # list the legend chronologically.
    category_orders={
        "primaryTitle": plot_data["primaryTitle"].drop_duplicates().tolist(),
        "startYear": sorted(plot_data["startYear"].unique()),
    },
)

fig.update_layout(
    xaxis_tickangle=45,
    xaxis_title="Movie Title",
    yaxis_title="Weighted Score (WSS)",
    legend_title="Release Year",
    height=600,
)

fig.show()
