In [1]:
import pandas as pd
from pathlib import Path

# Input: one row per individual rating. Output: one row per movie with count and mean.
ratings_csv = Path("../data/raw/tmdb_6000/tmdb_6000_movie_ratings.csv")
out = Path("../data/processed/ratings_summary.csv")

chunksize = 200_000  # rows read per chunk, so the whole file is never held in memory at once

# tmdbId -> (rating_count, rating_mean), merged chunk by chunk.
# An id whose ratings have all been missing so far is stored as (0, NaN).
agg = {}

for chunk in pd.read_csv(ratings_csv, chunksize=chunksize):

    # If the key column is called 'movieId' (and no 'tmdbId' exists), treat it as 'tmdbId'.
    if 'movieId' in chunk.columns and 'tmdbId' not in chunk.columns:
        chunk = chunk.rename(columns={'movieId': 'tmdbId'})

    # 'count' and 'mean' both ignore missing ratings, so a group made up only of
    # missing values comes back as count == 0 and mean == NaN.
    grp = chunk.groupby("tmdbId")["rating"].agg(['count', 'mean'])

    # Walk the columns directly; iterrows() would build a Series for every group.
    for mid, cnt, mean in zip(grp.index, grp['count'], grp['mean']):
        cnt = int(cnt)
        mean = float(mean)

        prev_cnt, prev_mean = agg.get(mid, (0, float('nan')))

        if cnt == 0:
            # Nothing usable in this chunk: keep what we had (registering the id
            # if it is new) instead of folding a NaN mean into the running value.
            agg[mid] = (prev_cnt, prev_mean)
        elif prev_cnt == 0:
            # First chunk with real ratings for this id; there is no previous
            # mean to weight, and dividing through a zero count must be avoided.
            agg[mid] = (cnt, mean)
        else:
            # Count-weighted merge of the stored mean with this chunk's mean.
            new_cnt = prev_cnt + cnt
            new_mean = (prev_mean * prev_cnt + mean * cnt) / new_cnt
            agg[mid] = (new_cnt, new_mean)

# One row per id, in the order the ids were first encountered.
agg_df = pd.DataFrame(
    [(mid, cnt, mean) for mid, (cnt, mean) in agg.items()],
    columns=['tmdbId', 'rating_count', 'rating_mean']
)

# to_csv does not create missing directories, so make sure the target exists.
out.parent.mkdir(parents=True, exist_ok=True)
agg_df.to_csv(out, index=False)
print("Saved:", out)
agg_df.head()


Saved: ..\data\processed\ratings_summary.csv


Unnamed: 0,tmdbId,rating_count,rating_mean
0,58,22235,3.470362
1,285,15945,3.395484
2,559,11784,2.880049
3,767,21849,3.826949
4,1452,6252,2.883877
