# MovieLens Preprocessing: Build cleaned_movies.csv

This notebook loads the MovieLens small dataset, cleans movie titles and genres, computes per-movie average ratings and normalized scores, joins TMDb IDs for posters, and saves a compact `cleaned_movies.csv` for the Streamlit hybrid recommender.

Datasets used (from `ml-latest-small 2/`):
- `movies.csv` (movieId, title, genres)
- `ratings.csv` (userId, movieId, rating, timestamp)
- `links.csv` (movieId, imdbId, tmdbId)

Output:
- `cleaned_movies.csv`, written to the notebook's working directory (e.g. `/workspace/cleaned_movies.csv`), with columns: `movieId`, `tmdbId`, `title` (lowercase), `genres_text`, `avg_rating`, `norm_rating`, `rating_count`.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Folder holding the raw MovieLens CSVs (relative to the working directory).
DATA_DIR = Path('ml-latest-small 2')

# Input tables read by the loading cell.
MOVIES_CSV = DATA_DIR.joinpath('movies.csv')
RATINGS_CSV = DATA_DIR.joinpath('ratings.csv')
LINKS_CSV = DATA_DIR.joinpath('links.csv')

# Destination of the cleaned table (relative path, so it lands in the working directory).
OUTPUT_CSV = Path('cleaned_movies.csv')


In [None]:
# Read the three MovieLens tables in a fixed order: movies, ratings, links.
movies, ratings, links = [
    pd.read_csv(csv_path) for csv_path in (MOVIES_CSV, RATINGS_CSV, LINKS_CSV)
]
# Preview the first rows of each table (displayed as a tuple of frames).
(movies.head(), ratings.head(), links.head())


In [None]:
# --- Titles: treat missing as empty, trim surrounding whitespace, lowercase ---
title_col = movies['title'].fillna('')
title_col = title_col.str.strip()
movies['title'] = title_col.str.lower()

# --- Genres: missing values get the placeholder string ---
movies['genres'] = movies['genres'].fillna('(no genres listed)')

# Break the pipe-delimited string apart, then tidy each piece
# (trim, lowercase, hyphen -> space). Any cell that is not a list becomes [].
split_genres = movies['genres'].str.split('|')
movies['genres_list'] = split_genres.map(
    lambda parts: (
        [piece.strip().lower().replace('-', ' ') for piece in parts]
        if isinstance(parts, list)
        else []
    )
)

# De-duplicate and alphabetise each movie's genres, then join them with single spaces.
movies['genres_text'] = movies['genres_list'].map(
    lambda parts: ' '.join(sorted(set(parts)))
)
movies[['movieId', 'title', 'genres_text']].head()


In [None]:
# Per-movie rating statistics: mean rating and number of ratings.
# Rows lacking a movieId or a rating are dropped before aggregating.
ratings = ratings.dropna(subset=['movieId', 'rating'])
agg = (
    ratings
    .groupby('movieId')
    .agg(avg_rating=('rating', 'mean'), rating_count=('rating', 'size'))
    .reset_index()
)

# Min-max scale avg_rating into [0, 1].
if agg.empty:
    # Nothing to scale: add an empty float column so the column still exists.
    agg['norm_rating'] = pd.Series(dtype=float)
else:
    avg = agg['avg_rating']
    lowest, highest = avg.min(), avg.max()
    if highest > lowest:
        agg['norm_rating'] = (avg - lowest) / (highest - lowest)
    else:
        # Every movie has the same average, so the scale is degenerate; use the midpoint.
        agg['norm_rating'] = 0.5
agg.head()


In [None]:
# Attach the TMDb id and the rating statistics to every movie.
# Left joins keep movies that have no link row or no ratings.
merged = (
    movies
    .merge(links[['movieId', 'tmdbId']], on='movieId', how='left')
    .merge(agg, on='movieId', how='left')
)

# Movies without ratings get zeros for all three statistics;
# rating_count is cast to int after the fill.
df = merged.assign(
    avg_rating=lambda frame: frame['avg_rating'].fillna(0.0),
    norm_rating=lambda frame: frame['norm_rating'].fillna(0.0),
    rating_count=lambda frame: frame['rating_count'].fillna(0).astype(int),
)
# Coerce a raw tmdbId to a positive int; anything unusable becomes NaN.
def to_int_or_none(x):
    """Return ``x`` as a positive ``int``, or ``np.nan`` when it is not a usable id.

    Despite the name, the "missing" marker is ``np.nan`` (not ``None``).

    Parameters
    ----------
    x : object
        Raw value, e.g. a float, an int, a numeric string, ``None`` or NaN.

    Returns
    -------
    int or float
        The positive integer value of ``x``; ``np.nan`` if ``x`` cannot be
        converted, is a float with a fractional part, or is zero/negative.
    """
    try:
        xi = int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects; ValueError: NaN or unparseable
        # text; OverflowError: +/-inf. Other exceptions are real bugs and propagate.
        return np.nan
    if isinstance(x, (float, np.floating)) and xi != x:
        # A fractional float (e.g. 12.9) is not a valid id; truncating it to 12
        # would silently point at a different movie.
        return np.nan
    return xi if xi > 0 else np.nan
# Run every tmdbId through to_int_or_none. Ints mixed with NaN are stored by
# pandas as float64, so without the cast the CSV would contain values like "862.0".
# The nullable Int64 dtype keeps real integers and writes missing ids as empty fields.
df['tmdbId'] = df['tmdbId'].apply(to_int_or_none).astype('Int64')
# Select columns and save
out_cols = ['movieId','tmdbId','title','genres_text','avg_rating','norm_rating','rating_count']
cleaned = df[out_cols].copy()
# index=False: the row index carries no information and should not become a CSV column.
cleaned.to_csv(OUTPUT_CSV, index=False)
cleaned.head(10)


Now you can run the Streamlit app, which loads `cleaned_movies.csv`.