In [2]:
import pandas as pd
import numpy as np
import joblib 

DATA_DIR = 'C:\\Md Shahid\\Liabilities\\Machine Learning Projects\\Recommendation System\\data\\raw\\' 
DATA_DIR_PROCESSED = 'C:\\Md Shahid\\Liabilities\\Machine Learning Projects\\Recommendation System\\data\\processed\\' 
CREDITS_FILE = DATA_DIR + 'credits.csv'
KEYWORDS_FILE = DATA_DIR + 'keywords.csv'
LINKS_FILE = DATA_DIR + 'links.csv'
MOVIES_META_FILE = DATA_DIR + 'movies_metadata.csv'
RATINGS_FILE = DATA_DIR + 'ratings_small.csv'

credits_df = pd.read_csv(CREDITS_FILE)
keywords_df = pd.read_csv(KEYWORDS_FILE)    
links_df = pd.read_csv(LINKS_FILE)
movies_meta = pd.read_csv(MOVIES_META_FILE, low_memory=False)
ratings_df = pd.read_csv(RATINGS_FILE)

print("Data loaded successfully.")
print(f"Initial credits count: {len(credits_df)}")
print(f"Initial keywords count: {len(keywords_df)}")
print(f"Initial links count: {len(links_df)}")
print(f"Initial metadata count: {len(movies_meta)}")
print(f"Initial ratings count: {len(ratings_df)}")

Data loaded successfully.
Initial credits count: 45476
Initial keywords count: 46419
Initial links count: 45843
Initial metadata count: 45466
Initial ratings count: 100004


In [3]:
movies_meta_clean = movies_meta[movies_meta['id'].astype(str).str.isnumeric()].copy()
movies_meta_clean['tmdbId'] = movies_meta_clean['id'].astype(int)
movies_meta_clean.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,tmdbId
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,862


In [4]:
credits_df.rename(columns={'id': 'tmdbId'}, inplace=True)
credits_df['tmdbId'] = pd.to_numeric(credits_df['tmdbId'], errors='coerce').fillna(0).astype(int)
keywords_df.rename(columns={'id': 'tmdbId'}, inplace=True)
keywords_df['tmdbId'] = pd.to_numeric(keywords_df['tmdbId'], errors='coerce').fillna(0).astype(int)

In [5]:
ratings_with_tmdb = ratings_df.merge(
    links_df[['movieId', 'tmdbId']],
    on='movieId', 
    how='inner'
)
print(f"Merge 1 (Ratings + Links) complete. Records: {len(ratings_with_tmdb)}")

Merge 1 (Ratings + Links) complete. Records: 99958


In [6]:
ratings_with_meta = ratings_with_tmdb.merge(
    movies_meta_clean[['tmdbId', 'title', 'genres', 'overview', 'vote_average', 'vote_count', 'adult',
     'original_language', 'original_title', 'popularity', 'release_date']], 
    on='tmdbId', 
    how='inner'
)
print(f"Merge 2 ((Ratings + Links) + Metadata) complete. Records: {len(ratings_with_meta)}")

Merge 2 ((Ratings + Links) + Metadata) complete. Records: 99850


In [7]:
ratings_with_credits = ratings_with_meta.merge(
    credits_df[['tmdbId', 'cast', 'crew']], 
    on='tmdbId', 
    how='inner'
)
print(f"Merge 3 (((Ratings + Links) + Metadata) + Credits) complete. Records: {len(ratings_with_credits)}")

Merge 3 (((Ratings + Links) + Metadata) + Credits) complete. Records: 99930


In [8]:
final_merged_df = ratings_with_credits.merge(
    keywords_df[['tmdbId', 'keywords']], 
    on='tmdbId', 
    how='inner'
)
print(f"Merge 4 ((((Ratings + Links) + Metadata) + Credits) + Keywords) complete. Final size: {len(final_merged_df)}")

Merge 4 ((((Ratings + Links) + Metadata) + Credits) + Keywords) complete. Final size: 100122


In [9]:
final_merged_df.isnull().sum()

userId                0
movieId               0
rating                0
timestamp             0
tmdbId                0
title                 0
genres                0
overview             14
vote_average          0
vote_count            0
adult                 0
original_language     0
original_title        0
popularity            0
release_date          0
cast                  0
crew                  0
keywords              0
dtype: int64

In [10]:
print("\n--- 1. Creating User-Item Sparse Matrix ---")

cf_data = final_merged_df[['userId', 'title', 'rating']]

# 1. Pivot the data to create the User-Item interaction matrix.
# Index = Movie Titles (Items)
# Columns = User IDs (Users)
# Values = Ratings

movie_features_df = cf_data.pivot_table(
    index='title', 
    columns='userId', 
    values='rating'
)

# 2. Fill NaN values (unrated movies) with 0. 
# This completes the matrix for the K-NN model.
movie_features_df = movie_features_df.fillna(0) 

print(f"User-Item Matrix created. Shape: {movie_features_df.shape}")

# 3. Convert the dense DataFrame to a Compressed Sparse Row (CSR) matrix.
# This is crucial for memory efficiency and is the optimal format for sklearn's NearestNeighbors.
from scipy.sparse import csr_matrix
movie_features_matrix = csr_matrix(movie_features_df.values)

print(f"CSR Sparse Matrix successfully created.")


--- 1. Creating User-Item Sparse Matrix ---
User-Item Matrix created. Shape: (8754, 671)
CSR Sparse Matrix successfully created.


In [11]:
import joblib
import os

# Define the model directory (ensure it exists)
MODELS_DIR = 'C:\\Md Shahid\\Liabilities\\Machine Learning Projects\\Recommendation System\\models\\'
os.makedirs(MODELS_DIR, exist_ok=True)

print("\n--- 2. Saving Artifacts to models/ ---")

# A. Save the Sparse Matrix (The numerical input for K-NN)

SPARSE_MATRIX_PATH = os.path.join(MODELS_DIR, 'movie_features_matrix.joblib')
joblib.dump(movie_features_matrix, SPARSE_MATRIX_PATH)
print(f"Saved Sparse Matrix: {SPARSE_MATRIX_PATH}")


# B. Save the Index Mapping (List of Movie Titles)

INDEX_MAP_PATH = os.path.join(MODELS_DIR, 'movie_index_map.pkl')
joblib.dump(list(movie_features_df.index), INDEX_MAP_PATH) 
print(f"Saved Movie Index Map: {INDEX_MAP_PATH}")


# C. Create and Save the Final Movie Lookup DataFrame (For the API)
# This table should contain only movie-level data, including the parsed cast/crew/keywords

final_movie_lookup_df = final_merged_df[[
    'tmdbId', 'title', 'genres', 'overview', 'vote_count', 'adult',
    'original_language', 'original_title', 'release_date',
    'cast', 'crew', 'keywords', 'popularity' 
]].drop_duplicates(subset=['tmdbId']).reset_index(drop=True)


LOOKUP_PATH = os.path.join(MODELS_DIR, 'final_movie_lookup_df.pkl')
final_movie_lookup_df.to_pickle(LOOKUP_PATH)
print(f"Saved Final Movie Lookup Table: {LOOKUP_PATH}")

print("\nPreprocessing complete. Ready for Model Training.")


--- 2. Saving Artifacts to models/ ---
Saved Sparse Matrix: C:\Md Shahid\Liabilities\Machine Learning Projects\Recommendation System\models\movie_features_matrix.joblib
Saved Movie Index Map: C:\Md Shahid\Liabilities\Machine Learning Projects\Recommendation System\models\movie_index_map.pkl
Saved Final Movie Lookup Table: C:\Md Shahid\Liabilities\Machine Learning Projects\Recommendation System\models\final_movie_lookup_df.pkl

Preprocessing complete. Ready for Model Training.


In [12]:
final_movie_lookup_df = final_merged_df[[
    'tmdbId', 'title', 'genres', 'overview', 'vote_count', 'adult',
    'original_language', 'original_title', 'release_date',
    'cast', 'crew', 'keywords', 'popularity' 
]].drop_duplicates(subset=['tmdbId']).reset_index(drop=True)

final_movie_lookup_df.to_csv(DATA_DIR_PROCESSED + 'final_movie_lookup_df.csv', index=False)