In [2]:
# %% [markdown]
# ### 1. Import Required Libraries

# %%
import pandas as pd
import numpy as np
import ast
import warnings

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

In [None]:
# %% [markdown]
# ### 2. Load Datasets

# %%
# Add ISO-8859-1 encoding for TMDB files
movies_tmdb = pd.read_csv(
    'tmdb_5000_movies.csv',
    low_memory=False
)
credits_tmdb = pd.read_csv(
    'tmdb_5000_credits.csv'
)


ratings = pd.read_csv('ratings_small.csv')
links = pd.read_csv('links.csv') # %% [markdown]


FileNotFoundError: [Errno 2] No such file or directory: 'ratings_small.csv'

In [1]:
# %% [markdown]
# ### 3. Merge & Clean TMDB Sample

# %%
# Rename ID column in movies dataset
movies_tmdb = movies_tmdb.rename(columns={'id': 'movie_id'})

# Merge with credits using proper suffixes
movies = movies_tmdb.merge(
    credits_tmdb.rename(columns={'movie_id': 'tmdb_id'}),  # Rename conflicting ID column
    left_on='movie_id',
    right_on='tmdb_id',
    suffixes=('_movies', '_credits')
)

# Select and rename columns properly
movies = movies[[
    'movie_id',
    'title_movies',    # From movies dataset
    'overview',
    'genres',
    'keywords',
    'cast',            # From credits
    'crew'             # From credits
]].rename(columns={
    'title_movies': 'title'
})

# Clean remaining data
movies = movies[
    movies['overview'].notna() & 
    movies['movie_id'].notna()
].copy()
movies['movie_id'] = movies['movie_id'].astype(int)
movies = movies.drop_duplicates('title').reset_index(drop=True)

NameError: name 'movies_tmdb' is not defined

In [36]:
# %% [markdown]
# ### 4. Parse JSON & Build Tags

# %%
def parse_list(col):
    return [item['name'] for item in ast.literal_eval(col)]

movies['genres'] = movies['genres'].apply(parse_list)
movies['keywords'] = movies['keywords'].apply(parse_list)
movies['cast'] = movies['cast'].apply(parse_list).apply(lambda x: x[:3])

def get_director(raw):
    return [m['name'] for m in ast.literal_eval(raw) if m.get('job')=='Director']
movies['crew'] = movies['crew'].apply(get_director)

movies['overview'] = movies['overview'].str.split()

for c in ['genres','keywords','cast','crew']:
    movies[c] = movies[c].apply(lambda lst: [s.replace(" ","") for s in lst])

movies['tags'] = (
    movies['overview'] +
    movies['genres']  +
    movies['keywords'] +
    movies['cast']    +
    movies['crew']
).apply(lambda x: " ".join(x))

In [37]:
# %% [markdown]
# ### 5. Map MovieLens → TMDB IDs

# %%
links = links.dropna(subset=['tmdbId'])
links['tmdbId'] = links['tmdbId'].astype(int)

ratings['tmdbId'] = ratings['movieId'].map(links.set_index('movieId')['tmdbId'])
ratings = ratings.dropna(subset=['tmdbId'])
ratings['tmdbId'] = ratings['tmdbId'].astype(int)

valid_ids = set(movies['movie_id'])
ratings = ratings[ratings['tmdbId'].isin(valid_ids)]

In [38]:
# %% [markdown]
# ### 6. Content-Based Similarity

# %%
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies['tags']).toarray()
content_similarity = cosine_similarity(vectors)

In [39]:
# %% [markdown]
# ### 7. Collaborative Filtering via NMF

# %%
ratings_pivot = ratings.pivot_table(
    index='userId',
    columns='tmdbId',
    values='rating'
).fillna(0)

scaler = MinMaxScaler()
R_scaled = scaler.fit_transform(ratings_pivot.values)

nmf_model = NMF(n_components=50, random_state=42, max_iter=200)
U = nmf_model.fit_transform(R_scaled)
V = nmf_model.components_.T

R_hat = np.dot(U, V.T)
predictions = scaler.inverse_transform(R_hat)

pred_df = pd.DataFrame(
    data=predictions,
    index=ratings_pivot.index,
    columns=ratings_pivot.columns
)

In [45]:
# %% [markdown]
# ### 8. Hybrid Recommendation Function

def hybrid_recommend(user_id, movie_title, top_n=5, α=0.5):
    try:
        # Movie index lookup
        idx = movies[movies['title'] == movie_title].index[0]
        
        # Get scores
        c_scores = list(enumerate(content_similarity[idx]))
        tmdb_ids = movies['movie_id'].tolist()
        
        # Collaborative predictions
        cf_scores = [pred_df.get(tmdb_id, pd.Series()).get(user_id, 0) 
                    for tmdb_id in tmdb_ids]
        
        # Combine scores
        combined = [(i, α*c + (1-α)*cf) 
                   for (i,c), cf in zip(c_scores, cf_scores)]
        
        # Get top recommendations
        top_indices = [i for i,_ in sorted(combined, key=lambda x: -x[1])[1:top_n+1]]
        
        return movies.iloc[top_indices]['title'].tolist()

    except Exception as e:
        print(f"Error: {e}")
        return movies['title'].sample(top_n).tolist()

# Sample output
print(hybrid_recommend(user_id=1, movie_title='Gandhi'))

['Casablanca', 'The Bridge on the River Kwai', 'Gandhi, My Father', 'The Wind That Shakes the Barley', 'A Passage to India']


In [46]:
# %% [markdown]
# ### 9. Save All Models

# %%
import pickle, os

os.makedirs('model', exist_ok=True)

with open('model/content.pkl','wb') as f:
    pickle.dump({'vectorizer':cv,'similarity':content_similarity}, f)

with open('model/collab.pkl','wb') as f:
    pickle.dump({'nmf':nmf_model,'pred_df':pred_df}, f)