# Movie Recommender — Build & Save Notebook

This notebook builds **content-based** (TF-IDF) and **collaborative** (Surprise SVD) recommenders, saves model artifacts to `/mnt/data/recommender_models/`, and demonstrates usage.

**Notes before running**
- Place `movies_metadata.csv` in `/mnt/data/` or the notebook's working directory. If you want the SVD collaborative model, also place `ratings.csv` or `ratings_small.csv` in the same folder.
- The environment may or may not allow `pip install scikit-surprise`. If the install fails (for example, because there is no internet access), the notebook will still build the content-based model and skip SVD.
- Your previously uploaded notebook is available at:

```
/mnt/data/movie-recommender-systems (1).ipynb
```
You can open or inspect it for reference.

In [None]:
# Optional: install scikit-surprise if you want collaborative SVD (internet required)
# Uncomment if you have internet access
# !pip install scikit-surprise joblib

# Standard imports
%matplotlib inline
import os, warnings, json, glob
warnings.simplefilter('ignore')

import pandas as pd, numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import joblib, pickle

# Try to import surprise; the collaborative (SVD) section runs only when this succeeds.
# The broad `except Exception` is deliberate best-effort: whatever goes wrong while
# importing the optional dependency, the rest of the notebook must still be runnable.
try:
    from surprise import Reader, Dataset, SVD
    from surprise.model_selection import train_test_split
    from surprise import accuracy
    SURPRISE_AVAILABLE = True
except Exception as surprise_import_error:
    SURPRISE_AVAILABLE = False
    # Report why the import failed instead of silently swallowing the error.
    print('Surprise import failed:', repr(surprise_import_error))

print('Surprise available:', SURPRISE_AVAILABLE)

In [None]:
# Locate movies and ratings files (search common locations)
def _find_csvs(patterns):
    """Return the paths matching `patterns`, without duplicates.

    Patterns are tried in the order given (highest priority first) and the
    matches of each pattern are sorted, so the first element of the result
    does not depend on the order in which glob happens to list files.
    """
    found = []
    for pattern in patterns:
        for hit in sorted(glob.glob(pattern, recursive=True)):
            if hit not in found:
                found.append(hit)
    return found


# Prefer the exact metadata file name, then metadata variants, and only then
# any other movies*.csv, so an unrelated movies file is never picked over
# movies_metadata.csv.
candidates = _find_csvs([
    '/mnt/data/movies_metadata.csv',
    'movies_metadata.csv',
    '/mnt/data/**/movies_metadata*.csv',
    'movies_metadata*.csv',
    '/mnt/data/**/movies*.csv',
    'movies*.csv',
])
movies_path = candidates[0] if candidates else None

# Top-level /mnt/data files win over files found in sub-directories.
ratings_candidates = _find_csvs([
    '/mnt/data/ratings*.csv',
    '/mnt/data/**/ratings*.csv',
    'ratings*.csv',
])
ratings_path = ratings_candidates[0] if ratings_candidates else None

print('Movies path found ->', movies_path)
print('Ratings path found ->', ratings_path)
if movies_path is None:
    raise FileNotFoundError('movies metadata CSV not found. Upload movies_metadata.csv to /mnt/data or working directory before running.')

In [None]:
# Load movies metadata robustly and prepare fields
def load_movies(path):
    """Read the movies CSV at `path` into a DataFrame, skipping malformed rows.

    The Python parsing engine is tried first; if that attempt raises, the
    file is read again with the default engine and `low_memory=False`.
    """
    tolerant = {'on_bad_lines': 'skip'}
    try:
        return pd.read_csv(path, engine='python', **tolerant)
    except Exception:
        return pd.read_csv(path, low_memory=False, **tolerant)

md = load_movies(movies_path)
print('Loaded movies:', md.shape)

# Guarantee the columns used below exist, even for a reduced CSV.
for col in ['genres','overview','title','id','release_date']:
    if col not in md.columns:
        md[col] = np.nan


def _parse_genres(raw):
    """Turn one raw `genres` cell into a list of genre-name strings.

    Accepts either an already-parsed list or its string form (a Python/JSON-like
    literal such as "[{'id': 1, 'name': 'Drama'}]"). Anything that cannot be
    parsed, or that is not a list, yields an empty list instead of raising.
    """
    if isinstance(raw, str):
        try:
            raw = literal_eval(raw)
        except (ValueError, SyntaxError):
            # Not a valid literal: treat as "no genres" rather than aborting the whole load.
            return []
    if not isinstance(raw, list):
        return []
    names = []
    for entry in raw:
        if isinstance(entry, dict) and 'name' in entry:
            names.append(str(entry['name']))
        elif isinstance(entry, str):
            names.append(entry)
    return names


# clean genres (expects JSON-like lists as strings)
md['genres'] = md['genres'].fillna('[]').apply(_parse_genres)
md['overview'] = md['overview'].fillna('').astype(str)
md['title'] = md['title'].fillna('').astype(str)
# Fill missing ids BEFORE casting: astype(str) would turn NaN into the literal
# string 'nan', after which fillna('') has nothing left to fill.
md['id'] = md['id'].fillna('').astype(str)

# create a 'soup' of text features for content-based model
md['soup'] = md['genres'].map(' '.join) + ' ' + md['overview']

# show few rows
md[['title','release_date','genres']].head()

In [None]:
# Build TF-IDF content-based model and save artifacts
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(md['soup'].values.astype('U'))
# NOTE(review): this materialises a full movies-by-movies similarity matrix;
# for a very large catalogue it may not fit in memory — confirm for your data size.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# mapping title -> row position (handle duplicates by taking first).
# Series.drop_duplicates() compares the VALUES (row positions, all unique) and
# therefore never removes repeated titles; de-duplicate on the index instead so
# that indices[title] is always a single position.
title_positions = pd.Series(np.arange(len(md)), index=md['title'])
indices = title_positions[~title_positions.index.duplicated(keep='first')]

out_dir = '/mnt/data/recommender_models'
os.makedirs(out_dir, exist_ok=True)
joblib.dump(tfidf, os.path.join(out_dir, 'tfidf_vectorizer.joblib'))
joblib.dump(cosine_sim, os.path.join(out_dir, 'cosine_sim.joblib'), compress=3)
md.to_pickle(os.path.join(out_dir, 'movies_metadata_df.pkl'))
indices.to_pickle(os.path.join(out_dir, 'title_indices.pkl'))

print('Saved content-based artifacts to', out_dir)
print('Files:', os.listdir(out_dir))

In [None]:
# Helper: get content-based recommendations by title
def get_content_recommendations(title, topn=10):
    """Return the `topn` movies most similar to `title`.

    Reads the module-level `indices` (title -> row position), `md` (movies
    frame) and `cosine_sim` (pairwise similarity matrix).

    Parameters
    ----------
    title : str
        Exact title, or a case-insensitive literal substring of a title if no
        exact match exists.
    topn : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        Columns `title`, `release_date`, `id`, `score`, ordered by descending
        score; `None` when no title matches.
    """
    if title in indices.index:
        idx = indices[title]
        # A repeated title makes the lookup return a Series; use the first hit.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]
    else:
        # regex=False: titles often contain characters such as '(' or '+'
        # that would otherwise be interpreted as a regular expression.
        matches = md[md['title'].str.contains(title, case=False, na=False, regex=False)]
        if matches.shape[0] == 0:
            return None
        idx = matches.index[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable sort keeps ties in ascending row order, like sorted(..., reverse=True).
    order = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly: with ties (or an all-zero row) it is not
    # guaranteed to be the first element after sorting.
    order = order[order != idx][:max(int(topn), 0)]

    results = md.iloc[order][['title','release_date','id']].copy()
    results['score'] = scores[order]
    return results.reset_index(drop=True)

# Demo
# Missing titles were replaced by '' when the metadata was prepared, so dropna()
# alone does not skip them; also filter out blank titles before picking a sample.
titled = md['title'].dropna()
titled = titled[titled.astype(str).str.strip() != '']
sample = titled.iloc[0]
print('Sample movie:', sample)
demo_recs = get_content_recommendations(sample, topn=5)
# The helper returns None when nothing matches; avoid calling .to_string on None.
print(demo_recs.to_string(index=False) if demo_recs is not None else 'No recommendations found.')

In [None]:
# Optional: build Surprise SVD collaborative model if ratings CSV present and surprise available
svd_path = os.path.join(out_dir, 'svd_model.pkl')


def _detect_rating_columns(columns):
    """Map the roles 'user', 'item' and 'rating' to actual column names.

    Well-known header names are matched case-insensitively. If any role cannot
    be identified by name (or two roles resolve to the same column), the first
    three columns are used by position, in the order user, item, rating.

    Raises
    ------
    ValueError
        If fewer than three columns are available.
    """
    columns = list(columns)
    if len(columns) < 3:
        raise ValueError('ratings CSV needs at least 3 columns (user, item, rating); found: %r' % (columns,))
    by_lower = {str(c).lower(): c for c in columns}
    aliases = {
        'user': ('userid', 'user_id', 'user'),
        'item': ('movieid', 'movie_id', 'itemid', 'item_id', 'item', 'movie'),
        'rating': ('rating', 'ratings', 'score'),
    }
    detected = {
        role: next((by_lower[name] for name in names if name in by_lower), None)
        for role, names in aliases.items()
    }
    if None in detected.values() or len(set(detected.values())) < 3:
        return dict(zip(('user', 'item', 'rating'), columns[:3]))
    return detected


if SURPRISE_AVAILABLE and ratings_path is not None:
    try:
        ratings = pd.read_csv(ratings_path)
    except Exception:
        ratings = pd.read_csv(ratings_path, engine='python', on_bad_lines='skip')
    print('Ratings loaded:', ratings.shape)
    # try to detect columns
    mapping = _detect_rating_columns(ratings.columns)
    # Select first, then relabel: avoids duplicate labels if the file already
    # has a differently-placed column with one of the target names.
    ratings_ren = ratings[[mapping['user'], mapping['item'], mapping['rating']]].copy()
    ratings_ren.columns = ['userId', 'movieId', 'rating']
    # Coerce unparsable ratings to NaN so they are dropped instead of crashing the cast.
    ratings_ren['rating'] = pd.to_numeric(ratings_ren['rating'], errors='coerce')
    ratings_ren = ratings_ren.dropna()
    ratings_ren['userId'] = ratings_ren['userId'].astype(str)
    ratings_ren['movieId'] = ratings_ren['movieId'].astype(str)
    ratings_ren['rating'] = ratings_ren['rating'].astype(float)

    if ratings_ren.empty:
        # min()/max() of an empty column are NaN, which is not a usable rating scale.
        print('Skipping SVD. Ratings CSV contains no usable rows.')
    else:
        reader = Reader(rating_scale=(ratings_ren['rating'].min(), ratings_ren['rating'].max()))
        data = Dataset.load_from_df(ratings_ren[['userId','movieId','rating']], reader)
        trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

        algo = SVD(n_factors=100, n_epochs=20, random_state=42)
        print('Training SVD (may take time)...')
        algo.fit(trainset)
        preds = algo.test(testset)
        rmse = accuracy.rmse(preds, verbose=False)
        print('SVD RMSE:', rmse)
        with open(svd_path,'wb') as f:
            pickle.dump(algo, f)
        print('Saved SVD model to', svd_path)
else:
    print('Skipping SVD. Either Surprise not installed or ratings CSV missing.')

In [None]:
# Quick usage examples (load saved artifacts and query)
# Load artifacts
import joblib, pandas as pd, pickle
import os  # needed for the existence check below
tfidf = joblib.load('/mnt/data/recommender_models/tfidf_vectorizer.joblib')
cosine_sim = joblib.load('/mnt/data/recommender_models/cosine_sim.joblib')
md = pd.read_pickle('/mnt/data/recommender_models/movies_metadata_df.pkl')
indices = pd.read_pickle('/mnt/data/recommender_models/title_indices.pkl')

# Example: content recommendations
print(get_content_recommendations(sample, topn=5))

# Example: SVD prediction (if saved)
svd_path = '/mnt/data/recommender_models/svd_model.pkl'
if os.path.exists(svd_path):
    # SECURITY: unpickling can execute arbitrary code; only load model files
    # that were written by this notebook or come from a trusted source.
    # The `with` block closes the file handle, which a bare open() would leak.
    with open(svd_path, 'rb') as f:
        algo = pickle.load(f)
    print('Example SVD predict:', algo.predict(uid=str(1), iid=str(1)))

---
**After running this notebook** the artifacts will be in `/mnt/data/recommender_models/`.  
You can download them from the Files pane or via links if your environment supports it.

**Uploaded notebook (for reference):** `/mnt/data/movie-recommender-systems (1).ipynb`

To produce the artifacts, upload the needed CSVs (`movies_metadata.csv` and optionally `ratings.csv`) into `/mnt/data/` and run all cells here, or download this notebook and run it locally or in Colab.

---