

In [2]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split


In [9]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]


In [10]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

def fetch_director(obj):
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            return [i['name']]
    return []

movies.dropna(inplace=True)
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)[:3]])
movies['crew'] = movies['crew'].apply(fetch_director)
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new_df = movies[['movie_id', 'title', 'tags']]
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [11]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()
similarity = cosine_similarity(vectors)

def content_recommend(movie):
    idx = new_df[new_df['title'] == movie].index[0]
    distances = similarity[idx]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:11]
    return [new_df.iloc[i[0]].title for i in movie_list]


In [12]:
ratings = pd.read_csv('data/ratings.dat', sep='::', engine='python',
                      names=['userId', 'movieId', 'rating', 'timestamp'],
                      encoding='ISO-8859-1')

movies_ml = pd.read_csv('data/movies.dat', sep='::', engine='python',
                        names=['movieId', 'title', 'genres'],
                        encoding='ISO-8859-1')

ratings.drop('timestamp', axis=1, inplace=True)


In [13]:
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)
model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x28117fecdf0>

In [14]:
from difflib import get_close_matches
import numpy as np

def hybrid_recommend(movie_title, user_id, alpha=0.6):
    movie_title_input = movie_title.strip().lower()

    # Try case-insensitive exact match
    match_df = new_df[new_df['title'].str.lower().str.strip() == movie_title_input]

    if match_df.empty:
        # Fuzzy match fallback (lowercase titles)
        all_titles = new_df['title'].str.lower().tolist()
        close_matches = get_close_matches(movie_title_input, all_titles, n=1, cutoff=0.4)
        if close_matches:
            matched_lower = close_matches[0]
            match_df = new_df[new_df['title'].str.lower() == matched_lower]
            print(f"⚠️ Using closest match: {match_df.iloc[0]['title']}")
        else:
            return "❌ Movie not found"

    # Get index of matched movie
    idx = match_df.index[0]

    # Content-based similarity
    sim_scores = list(enumerate(similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:21]
    movie_indices = [i[0] for i in sim_scores]
    movie_ids = new_df.iloc[movie_indices].index.tolist()

    # Collaborative filtering predictions
    collab_scores = [model.predict(user_id, m).est for m in movie_ids]

    # Combine scores
    content_scores = np.array([s[1] for s in sim_scores])
    collab_scores = np.array(collab_scores)
    hybrid_scores = alpha * content_scores + (1 - alpha) * collab_scores

    top_indices = np.argsort(hybrid_scores)[::-1][:10]
    return new_df.iloc[[movie_indices[i] for i in top_indices]][['title']]


In [15]:
hybrid_recommend('Avatar', user_id=10)


Unnamed: 0,title
2405,Aliens
838,Alien³
47,Star Trek Into Darkness
300,Starship Troopers
1533,Moonraker
3161,Alien
3735,Cargo
1080,Dune
4338,Silent Running
158,Star Trek


In [16]:
# See similar movies (content only)
content_recommend('Inception')



['The Helix... Loaded',
 'The Count of Monte Cristo',
 'Flatliners',
 'Cypher',
 'Transformers: Revenge of the Fallen',
 'Hesher',
 'Looper',
 'Duplex',
 'Timecop',
 '9']

In [17]:
# Hybrid personalized recommendation
hybrid_recommend('Inception', user_id=42)

Unnamed: 0,title
1272,Pandorum
1788,Flatliners
3005,The Lawnmower Man
134,Mission: Impossible - Rogue Nation
4407,The Helix... Loaded
1934,Sheena
1268,Duplex
4493,Hav Plenty
1194,The Count of Monte Cristo
920,Congo


In [18]:
hybrid_recommend('avengers', user_id=10)


⚠️ Using closest match: The Avengers


Unnamed: 0,title
26,Captain America: Civil War
232,The Wolverine
7,Avengers: Age of Ultron
215,Fantastic 4: Rise of the Silver Surfer
203,X2
85,Captain America: The Winter Soldier
182,Ant-Man
101,X-Men: First Class
31,Iron Man 3
511,X-Men


In [19]:
hybrid_recommend('interstellar', user_id=10)


Unnamed: 0,title
149,Armageddon
1533,Moonraker
3735,Cargo
1711,Space Pirate Captain Harlock
2969,2001: A Space Odyssey
4338,Silent Running
1080,Dune
158,Star Trek
1761,The Right Stuff
2017,Spaceballs


In [20]:
hybrid_recommend('spider man', user_id=10)


⚠️ Using closest match: Spider-Man


Unnamed: 0,title
232,The Wolverine
30,Spider-Man 2
1297,Superman III
5,Spider-Man 3
1536,Arachnophobia
511,X-Men
1120,21 Jump Street
7,Avengers: Age of Ultron
3221,The DUFF
2919,High School Musical 3: Senior Year
