In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored in Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Import necessary libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import difflib

Load Dataset

In [None]:
# Read the three CSV tables from the mounted Drive folder, in the order
# movies, ratings, tags.
movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/movie dataset/movies.csv",
        "/content/drive/MyDrive/movie dataset/ratings.csv",
        "/content/drive/MyDrive/movie dataset/tags.csv",
    )
)

# Print each table's (rows, columns) shape as a quick sanity check.
for label, frame in (("Movies:", movies), ("Ratings:", ratings), ("Tags:", tags)):
    print(label, frame.shape)

Movies: (10, 3)
Ratings: (31, 4)
Tags: (5, 4)


PreProcessing

In [None]:
def parse_genres(g):
    """Split a pipe-delimited genre string into a list of lower-case genres.

    Parameters
    ----------
    g : str or missing value
        Raw genre field such as "Action|Sci-Fi".

    Returns
    -------
    list of str
        Stripped, lower-cased genre names in their original order. An empty
        list is returned for a missing value, for an empty or whitespace-only
        string, and for the "(no genres listed)" placeholder.
    """
    if pd.isna(g):
        return []
    text = g.strip()
    if not text or text == "(no genres listed)":
        return []
    # Drop empty pieces left behind by doubled or trailing '|' separators so
    # callers never receive '' as a genre.
    pieces = (part.strip().lower() for part in text.split('|'))
    return [piece for piece in pieces if piece]

# Parsed genres as a list per movie, plus a space-joined form of the same list.
movies['genres_list'] = movies['genres'].map(parse_genres)
movies['genres_str'] = movies['genres_list'].map(' '.join)

# Aggregate tags by movie.
# Blank out missing tags before the string cast: on a missing value,
# astype(str) can produce the literal text "nan", which would then be joined
# into the movie text as if it were a real tag.
tags['tag'] = tags['tag'].fillna("").astype(str).str.lower()
# Leave the blanked rows out of the join so they contribute nothing; a movie
# whose tags were all missing is simply absent from tags_agg.
tags_agg = (
    tags[tags['tag'] != ""]
    .groupby('movieId')['tag']
    .agg(" ".join)
    .reset_index()
)

# Left-join the aggregated tags onto the movies so untagged movies are kept,
# give those an empty tag string, then build the combined genre + tag text.
movies_cb = (
    pd.merge(movies, tags_agg, how="left", on="movieId")
    .assign(tag=lambda frame: frame['tag'].fillna(""))
    .assign(text=lambda frame: (frame['genres_str'] + " " + frame['tag']).str.strip())
)

Content-Based Filtering

In [None]:
# Vectorise each movie's combined genre/tag text with TF-IDF, using the
# vectoriser's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_cb['text'])

# Title -> row position lookup; if a title occurs more than once, the last
# occurrence wins.
titles = list(movies_cb['title'])
title_to_index = dict(zip(titles, range(len(titles))))

def best_title_match(query_title, cutoff=0.6):
    """Return the row index of the catalogue title closest to ``query_title``.

    Matching uses ``difflib.get_close_matches`` on case-folded text, so the
    query's capitalisation does not affect the result.

    Parameters
    ----------
    query_title : str
        Title (or approximate title) typed by the user.
    cutoff : float, default 0.6
        Minimum similarity passed to ``difflib.get_close_matches``.

    Returns
    -------
    int or None
        The ``title_to_index`` entry of the best-matching title, or ``None``
        when the query is empty, is not a string, or nothing reaches
        ``cutoff``.
    """
    if not isinstance(query_title, str) or not query_title.strip():
        return None
    query = query_title.strip().casefold()

    # Compare on case-folded titles but remember the original spelling, which
    # is the key used by title_to_index. Non-string titles are skipped because
    # difflib cannot compare them.
    folded_to_title = {}
    for title in titles:
        if isinstance(title, str):
            folded_to_title.setdefault(title.casefold(), title)

    matches = difflib.get_close_matches(query, list(folded_to_title), n=1, cutoff=cutoff)
    if matches:
        return title_to_index[folded_to_title[matches[0]]]
    return None

def recommend_content(movie_name, top_n=5):
    """Recommend movies whose genre/tag text is most similar to a given movie.

    Parameters
    ----------
    movie_name : str
        Title to look up; resolved to a catalogue row by ``best_title_match``.
    top_n : int, default 5
        Maximum number of recommendations. Values of zero or below yield an
        empty result.

    Returns
    -------
    pandas.DataFrame or str
        The ``title`` and ``genres`` columns of up to ``top_n`` movies, most
        similar first, never including the query movie itself. If no title
        matches, a "not found" message string is returned instead.
    """
    idx = best_title_match(movie_name)
    if idx is None:
        return f"❌ '{movie_name}' not found."
    sim = linear_kernel(tfidf_matrix[idx], tfidf_matrix).ravel()
    sim[idx] = -1  # push the query movie to the bottom of the ranking
    ranked = sim.argsort()[::-1]
    # Remove the query movie outright: relying on its -1 score alone lets it
    # reappear whenever top_n reaches the catalogue size.
    ranked = ranked[ranked != idx]
    # Slice from the front with a clamped count; a trailing slice such as
    # [-0:] would select every row when top_n is 0.
    top_idx = ranked[:max(int(top_n), 0)]
    return movies_cb.iloc[top_idx][['title','genres']]

 Collaborative Filtering

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Reshape the ratings into a user x movie table; pairs with no rating become 0.
user_movie_ratings = (
    ratings
    .pivot(values="rating", columns="movieId", index="userId")
    .fillna(0)
)

# Pairwise cosine similarity between user rows, labelled by userId on both axes.
user_similarity = cosine_similarity(user_movie_ratings)
user_ids = user_movie_ratings.index
user_similarity_df = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)

def recommend_collaborative(user_id, top_n=5):
    """Recommend movies to a user based on their single most similar user.

    Parameters
    ----------
    user_id : hashable
        Label of the target user in ``user_movie_ratings``.
    top_n : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame or str
        The ``title`` and ``genres`` columns of movies the most similar user
        rated above 0 and the target user has not rated (stored as 0),
        ordered by that user's rating, highest first. The frame is empty when
        there is no other user to compare against. If ``user_id`` is unknown,
        a "not found" message string is returned instead.
    """
    if user_id not in user_movie_ratings.index:
        return f"❌ User {user_id} not found."
    # Similarity of every other user to the target user.
    sim_scores = user_similarity_df[user_id].drop(user_id)
    if sim_scores.empty:
        # The target is the only user; idxmax would raise on an empty Series.
        return movies.iloc[0:0][['title','genres']]
    most_similar = sim_scores.idxmax()
    target_movies = user_movie_ratings.loc[user_id]
    similar_movies = user_movie_ratings.loc[most_similar]
    unseen = (target_movies == 0) & (similar_movies > 0)
    # mergesort is stable, so equal ratings are ordered deterministically.
    recs = similar_movies[unseen].sort_values(ascending=False, kind='mergesort').head(top_n)
    # Filtering movies with isin() alone returns rows in catalogue order and
    # throws the rating order away, so re-apply the rank afterwards.
    rank = {movie_id: pos for pos, movie_id in enumerate(recs.index)}
    picked = movies[movies['movieId'].isin(recs.index)]
    order = picked['movieId'].map(rank).to_numpy().argsort(kind='stable')
    return picked.iloc[order][['title','genres']]

In [None]:
# Demo: movies whose genre/tag text is closest to "Inception".
print("\n🎬 Content-based (similar to 'Inception'):")
content_recs = recommend_content("Inception")
print(content_recs)

# Demo: suggestions for user 1 drawn from their most similar user's ratings.
print("\n👤 Collaborative (recommendations for user 1):")
collab_recs = recommend_collaborative(1)
print(collab_recs)


🎬 Content-based (similar to 'Inception'):
                      title                   genres
7           Iron Man (2008)            Action|Sci-Fi
6  Avengers: Endgame (2019)  Action|Adventure|Sci-Fi
4     Shutter Island (2010)         Mystery|Thriller
3       The Prestige (2006)     Drama|Mystery|Sci-Fi
1       Interstellar (2014)   Adventure|Drama|Sci-Fi

👤 Collaborative (recommendations for user 1):
                 title                        genres
3  The Prestige (2006)          Drama|Mystery|Sci-Fi
8     Toy Story (1995)     Animation|Children|Comedy
9  Finding Nemo (2003)  Animation|Adventure|Children
