In [47]:
#///////////////////////////////
#DATA CLEANING AND PREPROCESSING
#///////////////////////////////

import pandas as pd
import re

# Load datasets
imdb_df = pd.read_csv('data/movie_metadata.csv')
ml_movies_df = pd.read_csv('data/movies.csv')
ml_ratings_df = pd.read_csv('data/ratings.csv')
ml_tags_df = pd.read_csv('data/tags.csv')
ml_links_df = pd.read_csv('data/links.csv')

# Extract IMDb ID from movie_imdb_link (e.g., tt1234567 → 1234567)
imdb_df['imdbId'] = imdb_df['movie_imdb_link'].apply(
    lambda x: int(re.search(r'tt(\d+)', str(x)).group(1)) if pd.notnull(x) and re.search(r'tt(\d+)', str(x)) else None
)

# Drop null imdbId rows and convert to int
imdb_df_cleaned = imdb_df.dropna(subset=['imdbId']).copy()
imdb_df_cleaned['imdbId'] = imdb_df_cleaned['imdbId'].astype(int)

# Merge MovieLens links with IMDB on imdbId
merged_df = pd.merge(ml_links_df, imdb_df_cleaned, on='imdbId', how='inner')

# Merge MovieLens movies metadata
full_merged_df = pd.merge(merged_df, ml_movies_df, on='movieId', how='inner')

# Rename and select relevant columns
full_merged_df = full_merged_df.rename(columns={
    'title': 'movie_title',
    'genres_y': 'genres',
    'movie_title': 'imdb_title'
})

movie_features_df = full_merged_df[[
    'movieId', 'movie_title', 'genres', 'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name',
    'plot_keywords', 'imdb_score', 'title_year', 'language', 'country', 'imdbId'
]].drop_duplicates(subset='movieId')


In [48]:
#///////////////////////
#CONTENT BASED FILTERING
#///////////////////////

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fill NaNs and build combined string for each movie
movie_features_df_filled = movie_features_df.fillna('')
movie_features_df_filled['combined_features'] = (
    movie_features_df_filled['genres'] + ' ' +
    movie_features_df_filled['director_name'] + ' ' +
    movie_features_df_filled['actor_1_name'] + ' ' +
    movie_features_df_filled['actor_2_name'] + ' ' +
    movie_features_df_filled['actor_3_name'] + ' ' +
    movie_features_df_filled['plot_keywords']
)

# Vectorize using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_features_df_filled['combined_features'])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map titles to index
title_to_index = pd.Series(movie_features_df_filled.index, index=movie_features_df_filled['movie_title'])

# Recommendation function
def get_content_recommendations(title, top_n=10):
    idx = title_to_index.get(title)
    if idx is None:
        return f"Title '{title}' not found."
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    
    movie_indices = [i[0] for i in sim_scores]
    return movie_features_df_filled[['movie_title', 'genres']].iloc[movie_indices]

# Test example
get_content_recommendations("Toy Story (1995)")


Unnamed: 0,movie_title,genres
889,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2812,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
657,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
2206,Cars (2006),Animation|Children|Comedy
797,Big (1988),Comedy|Drama|Fantasy|Romance
1318,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
1211,Atlantis: The Lost Empire (2001),Adventure|Animation|Children|Fantasy
2961,Cars 2 (2011),Adventure|Animation|Children|Comedy|IMAX
3155,Cloud Atlas (2012),Drama|Sci-Fi|IMAX
2321,Ratatouille (2007),Animation|Children|Drama


In [49]:
#////////////////
#Collaborative Based Filtering
#///////////////

import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# Load your data
ratings_df = pd.read_csv('data/ratings.csv')
movies_df = pd.read_csv('data/movies.csv')

# Setup Surprise
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train SVD
model = SVD()
model.fit(trainset)

# Map movieId to title
movieId_to_title = pd.Series(movies_df.title.values, index=movies_df.movieId).to_dict()

# Recommendation function
def get_collab_recommendations(user_id, top_n=10):
    rated = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()
    all_movies = movies_df['movieId'].tolist()
    unseen = [m for m in all_movies if m not in rated]
    
    preds = [(m, model.predict(user_id, m).est) for m in unseen]
    top_preds = sorted(preds, key=lambda x: x[1], reverse=True)[:top_n]
    
    return [(movieId_to_title.get(mid, "Unknown"), round(score, 2)) for mid, score in top_preds]

# Example usage
print(get_collab_recommendations(user_id=1, top_n=10))




[('Casino (1995)', 5.0), ('Shawshank Redemption, The (1994)', 5.0), ('Ghost in the Shell (Kôkaku kidôtai) (1995)', 5.0), ('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 5.0), ('Philadelphia Story, The (1940)', 5.0), ('North by Northwest (1959)', 5.0), ('Casablanca (1942)', 5.0), ('Cinema Paradiso (Nuovo cinema Paradiso) (1989)', 5.0), ("Miller's Crossing (1990)", 5.0), ('Graduate, The (1967)', 5.0)]


In [50]:
#////////////
#Search Based
#////////////

import pandas as pd

# Load metadata
movies_df = pd.read_csv('data/movies.csv')
metadata_df = pd.read_csv('data/movie_metadata.csv')

# Clean titles
movies_df['title'] = movies_df['title'].str.strip()
metadata_df['movie_title'] = metadata_df['movie_title'].str.strip()

# Merge
merged_df = pd.merge(movies_df, metadata_df, left_on='title', right_on='movie_title', how='left')

# Recommendation based on search keywords
def recommend_by_search_history(keywords, top_n=10):
    keywords = [kw.lower() for kw in keywords]
    
    def score_row(row):
        score = 0
        if any(kw in str(row.get('director_name', '')).lower() for kw in keywords):
            score += 2
        if any(kw in str(row.get('actor_1_name', '')).lower() for kw in keywords):
            score += 1
        if any(kw in str(row.get('actor_2_name', '')).lower() for kw in keywords):
            score += 1
        if any(kw in str(row.get('actor_3_name', '')).lower() for kw in keywords):
            score += 1
        if any(kw in str(row.get('genres', '')).lower() for kw in keywords):
            score += 2
        if any(kw in str(row.get('plot_keywords', '')).lower() for kw in keywords):
            score += 1
        return score

    merged_df['search_score'] = merged_df.apply(score_row, axis=1)
    top_results = merged_df.sort_values(by='search_score', ascending=False).head(top_n)
    
    return top_results[['title', 'director_name', 'actor_1_name', 'genres_x', 'search_score']]

# Example use
print(recommend_by_search_history(['nolan', 'di caprio', 'thriller'], top_n=10))


                                                  title director_name  \
0                                      Toy Story (1995)           NaN   
6497                            Ocean's Thirteen (2007)           NaN   
6490                                  Mr. Brooks (2007)           NaN   
6491  Librarian: Return to King Solomon's Mines, The...           NaN   
6492         Librarian: Quest for the Spear, The (2004)           NaN   
6493                                    Fay Grim (2006)           NaN   
6494  I'm a Cyborg, But That's OK (Saibogujiman kwen...           NaN   
6495                                  Breed, The (2006)           NaN   
6496                                    Cashback (2006)           NaN   
6498                                  Them (Ils) (2006)           NaN   

     actor_1_name                                     genres_x  search_score  
0             NaN  Adventure|Animation|Children|Comedy|Fantasy             0  
6497          NaN                     

In [51]:
#//////////////////
#HYBRID RECOMMENDER COMBINING ALL-BASED FILTERINGS
#//////////////////

import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import re

# Load datasets
imdb_df = pd.read_csv('data/movie_metadata.csv')
ml_movies_df = pd.read_csv('data/movies.csv')
ml_links_df = pd.read_csv('data/links.csv')

# Extract imdbId from movie_imdb_link
imdb_df['imdbId'] = imdb_df['movie_imdb_link'].apply(
    lambda x: int(re.search(r'tt(\d+)', str(x)).group(1)) if pd.notnull(x) and re.search(r'tt(\d+)', str(x)) else None
)
imdb_df_cleaned = imdb_df.dropna(subset=['imdbId']).copy()
imdb_df_cleaned['imdbId'] = imdb_df_cleaned['imdbId'].astype(int)

# Merge MovieLens links with IMDB metadata on imdbId
merged_df = pd.merge(ml_links_df, imdb_df_cleaned, on='imdbId', how='inner')

# Merge with MovieLens movies metadata on movieId
full_merged_df = pd.merge(merged_df, ml_movies_df, on='movieId', how='inner')
full_merged_df['title'] = full_merged_df['movie_title']


# ---------------------------
# Collaborative Filtering
# ---------------------------

def train_collaborative_model():
    reader = Reader(rating_scale=(0.5, 5.0))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)
    return algo

cf_model = train_collaborative_model()

def get_cf_scores(user_id):
    movie_ids = merged_df['movieId'].unique()
    scores = {}
    for movie_id in movie_ids:
        try:
            pred = cf_model.predict(user_id, movie_id)
            scores[movie_id] = pred.est
        except:
            scores[movie_id] = 0
    return scores

# ---------------------------
# Content-Based Filtering
# ---------------------------

def get_cb_scores(movie_title, top_n=20):
    tfidf = TfidfVectorizer(stop_words='english')
    merged_df['combined'] = (
        merged_df['genres'] + ' ' +
        merged_df['plot_keywords'] + ' ' +
        merged_df['actor_1_name'] + ' ' +
        merged_df['director_name']
    )
    merged_df['combined'] = merged_df['combined'].fillna('')
    tfidf_matrix = tfidf.fit_transform(merged_df['combined'])
    

    
    idx = merged_df[merged_df['movie_title'] == movie_title].index
    if len(idx) == 0:
        return {}
    
    cosine_sim = linear_kernel(tfidf_matrix[idx[0]], tfidf_matrix).flatten()
    scores = dict(zip(merged_df['movieId'], cosine_sim))
    return scores

# ---------------------------
# Search-Based Filtering
# ---------------------------

def get_search_scores(keywords):
    keywords = [k.lower() for k in keywords]
    
    def score_row(row):
        content = f"{row['movie_title']} {row['genres']} {row['plot_keywords']} {row['actor_1_name']} {row['actor_2_name']} {row['actor_3_name']} {row['director_name']}".lower()
        return sum(1 for k in keywords if k in content)

    merged_df['search_score'] = merged_df.apply(score_row, axis=1)
    scores = dict(zip(merged_df['movieId'], merged_df['search_score']))
    return scores

# ---------------------------
# Hybrid Recommendation
# ---------------------------

def hybrid_recommend(user_id, search_keywords, liked_movie_title=None, top_n=10):
    cf_scores = get_cf_scores(user_id)
    cb_scores = get_cb_scores(liked_movie_title) if liked_movie_title else {}
    search_scores = get_search_scores(search_keywords)
    
    all_movie_ids = set(cf_scores.keys()).union(cb_scores.keys(), search_scores.keys())
    
    hybrid_scores = {}
    for movie_id in all_movie_ids:
        score = (
            0.4 * cf_scores.get(movie_id, 0) +
            0.3 * cb_scores.get(movie_id, 0) +
            0.3 * search_scores.get(movie_id, 0)
        )
        hybrid_scores[movie_id] = score
    
    top_movies = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    recommendations = merged_df[merged_df['movieId'].isin([mid for mid, _ in top_movies])]
    
    return recommendations[['movie_title', 'genres', 'director_name', 'actor_1_name']]

# ---------------------------
# Example usage
# ---------------------------

# Sample user & history
user_id = 5
keywords = ['space', 'nolan', 'dicaprio']
liked_movie = 'Interstellar'



recs = hybrid_recommend(user_id, keywords, liked_movie, top_n=10)
print(recs)


                   movie_title                            genres  \
22         The Usual Suspects       Crime|Drama|Mystery|Thriller   
286                    Aliens            Action|Adventure|Sci-Fi   
423         L.A. Confidential       Crime|Drama|Mystery|Thriller   
711              Office Space                             Comedy   
1179                  Memento                   Mystery|Thriller   
2267             The Departed               Crime|Drama|Thriller   
2481          The Dark Knight        Action|Crime|Drama|Thriller   
2820                Inception   Action|Adventure|Sci-Fi|Thriller   
3311  The Wolf of Wall Street       Biography|Comedy|Crime|Drama   
3336             Interstellar             Adventure|Drama|Sci-Fi   

          director_name         actor_1_name  
22         Bryan Singer         Kevin Spacey  
286       James Cameron        Michael Biehn  
423       Curtis Hanson         Kevin Spacey  
711          Mike Judge            Gary Cole  
1179  Christophe