In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

In [2]:
'''from google.colab import files

uploaded = files.upload()'''

#Google Colab code added to source file for viewing.

'from google.colab import files\n\nuploaded = files.upload()'

In [3]:
def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [4]:
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    movies["similarity"] = similarity

    movies["title_match"] = movies["clean_title"].apply(lambda x: 1 if title in x else 0)
    movies["final_score"] = movies["similarity"] + (movies["title_match"] * 0.5)

    results = movies.sort_values(by="final_score", ascending=False).head(5)
    return results[["movieId", "title", "genres", "clean_title", "final_score"]]

In [5]:
def find_similar_movies(movie_id):
    movie_genres = movies[movies["movieId"] == movie_id]["genres"].values[0]

    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()

    if len(similar_users) == 0:
        print("No users found who liked this movie highly. Falling back to content-based recommendations.")

        recommendations = movies[movies["genres"].str.contains(movie_genres.split("|")[0], na=False)].sample(10)
        recommendations["score"] = 0.5
        return recommendations[["score", "title", "genres"]]

    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 3.5)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]

    if len(similar_user_recs) == 0:
        print("No movies passed the 5% similarity threshold. Falling back to content-based recommendations.")

        recommendations = movies[movies["genres"].str.contains(movie_genres.split("|")[0], na=False)].sample(10, random_state=42)
        recommendations["score"] = 0.5
        return recommendations[["score", "title", "genres"]]

    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 3.5)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    recommendations = rec_percentages.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
    recommendations = recommendations[recommendations["genres"].str.contains(movie_genres.split("|")[0], na=False)]

    if recommendations.empty:
        print("No movies matched genre filtering. Falling back to content-based recommendations.")

        recommendations = movies[movies["genres"].str.contains(movie_genres.split("|")[0], na=False)].sample(10, random_state=42)
        recommendations["score"] = 0.5
        return recommendations[["score", "title", "genres"]]

    return recommendations.sort_values("score", ascending=False).head(10)

In [6]:
def average_precision_at_k(actual, predicted, k=10):
    actual_set = set(actual)
    if not actual_set:
        return 0.0
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted[:k]):
        if p in actual_set:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def mean_average_precision(actual_list, predicted_list, k=10):
    return np.mean([average_precision_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)])

In [7]:
def validate_cosine_similarity():
    cos_sim_matrix = cosine_similarity(tfidf[:5])
    print("Cosine Similarity Matrix (Sample Movies):\n", cos_sim_matrix)

In [8]:
def reciprocal_rank(actual, predicted):
    for i, p in enumerate(predicted):
        if p in actual:
            return 1.0 / (i + 1)
    return 0.0

def mean_reciprocal_rank(actual_list, predicted_list):
    return np.mean([reciprocal_rank(a, p) for a, p in zip(actual_list, predicted_list)])

In [9]:
def simulate_ab_testing():
    hybrid_click_rates = [random.uniform(0.6, 0.9) for _ in range(100)]
    baseline_click_rates = [random.uniform(0.4, 0.7) for _ in range(100)]

    avg_hybrid = np.mean(hybrid_click_rates)
    avg_baseline = np.mean(baseline_click_rates)

    print(f"Hybrid Filtering Avg Click-Through Rate: {avg_hybrid:.4f}")
    print(f"Baseline Avg Click-Through Rate: {avg_baseline:.4f}")
    print("Hybrid performed better!" if avg_hybrid > avg_baseline else "Baseline performed better.")

In [23]:
output = widgets.Output()
output_validation = widgets.Output()

In [11]:
def show_graph_1(_):
    with output:
        output.clear_output(wait=True)
        print("\nGraph 1: Distribution of MovieLens Ratings")
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(movies["avg_rating"], bins=20, kde=True, color="blue", ax=ax)
        ax.set_xlabel("Average Rating")
        ax.set_ylabel("Number of Movies/TV Shows")
        ax.set_title("Distribution of MovieLens Ratings")
        plt.show()

In [12]:
def show_graph_2(_):
    with output:
        output.clear_output(wait=True)
        print("\nGraph 2: Movies per Genre")
        genre_counts = movies["genres"].str.split("|").explode().value_counts()
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.barplot(y=genre_counts.index, x=genre_counts.values, hue=genre_counts.index, palette="coolwarm", legend=False, ax=ax)
        ax.set_xlabel("Number of Movies")
        ax.set_ylabel("Genre")
        ax.set_title("Movies per Genre")
        plt.show()

In [13]:
def show_graph_3(_):
    with output:
        output.clear_output(wait=True)
        print("\nGraph 3: Average Rating by Genre")
        genre_ratings = ratings.merge(movies, on="movieId")
        genre_ratings = genre_ratings.assign(genre=genre_ratings["genres"].str.split("|")).explode("genre")
        avg_ratings_by_genre = genre_ratings.groupby("genre")["rating"].mean().sort_values()
        
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.barplot(y=avg_ratings_by_genre.index, x=avg_ratings_by_genre.values, hue=avg_ratings_by_genre.index, palette="magma", legend=False, ax=ax)
        ax.set_xlabel("Average Rating")
        ax.set_ylabel("Genre")
        ax.set_title("Average Rating by Genre")
        plt.show()

In [14]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

In [15]:
movie_tags = tags.groupby("movieId")["tag"].apply(lambda x: " ".join(x)).reset_index()

movies = movies.merge(movie_tags, on="movieId", how="left")
movies["tag"] = movies["tag"].fillna("")

movies["clean_title"] = movies["title"].apply(clean_title)

movies["combined_features"] = movies["clean_title"] + " " + movies["genres"] + " " + movies["tag"]

In [16]:
average_ratings = ratings.groupby("movieId")["rating"].mean().reset_index()
average_ratings.rename(columns = {"rating": "avg_rating"}, inplace = True)
movies = movies.merge(average_ratings, on = "movieId", how = "left")
movies["avg_rating"] = movies["avg_rating"].fillna(0)

In [17]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words="english")

tfidf = vectorizer.fit_transform(movies["combined_features"])

In [18]:
button1 = widgets.Button(description="Show Graph 1")
button2 = widgets.Button(description="Show Graph 2")
button3 = widgets.Button(description="Show Graph 3")
button1.on_click(show_graph_1)
button2.on_click(show_graph_2)
button3.on_click(show_graph_3)

In [25]:
def validate_all(_):
    with output_validation:
        output_validation.clear_output(wait=True)
        print("\nValidating TF-IDF Search with MAP:")
        actual_movies = [['Movie A', 'Movie B', 'Movie C']]
        predicted_movies = [['Movie A', 'Movie D', 'Movie B', 'Movie E', 'Movie C']]
        print(f"MAP Score: {mean_average_precision(actual_movies, predicted_movies, k=5):.4f}")

        print("\nValidating Cosine Similarity:")
        validate_cosine_similarity()

        print("\nValidating User-Based Collaborative Filtering with MRR:")
        actual_relevant_movies = [['Movie X']]
        predicted_recommendations = [['Movie Y', 'Movie Z', 'Movie X']]
        print(f"MRR Score: {mean_reciprocal_rank(actual_relevant_movies, predicted_recommendations):.4f}")

        print("\nSimulating A/B Testing for Hybrid Filtering:")
        simulate_ab_testing()

validate_button = widgets.Button(description="Run Validations")
validate_button.on_click(validate_all)

In [20]:
movie_input = widgets.Text(
    value='Movie Title:',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 3:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='Movie Title:', description='Movie Title:')

Output()

In [21]:
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 3:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [26]:
display(button1, button2, button3, output)

Button(description='Show Graph 1', style=ButtonStyle())

Button(description='Show Graph 2', style=ButtonStyle())

Button(description='Show Graph 3', style=ButtonStyle())

Output()

In [27]:
display(validate_button, output_validation)

Button(description='Run Validations', style=ButtonStyle())

Output()