# DCS630
# Exercise 10.2
# Justin Pizzoferrato
# 5.17.25

In [29]:
from IPython.display import display, HTML

# Inject a small stylesheet so long lines wrap instead of overflowing
# (output <pre> blocks and code-cell input areas), e.g. for PDF export.
_wrap_css = '''
    <style>
        div.output_area pre {
            white-space: pre-wrap;
            word-wrap: break-word;
        }
        .code_cell .input_area {
            white-space: pre-wrap;
        }
    </style>
'''
display(HTML(_wrap_css))

In [31]:
# Import required libraries and load the MovieLens data

import os

import pandas as pd

# Locate the MovieLens "ml-latest-small" folder without hard-coding one user's
# home directory: honour the MOVIELENS_DIR environment variable when it is set,
# otherwise fall back to ~/Downloads/ml-latest-small.
data_dir = os.environ.get(
    'MOVIELENS_DIR',
    os.path.join(os.path.expanduser('~'), 'Downloads', 'ml-latest-small'),
)

# Paths to the movies and ratings files (kept as plain strings)
movies_path = os.path.join(data_dir, 'movies.csv')
ratings_path = os.path.join(data_dir, 'ratings.csv')

# Load the movies and ratings data into pandas DataFrames
movies_df = pd.read_csv(movies_path)
ratings_df = pd.read_csv(ratings_path)

# Preview the first few rows of each DataFrame to confirm successful load
print("Movies Dataset:")
print(movies_df.head())

print("\nRatings Dataset:")
print(ratings_df.head())

Movies Dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings Dataset:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [33]:
# Combine ratings with movie metadata
#
# Every rating row picks up the columns of its movie; rows are matched on
# 'movieId' (DataFrame.merge performs an inner join by default).
merged_df = ratings_df.merge(movies_df, on='movieId')

# Show the first rows to confirm the combined structure
print("Merged Dataset:")
print(merged_df.head())

# Count nulls per column in the merged data
print("\nMissing values:")
print(merged_df.isna().sum())

Merged Dataset:
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  

Missing values:
userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64


In [53]:
# Build the user x movie ratings matrix
#
# One row per userId, one column per title; a cell holds that user's rating
# and is NaN where the user has not rated the movie. pivot_table aggregates
# with the mean by default, so repeated (userId, title) pairs are averaged.
ratings_matrix = merged_df.pivot_table(values='rating',
                                       index='userId',
                                       columns='title')

# Report the dimensions of the matrix
print(f"User-Movie Ratings Matrix Shape: {ratings_matrix.shape}")

# Show only the first 5 users and first 10 titles to avoid overflow on export
ratings_matrix.head().iloc[:, :10]

User-Movie Ratings Matrix Shape: (610, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,


In [37]:
# Create a function to recommend similar movies based on cosine similarity

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity needs a fully numeric matrix, so unrated cells become 0
ratings_matrix_filled = ratings_matrix.fillna(0)

# Transpose so that each movie is a row, then compare every movie with every other
movie_similarity = cosine_similarity(ratings_matrix_filled.transpose())

# Label both axes with the movie titles so scores can be looked up by name
similarity_df = pd.DataFrame(data=movie_similarity,
                             index=ratings_matrix_filled.columns,
                             columns=ratings_matrix_filled.columns)

# Function to recommend similar movies
def recommend_movies(movie_title, top_n=5, similarity=None):
    """Return the ``top_n`` titles most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title, as it appears in the columns of the similarity matrix.
    top_n : int, default 5
        Maximum number of recommendations. Values <= 0 give an empty result.
    similarity : pandas.DataFrame, optional
        Title-by-title similarity matrix to search. When omitted, the
        notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series or str
        Similarity scores indexed by title, highest first, never containing
        ``movie_title`` itself. If the title is not a column of the matrix,
        a "not found" message string is returned instead.
    """
    if similarity is None:
        similarity = similarity_df

    if movie_title not in similarity.columns:
        return f"Movie '{movie_title}' not found in dataset."

    # Exclude the query title by label rather than by position: another title
    # can tie with it at similarity 1.0, so it is not guaranteed to sort first.
    scores = similarity[movie_title].drop(labels=movie_title, errors="ignore")
    return scores.sort_values(ascending=False).head(max(top_n, 0))

# Demo: nearest neighbours of a title known to be in the dataset
print("Top 5 movies similar to 'Toy Story (1995)':",
      recommend_movies('Toy Story (1995)'),
      sep="\n")

Top 5 movies similar to 'Toy Story (1995)':
title
Toy Story 2 (1999)                           0.572601
Jurassic Park (1993)                         0.565637
Independence Day (a.k.a. ID4) (1996)         0.564262
Star Wars: Episode IV - A New Hope (1977)    0.557388
Forrest Gump (1994)                          0.547096
Name: Toy Story (1995), dtype: float64


In [38]:
# Show a random handful of titles so the user knows what can be searched for
def show_sample_movies(n=10):
    """Print ``n`` distinct movie titles drawn at random from ``ratings_matrix``.

    Titles are sampled without replacement from the matrix columns using
    NumPy's global random state.
    """
    print("Sample movie titles in the dataset:")
    sampled_titles = np.random.choice(ratings_matrix.columns, size=n, replace=False)
    print(sampled_titles)

# Can be re-run whenever another sample is wanted
show_sample_movies(n=10)

Sample movie titles in the dataset:
["Jonah Who Will Be 25 in the Year 2000 (Jonas qui aura 25 ans en l'an 2000) (1976)"
 'Futurama: The Beast with a Billion Backs (2008)' 'Octagon, The (1980)'
 'Morning Glory (2010)' 'Letter, The (1940)' 'Arthur (1981)'
 'Transporter, The (2002)' 'Boot, Das (Boat, The) (1981)'
 'Kill Command (2016)' 'Friends with Benefits (2011)']


In [51]:
# Enhanced movie recommendation with fuzzy matching

from difflib import get_close_matches

def recommend_movies_fuzzy(user_input, top_n=5, similarity=None):
    """Recommend movies for an approximately typed title.

    The input is matched against the known titles ignoring case and
    surrounding whitespace; the single closest title (difflib ratio of at
    least 0.6) is used as the query.

    Parameters
    ----------
    user_input : str
        Free-text title typed by the user.
    top_n : int, default 5
        Maximum number of recommendations. Values <= 0 give an empty result.
    similarity : pandas.DataFrame, optional
        Title-by-title similarity matrix to search. When omitted, the
        notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series or str
        Similarity scores indexed by title, highest first, never containing
        the matched title itself. If nothing is close enough to
        ``user_input``, a "no close match" message string is returned instead.
    """
    if similarity is None:
        similarity = similarity_df

    # get_close_matches is case-sensitive, so compare lower-cased strings and
    # keep a map back to the original spelling of each title.
    titles_by_lower = {}
    for title in similarity.columns:
        titles_by_lower.setdefault(str(title).lower(), title)

    query = str(user_input).strip().lower()
    matches = get_close_matches(query, list(titles_by_lower), n=1, cutoff=0.6)

    if not matches:
        return f"No close match found for '{user_input}'. Try a different title."

    best_match = titles_by_lower[matches[0]]
    print(f"Did you mean: '{best_match}'?\n")

    # Exclude the matched title by label rather than by position: another title
    # can tie with it at similarity 1.0, so it is not guaranteed to sort first.
    scores = similarity[best_match].drop(labels=best_match, errors="ignore")
    return scores.sort_values(ascending=False).head(max(top_n, 0))

# Interactive demo: ask for a title, then show its closest neighbours
user_input = input("Enter a movie title: ")
recommendations = recommend_movies_fuzzy(user_input)
print(f"\nTop recommended movies based on your input '{user_input}':",
      recommendations,
      sep="\n")

Enter a movie title:  Gremlins


Did you mean: 'Gremlins (1984)'?


Top recommended movies based on your input 'Gremlins':
title
RoboCop (1987)                      0.587486
Goonies, The (1985)                 0.542817
Lost Boys, The (1987)               0.517656
Gremlins 2: The New Batch (1990)    0.505242
Fly, The (1986)                     0.505007
Name: Gremlins (1984), dtype: float64
