In [10]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [11]:
#Load dataset
url = "https://raw.githubusercontent.com/KBudge/lumaa-spring-2025-ai-ml/refs/heads/main/Top100IMDBMovies.csv"
movies = pd.read_csv(url)

#Only keep relevant columns and index the rows by the 'rank' column
movies = movies[['rank', 'title', 'description', 'genre']].set_index('rank')

#Get rid of [] and '' in the genre column.
#regex=False is passed explicitly: "[" and "]" are regex metacharacters and the
#default value of `regex` is not the same in every pandas version.
movies['genre'] = (
    movies['genre']
    .str.replace("[", "", regex=False)
    .str.replace("'", "", regex=False)
    .str.replace("]", "", regex=False)
)

#Combine title, description and genre columns into new column.
#Missing values are treated as empty strings so that one missing field does not
#turn the whole combined text into NaN.
movies['title_description_genre'] = (
    movies['title'].fillna("")
    + " " + movies['description'].fillna("")
    + " " + movies['genre'].fillna("")
)

#Strip periods and commas from the combined text ("." is also a regex
#metacharacter, hence regex=False again)
movies['title_description_genre'] = (
    movies['title_description_genre']
    .str.replace(".", "", regex=False)
    .str.replace(",", "", regex=False)
)

In [12]:
#Prompt the user for a free-text description of the movie they are after
prompt_text = "Enter what you are looking for in a movie: "
user_input = input(prompt_text)

#Function to vectorize and calculate cosine similarity and recommend the top_n movies
def recommendations(user_input, movies, top_n=5):
    """Rank movies by TF-IDF cosine similarity to a free-text query.

    Parameters
    ----------
    user_input : str
        Free-text description of what the user wants to watch.
    movies : pandas.DataFrame
        Must contain the columns 'title', 'description' and
        'title_description_genre'. The frame is not modified.
    top_n : int, default 5
        Number of best matches to return (used as the stop of a slice).

    Returns
    -------
    pandas.DataFrame
        The 'title' and 'description' columns of the best matches plus a
        'score' column with the cosine similarity, ordered from most to
        least similar. Movies with equal scores keep their order in `movies`.
    """
    #Missing text is treated as an empty document instead of crashing the vectorizer
    corpus = movies['title_description_genre'].fillna("")

    #Vectorization with English stop words. TfidfVectorizer lowercases both the
    #fitted documents and the transformed query by default (lowercase=True), so
    #no manual lowercasing or DataFrame copy is needed.
    vec = TfidfVectorizer(stop_words='english')

    #Fit transform on the title_description_genre column from dataset
    tfidf_matrix = vec.fit_transform(corpus)

    #Transform user input using the vocabulary learned from the dataset
    user_tfidf = vec.transform([user_input])

    #Calculate cosine similarity between user input and every movie
    cos_sim = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    #Stable sort on the negated scores: highest similarity first, and ties
    #(including the all-zero "nothing matched" case) keep the dataset order
    #instead of coming out reversed/arbitrary.
    top_idx = np.argsort(-cos_sim, kind="stable")[:top_n]

    #Return the top matches with title, description and similarity scores
    return movies.iloc[top_idx][['title', 'description']].assign(score=cos_sim[top_idx])



#Run the recommender on the user's query
top_matches = recommendations(user_input, movies)

#Show each recommended movie with its similarity score and description
print("\nTop 5 movie recommendations based on your input:\n")
matched_rows = zip(top_matches['score'], top_matches['title'], top_matches['description'])
for similarity, movie_title, movie_description in matched_rows:
    print(f"Similarity Score: {similarity:.4f} - {movie_title}\n   Description: {movie_description}\n")

Enter what you are looking for in a movie: I love thrilling action movies set in space, with a comedic twist.

Top 5 movie recommendations based on your input:

Similarity Score: 0.2120 - Aliens
   Description: Fifty-seven years after surviving an apocalyptic attack aboard her space vessel by merciless space creatures, Officer Ripley awakens from hyper-sleep and tries to warn anyone who will listen about the predators.

Similarity Score: 0.1746 - The Lord of the Rings: The Fellowship of the Ring
   Description: A meek Hobbit from the Shire and eight companions set out on a journey to destroy the powerful One Ring and save Middle-earth from the Dark Lord Sauron.

Similarity Score: 0.1295 - Interstellar
   Description: A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.

Similarity Score: 0.1074 - WALL·E
   Description: In the distant future, a small waste-collecting robot inadvertently embarks on a space journey that will ultimately decide