In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv", low_memory=False)

# Initialize TF-IDF vectorizer for movie titles
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies['title'])

#high_ratings contains only the highest rated movies by all users
high_ratings = ratings[ratings['rating'] >= 4] 

#groups movies that are liked by a user
grouped_ratings = high_ratings.groupby('userId') 
liked_movies_by_user = grouped_ratings['movieId'].apply(list).reset_index()

#Value used for identifying which user we are doing the search for
target_user_id = 47

# Get movies liked by the target user
target_user_movies = set(liked_movies_by_user[liked_movies_by_user['userId'] == target_user_id]['movieId'].iloc[0])

# Calculate Jaccard similarity with other users
user_similarity = []
for index, row in liked_movies_by_user.iterrows():
    if row['userId'] != target_user_id:
        other_user_movies = set(row['movieId'])
        intersection = len(target_user_movies.intersection(other_user_movies))
        union = len(target_user_movies.union(other_user_movies))
        similarity = intersection / union
        user_similarity.append({'userId': row['userId'], 'similarity': similarity})

# Convert similarity results to DataFrame
user_similarity_df = pd.DataFrame(user_similarity)

# Sort DataFrame by similarity in descending order
top_similar_users = user_similarity_df.sort_values(by='similarity', ascending=False)

top_three_similar_users = top_similar_users.head(3)['userId'].tolist()

movie_ids = []
for user in top_three_similar_users:
    user_ratings = grouped_ratings.get_group(user)
    user_ratings = user_ratings['movieId'].tolist()
    movie_ids = movie_ids + user_ratings 

#remove duplicate titles
movie_ids = set(movie_ids)
movie_ids = list(movie_ids)

target_user_list = grouped_ratings.get_group(1)['movieId'].tolist()

#removes movies already watched by target user
for movie in movie_ids:
    if movie in target_user_list:
        movie_ids.remove(movie)


genre_preferences = {}
genre_list = []

# Iterate through the target user's movie list to create a genre preferences dictionary
for movie in target_user_movies:
    movie_row = movies[movies['movieId'] == movie]
    genres = movie_row['genres'].values[0]
    genre_list += genres.split('|')

for genre in genre_list:
    genre_preferences[genre] = genre_preferences.get(genre,0) + 1

# Calculate genre scores for each recommended movie
movie_genre_score = {}

for movie in movie_ids:
    movie_genre_score[movie] = 0
    genre_list = []
    movie_row = movies[movies['movieId'] == movie]
    genres = movie_row['genres'].values[0]
    genre_list += genres.split('|')
    for genre in genre_list:
        for key in genre_preferences:
            if genre == key:
                movie_genre_score[movie] = movie_genre_score[movie] + genre_preferences[key]

sorted_dict = dict(sorted(movie_genre_score.items(), key=lambda item: item[1], reverse=True))

#prints movie titles
for key in sorted_dict:
    movie_title = movies.loc[movies['movieId'] == key, 'title'].values
    print(movie_title)


['True Lies (1994)']
['Pulp Fiction (1994)']
['Aladdin (1992)']
['Clear and Present Danger (1994)']
['Dances with Wolves (1990)']
['Apollo 13 (1995)']
['Silence of the Lambs, The (1991)']
['Beauty and the Beast (1991)']
['Ace Ventura: Pet Detective (1994)']
['Fugitive, The (1993)']


NameError: name 'widgets' is not defined

        userId                                            movieId
0            1  [1, 110, 158, 260, 356, 596, 1036, 1066, 1210,...
1            2  [1, 6, 17, 21, 34, 36, 47, 50, 110, 141, 150, ...
2            3  [296, 318, 858, 2959, 3114, 3751, 4886, 6377, ...
3            4  [260, 318, 356, 595, 915, 2324, 2858, 4306, 56...
4            5  [47, 175, 257, 318, 319, 337, 527, 778, 1147, ...
...        ...                                                ...
324411  330971  [50, 150, 260, 296, 509, 527, 541, 593, 858, 9...
324412  330972  [3, 5, 6, 7, 10, 11, 17, 34, 47, 50, 62, 141, ...
324413  330973  [1777, 2959, 4149, 4226, 5267, 7149, 31685, 58...
324414  330974  [50, 110, 260, 318, 527, 587, 597, 780, 1193, ...
324415  330975  [69, 265, 431, 497, 912, 923, 1060, 1089, 1221...

[324416 rows x 2 columns]


## similar_users_recomendations = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"] #Returns moveis that similar users liked