In [69]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np

In [70]:
#Block One: First Filtering
#reading the CSV files, change them on your personal computer to wherever you saved them
rate = pd.read_csv(r'ml-latest-small/ratings.csv')
movies = pd.read_csv(r'ml-latest-small/movies.csv')
ratings = pd.read_csv(r'ml-latest-small/ratings.csv')
tags = pd.read_csv(r'ml-latest-small/tags.csv')
userID = ratings
#dropping timestamp as it's unnecessary
tags = tags.drop(columns='timestamp')
ratings = ratings.drop(columns='timestamp') 

#Groups on userId and movieId and groups tags into a list
tags = tags.groupby(['movieId']).agg({'tag':set}).reset_index()

#Refining ratings to just be movieId and the average rating to only 1 decimal point
ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
ratings = ratings.rename(columns={'rating':'average_rating'})
ratings['average_rating'] = ratings['average_rating'].round(1)
filtered_data = pd.merge(movies,ratings, on='movieId')
filtered_data = pd.merge(filtered_data, tags, on='movieId')

In [71]:
#Block Two: Unique Genres and Second Filtering
# Split genres by '|' and get all unique genres
unique_genres = set()
for genres in movies['genres']:
    unique_genres.update(genres.split('|'))
unique_genres = sorted(unique_genres)  # Sort in alphabetic order
for genre in unique_genres:
    filtered_data[genre] = filtered_data['genres'].apply(lambda x: 1 if genre in x.split('|') else 0)
#filter out In Netflix Queue
filtered_data['tag'] = filtered_data['tag'].apply(lambda tag_set: {tag for tag in tag_set if tag != "In Netflix queue"})

# Drop the original 'genres' column (I think it's a good idea, no need to clutter the data yk?)
filtered_data= filtered_data.drop(columns=['genres'])

In [None]:
#Block Three: Vectorization
#I changed the vector size to 10 just cause it's smaller but if you wanna increase it by all means go for it
wordVector = Word2Vec(filtered_data['tag'].tolist(),vector_size=10, window=5, min_count=1, workers=4)

vector_size = 10  # Same size as Word2Vec vectors
padding_vector = np.zeros(vector_size)
# Function to generate a vector for up to 3 tags
def create_feature_vector(row):
    # Get word vectors for the tags, up to 3 tags, and pad if fewer
    tag_vectors = [
        wordVector.wv[tag] if tag in wordVector.wv else padding_vector
        for tag in list(row['tag'])[:3]
    ]
    while len(tag_vectors) < 3:  # Pad with zero vectors if fewer than 3 tags
        tag_vectors.append(padding_vector)
    
    # Flatten the tag vectors (3 vectors of size 10 each -> 30 elements)
    tag_vector = np.concatenate(tag_vectors)
    
    # Add genre one-hot encoding
    genre_vector = row[unique_genres].values  # One-hot encoded genres
    
    # Combine tag vector and genre vector
    feature_vector = np.concatenate([tag_vector, genre_vector])
    return feature_vector
filtered_data['feature_vector'] = filtered_data.apply(create_feature_vector, axis=1)

#Okay so I mixed the tag vector and the genre vectors into one feature vector
#I am going to drop the original tag vector and the genres
filtered_data = filtered_data.drop(columns=list(unique_genres))
filtered_data = filtered_data.drop(columns='tag')

#wrote to file to see feature data
# filtered_data.to_csv('filtered_movies.csv',index=False)

In [7]:
#Block 4: The actual recommendation

#TODO: implement the test and training sets split

In [159]:
#Query is the movie we want to isolate
#Cosine Similarities
def knn_recommendation_cos(query,train_data, user_watched_movies, k=5):
    #Extract ratings and feature vectors from training set
    #Ratings are optional if you don't want them, I have them here to use as bias
    train_features = np.array(train_data['feature_vector'].tolist())
    train_ratings = train_data['average_rating'].values#Optional

    #kNN using cosine_similarity
    similarities = cosine_similarity([query],train_features).flatten()

    #Optional Bias
    weighted_similarities = similarities * train_ratings

    # #get the indicies of nearest movies (5)
    # kNN_indices = np.argsort(weighted_similarities)[-k:][::-1] #Sorted by weighted similariy in descending order

    # #Get the movies from the index
    # kNN_movies = train_data.iloc[kNN_indices]
        # Get the indices of nearest movies (more than k initially)
    kNN_indices = np.argsort(weighted_similarities)[-k*2:][::-1]  # Get twice the number of recommendations (more than needed)

    # Get the movies from the index
    kNN_movies = train_data.iloc[kNN_indices]

    # Remove the movies the user has already watched
    kNN_movies_filtered = kNN_movies[~kNN_movies['movieId'].isin(user_watched_movies)]

    # If we have fewer than k movies after filtering, fetch more from the remaining pool
    if len(kNN_movies_filtered) < k:
        # Find additional movies (not already recommended) by getting the remaining top recommendations
        remaining_movies = kNN_movies[~kNN_movies['movieId'].isin(kNN_movies_filtered['movieId'])]
        additional_movies_needed = k - len(kNN_movies_filtered)
        additional_movies = remaining_movies.head(additional_movies_needed)
        
        # Combine the filtered list and additional recommendations
        kNN_movies_filtered = pd.concat([kNN_movies_filtered, additional_movies])

    return kNN_movies_filtered.head(k)  # Ensure we return exactly k recommendations
    # return kNN_movies

In [9]:
#if you want to compare cosine to euclidean distance
def knn_recommendation_eucl(query,train_data,k=5):
    #Extract ratings and feature vectors from training set
    #Ratings are optional if you don't want them, I have them here to use as bias
    train_features = np.array(train_data['feature_vector'].tolist())
    train_ratings = train_data['average_rating'].values#Optional

    #kNN using euclidean distances
    distances = euclidean_distances([query],train_features).flatten()

    #Optional Bias
    weighted_distances= distances / train_ratings

    #get the indicies of nearest movies (5)
    kNN_indices = np.argsort(weighted_distances)[:k] #Sorted by weighted distances ascending order

    #Get the movies from the index
    kNN_movies = train_data.iloc[kNN_indices]
    return kNN_movies

In [None]:
def test_knn(test_data,train_data,user_watched_movies, k=5):
    # Dictionary to store movie frequencies
    movie_frequency = {}

    for _,row in test_data.iterrows():
        query = np.array(row['feature_vector'])
        knn_movies = knn_recommendation_cos(query,train_data,user_watched_movies,k)
        # print(f"Current Movie searched: {row['title']}")
        # print("Recommended Movies: ")
        # print(knn_movies[['title','average_rating']])
        # print()

        # Update the frequency dictionary
        for _, movie_row in knn_movies.iterrows():
            movie_title = movie_row['title']
            movie_id = movie_row['movieId']
            if movie_title in movie_frequency:
                movie_frequency[movie_title][0] += 1
            else:
                movie_frequency[movie_title] = [1, movie_id]

    # Sort the movie_frequency dictionary by frequency (the first element of the list)
    sorted_dict = dict(sorted(movie_frequency.items(), key=lambda item: item[1][0], reverse=True))

    # Get the movie with the maximum frequency
    max_key = max(movie_frequency, key=lambda k: movie_frequency[k][0])  # Get the movie with the highest frequency
    max_value = movie_frequency[max_key]  # The value is a list: [frequency, movie_id]

    print(f"Maximum value: {max_value[0]} with Key: {max_key} and Movie ID: {max_value[1]}")

    # Print the top 5 recommended movies
    count = 0
    movieIds = []
    for movie, freq in sorted_dict.items():
        print(f"{movie}: Frequency = {freq[0]}, Movie ID = {freq[1]}")  # Display frequency and Movie ID
        movieIds.append(freq[1])
        if count == 5:
            break
        count += 1
        

    
    return movieIds

In [None]:
def calculate_cosine_similarity(movie_id_1, movie_id_2, train_data):
    """
    Calculate the cosine similarity between two movies based on their feature vectors.

    """

    # Retrieve the feature vectors for the two movies
    movie_1_vector = train_data.loc[train_data['movieId'] == movie_id_1, 'feature_vector'].values[0]
    movie_2_vector = train_data.loc[train_data['movieId'] == movie_id_2, 'feature_vector'].values[0]

    # Reshape the vectors to 2D arrays (required for cosine_similarity)
    movie_1_vector = movie_1_vector.reshape(1, -1)
    movie_2_vector = movie_2_vector.reshape(1, -1)

    # Calculate cosine similarity
    similarity = cosine_similarity(movie_1_vector, movie_2_vector)[0][0]

    return similarity

In [None]:
def my_train_test_split(df, n=7):
    # Create empty lists to store train and test data
    train_list = []
    test_list = []

    # Group by 'userID' to process each user individually
    
    merged_df = pd.merge(df, filtered_data, on='movieId')
    grouped = merged_df.groupby('userId')

    for user, group in grouped:
        # Sort movies by rating in descending order
        sorted_group = group.sort_values(by='rating', ascending=False)
        
        # Select the top 3 rated movies for testing
        test = sorted_group.head(n)
        
        # Use the rest for training
        train = sorted_group.iloc[n:]
        
        # Append to respective lists
        test_list.append(test)
        train_list.append(train)

    # Combine all train and test splits into DataFrames
    train_data = pd.concat(train_list).reset_index(drop=True)
    test_data = pd.concat(test_list).reset_index(drop=True)

    print("Training Data:")
    print(train_data)
    print("\nTesting Data:")
    print(test_data)
    print(type(train_data))

    return train_data, test_data

In [232]:
train_data, test_data = my_train_test_split(rate, n=0)

Training Data:
       userId  movieId  rating   timestamp  \
0           1     1240     5.0   964983723   
1           1     2139     5.0   964982791   
2           1     2115     5.0   964982529   
3           1     2078     5.0   964982838   
4           1     2058     5.0   964982400   
...       ...      ...     ...         ...   
48282     610    96861     2.0  1493850474   
48283     610    69526     2.0  1493846153   
48284     610     6541     1.5  1493845480   
48285     610   120635     1.0  1493850489   
48286     610    68319     1.0  1493845505   

                                                   title  average_rating  \
0                                 Terminator, The (1984)             3.9   
1                             Secret of NIMH, The (1982)             3.5   
2            Indiana Jones and the Temple of Doom (1984)             3.6   
3                                Jungle Book, The (1967)             3.8   
4                                 Negotiator, The (1

In [268]:
# Filter the dataframe for that specific user
# print(type(train_data))
# print(train_data)
user_data = train_data[train_data['userId'] == 11]
user_data1 = test_data[test_data['userId'] == 11]
# print(user_data)
print(user_data1)

# test_knn()

Empty DataFrame
Columns: [userId, movieId, rating, timestamp, title, average_rating, feature_vector]
Index: []


In [269]:
user_watched_movies = user_data['movieId'].to_numpy()
# print(user_watched_movies)
movieIDs = test_knn(user_data, filtered_data, user_watched_movies, k=5)

Maximum value: 5 with Key: Matrix, The (1999) and Movie ID: 2571
Matrix, The (1999): Frequency = 5, Movie ID = 2571
Blade Runner (1982): Frequency = 4, Movie ID = 541
Star Trek: First Contact (1996): Frequency = 4, Movie ID = 1356
North by Northwest (1959): Frequency = 4, Movie ID = 908
Woman Under the Influence, A (1974): Frequency = 3, Movie ID = 7071
Two Family House (2000): Frequency = 3, Movie ID = 3951


In [270]:
user_test_movies = user_data['movieId'].to_numpy()

arr = []

for x in movieIDs:
    for y in user_test_movies:
        z = calculate_cosine_similarity(x, y, filtered_data)
        arr.append(z)

    print(f"Recommended movie: {x} Similarity:{np.mean(arr)}")

Recommended movie: 2571 Similarity:0.4040010396691586
Recommended movie: 541 Similarity:0.4048810815297864
Recommended movie: 1356 Similarity:0.41319433638300096
Recommended movie: 908 Similarity:0.4180905888616713
Recommended movie: 7071 Similarity:0.3902671511958666
Recommended movie: 3951 Similarity:0.3717181927519968
