In [1]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np

In [12]:
#Block One: First Filtering
#reading the CSV files, change them on your personal computer to wherever you saved them
rate = pd.read_csv(r'ml-latest/ratings.csv')
movies = pd.read_csv(r'ml-latest/movies.csv')
ratings = pd.read_csv(r'ml-latest/ratings.csv')
tags = pd.read_csv(r'ml-latest/tags.csv')
userID = ratings
#dropping timestamp as it's unnecessary
tags = tags.drop(columns='timestamp')
ratings = ratings.drop(columns='timestamp') 

#Groups on userId and movieId and groups tags into a list
tags = tags.groupby(['movieId']).agg({'tag':set}).reset_index()

#Refining ratings to just be movieId and the average rating to only 1 decimal point
ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
ratings = ratings.rename(columns={'rating':'average_rating'})
ratings['average_rating'] = ratings['average_rating'].round(1)
filtered_data = pd.merge(movies,ratings, on='movieId')
filtered_data = pd.merge(filtered_data, tags, on='movieId')

In [13]:
#Block Two: Unique Genres and Second Filtering
# Split genres by '|' and get all unique genres
unique_genres = set()
for genres in movies['genres']:
    unique_genres.update(genres.split('|'))
unique_genres = sorted(unique_genres)  # Sort in alphabetic order
print(len(unique_genres))
for genre in unique_genres:
    filtered_data[genre] = filtered_data['genres'].apply(lambda x: 1 if genre in x.split('|') else 0)
#filter out In Netflix Queue
filtered_data['tag'] = filtered_data['tag'].apply(lambda tag_set: {tag for tag in tag_set if tag != "In Netflix queue"})
# print(filtered_data.head())

# Drop the original 'genres' column (I think it's a good idea, no need to clutter the data yk?)
filtered_data= filtered_data.drop(columns=['genres'])

20


In [14]:
#Block Three: Vectorization
#I changed the vector size to 10 just cause it's smaller but if you wanna increase it by all means go for it
wordVector = Word2Vec(filtered_data['tag'].tolist(),vector_size=10, window=5, min_count=1, workers=4)

vector_size = 10  # Same size as Word2Vec vectors
padding_vector = np.zeros(vector_size)
# Function to generate a vector for up to 3 tags
def create_feature_vector(row):
    # Get word vectors for the tags, up to 3 tags, and pad if fewer
    tag_vectors = [
        wordVector.wv[tag] if tag in wordVector.wv else padding_vector
        for tag in list(row['tag'])[:3] # inlude if needed
    ]
    while len(tag_vectors) < 3:  # Pad with zero vectors if fewer than 3 tags
        tag_vectors.append(padding_vector)
    
    # Flatten the tag vectors (3 vectors of size 10 each -> 30 elements)
    tag_vector = np.concatenate(tag_vectors)
    
    # Add genre one-hot encoding
    genre_vector = row[unique_genres].values  # One-hot encoded genres
    
    # Combine tag vector and genre vector
    feature_vector = np.concatenate([tag_vector, genre_vector])
    return feature_vector

#wrote to file to see feature data
# filtered_data.to_csv('filtered_movies.csv',index=False)

In [17]:
movies_df = pd.read_csv('ml-latest/movies.csv')
tags_df = pd.read_csv('ml-latest/tags.csv')
tags_set = tags_df.groupby(['movieId']).agg({'tag':set}).reset_index()

"""      Genres_vector     """
# Initialize the genre_vector list
genre_vector = []

# Iterate over each movie's genres
for _, row in movies_df.iterrows():
    # Create a one-hot vector for the current movie
    genres = row['genres'].split('|')
    vector = [1 if genre in genres else 0 for genre in unique_genres]
    genre_vector.append(vector)

# Convert to a numpy array for further use
genre_vector = np.array(genre_vector)

# Add the genre_vector to the dataframe for reference (optional)
movies_df['genre_vector'] = list(genre_vector)
print(f"\n ------This is genre-vec len: {len(movies_df['genre_vector'][0])} --------- \n")



 ------This is genre-vec len: 20 --------- 



In [18]:
"""       Tags_vector      """
# Merge tags with movies, ensuring all movies are included
movies_df = movies_df.merge(tags_set, how='left', on='movieId')

# Replace NaN tags with an empty set
movies_df['tag'] = movies_df['tag'].apply(lambda x: x if isinstance(x, set) else set())

# Check unique data types in 'tag' for debugging (optional)
# print(movies_df['tag'].apply(type).value_counts())

# Preprocess tags: Split, clean, and ensure consistency
def preprocess_tags(tag_set):
    """
    Process a set of tags by stripping and converting them to lowercase.
    Handles empty sets gracefully.
    """
    if isinstance(tag_set, set):
        return [str(tag).strip().lower() for tag in tag_set if isinstance(tag, str)]
    return []

movies_df['tag_list'] = movies_df['tag'].apply(preprocess_tags)

# 3. Train word2vec model
# Combine all tag lists into one list of lists
tag_corpus = movies_df['tag_list'].tolist()

# Train a word2vec model
word2vec_model = Word2Vec(tag_corpus, vector_size=20, window=5, min_count=1, workers=4)

# 4. Create movie vectors by averaging tag vectors
def get_movie_vector(tags):
    tag_vectors = [word2vec_model.wv[tag] for tag in tags if tag in word2vec_model.wv]
    return np.mean(tag_vectors, axis=0) if tag_vectors else np.zeros(word2vec_model.vector_size)

movies_df['tag_vector'] = movies_df['tag_list'].apply(get_movie_vector)
print(f"\n ------This is movie-vec len: {len(movies_df['tag_vector'][0])} --------- \n")

# 5. Drop unnecessary columns if needed
# movies_df = movies_df.drop(columns=['tags', 'tag_list'])

print(movies_df.head())


 ------This is movie-vec len: 20 --------- 

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                                        genre_vector  \
0  [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...   
1  [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...   
2  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3  [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...   
4  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

               

In [20]:
filtered_data_copy = filtered_data.copy()

# What we used before
# filtered_data['feature_vector'] = filtered_data.apply(create_feature_vector, axis=1)

# What we are using now
movies_df['feature_vector'] = movies_df.apply(
    lambda row: np.concatenate([row['tag_vector'], row['genre_vector']]),
    axis=1
)

filtered_data1 = filtered_data_copy.merge(
    movies_df[['movieId', 'feature_vector']],  # Select only necessary columns
    on='movieId',  # Merge on the 'movieId' column
    how='left'  # Use a left join to keep all rows in filtered_data_df
)

#Okay so I mixed the tag vector and the genre vectors into one feature vector
#I am going to drop the original tag vector and the genres
filtered_data = filtered_data.drop(columns=list(unique_genres))
filtered_data = filtered_data.drop(columns='tag')

filtered_data1 = filtered_data1.drop(columns=list(unique_genres))
filtered_data1 = filtered_data1.drop(columns='tag')

print(len(filtered_data1['feature_vector'][6]))
print(filtered_data1.head())


40
   movieId                               title  average_rating  \
0        1                    Toy Story (1995)             3.9   
1        2                      Jumanji (1995)             3.3   
2        3             Grumpier Old Men (1995)             3.2   
3        4            Waiting to Exhale (1995)             2.9   
4        5  Father of the Bride Part II (1995)             3.1   

                                      feature_vector  
0  [-0.04382392764091492, -0.10386305302381516, -...  
1  [-0.027821924537420273, -0.17097659409046173, ...  
2  [-0.29177185893058777, 0.08404678851366043, 0....  
3  [-1.1505054235458374, 0.8926494717597961, -0.0...  
4  [-0.4348922371864319, 0.49144214391708374, -0....  


In [22]:
#Query is the movie we want to isolate
#Cosine Similarities
def knn_recommendation_cos(query,train_data, user_watched_movies, k=5):
    #Extract ratings and feature vectors from training set
    #Ratings are optional if you don't want them, I have them here to use as bias
    train_features = np.array(train_data['feature_vector'].tolist())
    train_ratings = train_data['average_rating'].values#Optional

    #kNN using cosine_similarity
    similarities = cosine_similarity([query],train_features).flatten()

    #Optional Bias
    weighted_similarities = similarities * train_ratings

    # #get the indicies of nearest movies (5)
    # kNN_indices = np.argsort(weighted_similarities)[-k:][::-1] #Sorted by weighted similariy in descending order

    # #Get the movies from the index
    # kNN_movies = train_data.iloc[kNN_indices]
        # Get the indices of nearest movies (more than k initially)
    kNN_indices = np.argsort(weighted_similarities)[-k*2:][::-1]  # Get twice the number of recommendations (more than needed)

    # Get the movies from the index
    kNN_movies = train_data.iloc[kNN_indices]

    # Remove the movies the user has already watched
    kNN_movies_filtered = kNN_movies[~kNN_movies['movieId'].isin(user_watched_movies)]

    # If we have fewer than k movies after filtering, fetch more from the remaining pool
    if len(kNN_movies_filtered) < k:
        # Find additional movies (not already recommended) by getting the remaining top recommendations
        remaining_movies = kNN_movies[~kNN_movies['movieId'].isin(kNN_movies_filtered['movieId'])]
        additional_movies_needed = k - len(kNN_movies_filtered)
        additional_movies = remaining_movies.head(additional_movies_needed)
        
        # Combine the filtered list and additional recommendations
        kNN_movies_filtered = pd.concat([kNN_movies_filtered, additional_movies])

    return kNN_movies_filtered.head(k)  # Ensure we return exactly k recommendations
    # return kNN_movies

In [23]:
#if you want to compare cosine to euclidean distance
def knn_recommendation_eucl(query,train_data,k=5):
    #Extract ratings and feature vectors from training set
    #Ratings are optional if you don't want them, I have them here to use as bias
    train_features = np.array(train_data['feature_vector'].tolist())
    train_ratings = train_data['average_rating'].values#Optional

    #kNN using euclidean distances
    distances = euclidean_distances([query],train_features).flatten()

    #Optional Bias
    weighted_distances= distances / train_ratings

    #get the indicies of nearest movies (5)
    kNN_indices = np.argsort(weighted_distances)[:k] #Sorted by weighted distances ascending order

    #Get the movies from the index
    kNN_movies = train_data.iloc[kNN_indices]
    return kNN_movies

In [24]:
def test_knn(test_data,train_data,user_watched_movies, k=5):
    # Dictionary to store movie frequencies
    movie_frequency = {}

    for _,row in test_data.iterrows():
        query = np.array(row['feature_vector'])
        knn_movies = knn_recommendation_cos(query,train_data,user_watched_movies,k)
        # print(f"Current Movie searched: {row['title']}")
        # print("Recommended Movies: ")
        # print(knn_movies[['title','average_rating']])
        # print()

        # Update the frequency dictionary
        for _, movie_row in knn_movies.iterrows():
            movie_title = movie_row['title']
            movie_id = movie_row['movieId']
            if movie_title in movie_frequency:
                movie_frequency[movie_title][0] += 1
            else:
                movie_frequency[movie_title] = [1, movie_id]

    # Sort the movie_frequency dictionary by frequency (the first element of the list)
    sorted_dict = dict(sorted(movie_frequency.items(), key=lambda item: item[1][0], reverse=True))

    # Get the movie with the maximum frequency
    max_key = max(movie_frequency, key=lambda k: movie_frequency[k][0])  # Get the movie with the highest frequency
    max_value = movie_frequency[max_key]  # The value is a list: [frequency, movie_id]

    print(f"Maximum value: {max_value[0]} with Key: {max_key} and Movie ID: {max_value[1]}")

    # Print the top 5 recommended movies
    count = 0
    movieIds = []
    for movie, freq in sorted_dict.items():
        print(f"{movie}: Frequency = {freq[0]}, Movie ID = {freq[1]}")  # Display frequency and Movie ID
        movieIds.append(freq[1])
        if count == 5:
            break
        count += 1
        

    
    return movieIds

In [25]:
def calculate_cosine_similarity(movie_id_1, movie_id_2, train_data):
    """
    Calculate the cosine similarity between two movies based on their feature vectors.

    """

    # Retrieve the feature vectors for the two movies
    movie_1_vector = train_data.loc[train_data['movieId'] == movie_id_1, 'feature_vector'].values[0]
    # print(movie_1_vector)
    movie_2_vector = train_data.loc[train_data['movieId'] == movie_id_2, 'feature_vector'].values[0]

    # Reshape the vectors to 2D arrays (required for cosine_similarity)
    movie_1_vector = movie_1_vector.reshape(1, -1)
    movie_2_vector = movie_2_vector.reshape(1, -1)

    # Calculate cosine similarity
    similarity = cosine_similarity(movie_1_vector, movie_2_vector)[0][0]

    return similarity

In [26]:
def my_train_test_split(df, filt_data, n=7):
    # Create empty lists to store train and test data
    train_list = []
    test_list = []

    # Group by 'userID' to process each user individually
    
    merged_df = pd.merge(df, filt_data, on='movieId')
    grouped = merged_df.groupby('userId')

    for user, group in grouped:
        # Sort movies by rating in descending order
        sorted_group = group.sort_values(by='rating', ascending=False)
        
        # Select the top n rated movies for testing
        test = sorted_group.head(n)
        
        # Use the rest for training
        train = sorted_group.iloc[n:]
        
        # Append to respective lists
        test_list.append(test)
        train_list.append(train)

    # Combine all train and test splits into DataFrames
    train_data = pd.concat(train_list).reset_index(drop=True)
    test_data = pd.concat(test_list).reset_index(drop=True)

    print("Training Data:")
    print(train_data)
    print("\nTesting Data:")
    print(test_data)
    print(type(train_data))

    return train_data, test_data

In [34]:
# train_data, test_data = my_train_test_split(rate, filtered_data,n=0)
train_data1, test_data1 = my_train_test_split(rate, filtered_data1,n=5)

# print(f"traing_data query{len(train_data['feature_vector'][0])}")

Training Data:
          userId  movieId  rating   timestamp  \
0              1     4995     5.0  1225734583   
1              1     7153     5.0  1225735149   
2              1     1291     5.0  1225734809   
3              1     8533     5.0  1225737239   
4              1     3578     5.0  1225735309   
...          ...      ...     ...         ...   
32099713  330975      160     0.5  1091583193   
32099714  330975     2792     0.5  1091582197   
32099715  330975     2953     0.5  1091582192   
32099716  330975     1681     0.5  1091582912   
32099717  330975      688     0.5  1091582787   

                                                      title  average_rating  \
0                                  Beautiful Mind, A (2001)             4.0   
1         Lord of the Rings: The Return of the King, The...             4.1   
2                 Indiana Jones and the Last Crusade (1989)             4.0   
3                                      Notebook, The (2004)             3.7   
4

In [35]:
# Filter the dataframe for that specific user
# print(type(train_data))
# print(train_data)
user_id = 10
# user_data = train_data[train_data['userId'] == user_id]
user_data1 = train_data1[train_data1['userId'] == user_id]
# user_data1 = test_data[test_data['userId'] == 2]
# print(user_data)
# print(user_data1)

# test_knn()

In [36]:
user_watched_movies = user_data1['movieId'].to_numpy()
# print(user_watched_movies)
# you can use either filtered_data1(new vect) or filtered_data(old vect)
movieIDs1 = test_knn(user_data1, filtered_data1, user_watched_movies, k=5)

print("\n ---------- Next old Filter --------- \n")

# movieIDs = test_knn(user_data, filtered_data, user_watched_movies, k=5)

Maximum value: 70 with Key: In Her Line of Fire (2006) and Movie ID: 159053
In Her Line of Fire (2006): Frequency = 70, Movie ID = 159053
WWE: The Triumph and Tragedy of World Class Championship Wrestling (2007): Frequency = 69, Movie ID = 270306
Beyond Words (2018): Frequency = 56, Movie ID = 275847
The Blair Thumb (2002): Frequency = 54, Movie ID = 176719
Hercules vs. the Giant Warriors (1964): Frequency = 45, Movie ID = 148126
Churuli (2021): Frequency = 43, Movie ID = 280184

 ---------- Next old Filter --------- 



In [None]:
user_test_movies = user_data1['movieId'].to_numpy()
print(len(user_test_movies))
arr = []

for x in movieIDs1:
    arr = []
    for y in user_test_movies:
        z = calculate_cosine_similarity(x, y, filtered_data1)
        arr.append(z)

    print(f"Recommended movie: {filtered_data1.loc[filtered_data1['movieId'] == x].iloc[0]['title']}: ID({x}), Similarity:{np.mean(arr)}")

157
Recommended movie: In Her Line of Fire (2006): ID(159053), Similarity:0.843565092856572
Recommended movie: WWE: The Triumph and Tragedy of World Class Championship Wrestling (2007): ID(270306), Similarity:0.8452535784023585
Recommended movie: Beyond Words (2018): ID(275847), Similarity:0.8459261775768931
Recommended movie: The Blair Thumb (2002): ID(176719), Similarity:0.8471252797125722
Recommended movie: Hercules vs. the Giant Warriors (1964): ID(148126), Similarity:0.8382088325348092
Recommended movie: Churuli (2021): ID(280184), Similarity:0.837247686966817


In [None]:
user_test_movies = user_data['movieId'].to_numpy()
print(len(user_test_movies))
arr = []

for x in movieIDs:
    for y in user_test_movies:
        z = calculate_cosine_similarity(x, y, filtered_data)
        arr.append(z)

    print(f"Recommended movie: {x} Similarity:{np.mean(arr)}")

47
Recommended movie: 6201 Similarity:0.3923577293777858
Recommended movie: 7023 Similarity:0.4350625505718232
Recommended movie: 898 Similarity:0.4487964942409145
Recommended movie: 1235 Similarity:0.45537037418930904
Recommended movie: 1277 Similarity:0.45984977370461944
Recommended movie: 5404 Similarity:0.44869870955087665


In [78]:
user_data2 = test_data1[test_data1['userId'] == 10]['movieId'].to_numpy()
arr = []

for x in movieIDs1:
    arr = []
    for y in user_data2:
        z = calculate_cosine_similarity(x, y, filtered_data1)
        arr.append(z)

    print(f"Recommended movie: {filtered_data1.loc[filtered_data1['movieId'] == x].iloc[0]['title']}: ID({x}), Similarity:{np.mean(arr)}")

print(f"unrelatedMovie ID 589: {calculate_cosine_similarity(273,258, filtered_data1)}")

Recommended movie: In Her Line of Fire (2006): ID(159053), Similarity:0.8752660259925227
Recommended movie: WWE: The Triumph and Tragedy of World Class Championship Wrestling (2007): ID(270306), Similarity:0.8727028158096571
Recommended movie: Beyond Words (2018): ID(275847), Similarity:0.8449013872839813
Recommended movie: The Blair Thumb (2002): ID(176719), Similarity:0.824579934833095
Recommended movie: Hercules vs. the Giant Warriors (1964): ID(148126), Similarity:0.755566746481805
Recommended movie: Churuli (2021): ID(280184), Similarity:0.8450899589814576
unrelatedMovie ID 589: 0.6844338876287877
