In [2]:
import numpy as np 
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from collections import Counter


In [3]:
# Read the raw table from 'new.csv', resolved against the notebook's working directory.
df = pd.read_csv("new.csv")

In [4]:
# Keep just these four columns; every other column of the loaded frame is dropped.
df = df.loc[:, ['userId', 'movieId', 'rating', 'tags']]

In [5]:
# Step 1: collapse repeated (userId, movieId) rows into one row holding their mean rating
df_agg = df.groupby(['userId', 'movieId'], as_index=False)['rating'].mean()

# Step 2: pivot into a userId x movieId table; pairs without a rating are filled with 0
user_item_matrix = (
    df_agg
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

# Step 3: cosine similarity between movies (transpose so that each row is one movie)
item_similarity = cosine_similarity(user_item_matrix.transpose())

In [6]:
# Step 4: brute-force cosine nearest-neighbour index over movies
# (the matrix is transposed so that each fitted sample is one movie's rating vector)
knn = NearestNeighbors(algorithm='brute', metric='cosine')
knn.fit(user_item_matrix.transpose())

In [7]:
def get_knn_recommendations(movie_id, knn_model, user_item_matrix, n_neighbors=20):
    """Return the ids of the movies nearest to ``movie_id`` according to ``knn_model``.

    Args:
        movie_id: Column label of the query movie in ``user_item_matrix``.
        knn_model: Fitted neighbour model exposing
            ``kneighbors(X, n_neighbors=...) -> (distances, indices)``.
            The returned indices are interpreted as column positions of
            ``user_item_matrix``.
        user_item_matrix: DataFrame whose columns are movie ids; the column of
            ``movie_id`` is used as the query vector.
        n_neighbors: Maximum number of neighbours to return.

    Returns:
        An empty list when ``movie_id`` is not a column of ``user_item_matrix``;
        otherwise a pandas Index with at most ``n_neighbors`` movie ids, in the
        order reported by the model, never containing ``movie_id`` itself.
    """
    if movie_id not in user_item_matrix.columns:
        return []
    movie_index = user_item_matrix.columns.get_loc(movie_id)

    # Ask for one extra neighbour because the query movie is normally among its
    # own hits, but never for more samples than exist: kneighbors raises when
    # n_neighbors exceeds the number of fitted samples.
    n_items = getattr(knn_model, 'n_samples_fit_', user_item_matrix.shape[1])
    n_query = min(n_neighbors + 1, n_items)

    # Slice the movie's column directly instead of transposing the whole matrix
    # on every call; the resulting vector is the same.
    movie_vector = user_item_matrix.iloc[:, movie_index].to_numpy().reshape(1, -1)
    _, indices = knn_model.kneighbors(movie_vector, n_neighbors=n_query)

    # Drop the query movie by position in the matrix, not by assuming it is the
    # first hit: with tied distances it can appear anywhere in the result, or
    # not at all.
    neighbor_indices = indices.flatten()
    neighbor_indices = neighbor_indices[neighbor_indices != movie_index][:n_neighbors]
    return user_item_matrix.columns[neighbor_indices]

In [8]:
from sklearn.metrics import precision_score, recall_score
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score, classification_report
import time

def evaluate_recommendations(df, knn_model, user_item_matrix, n_neighbors=30, verbose=True):
    """Score the KNN recommender on a random 80/20 hold-out split of ``df``.

    For every user in the training part, the neighbours of each movie the user
    rated are pooled and counted, and the ``n_neighbors`` most frequent ones
    become that user's recommendation set. Every test row whose user has a
    recommendation set is then labelled relevant (rating >= 3.5) and predicted
    (movie is in the user's set), and the two label lists are scored.

    Args:
        df: DataFrame with 'userId', 'movieId' and 'rating' columns.
        knn_model: Neighbour model, forwarded unchanged to
            ``get_knn_recommendations``.
        user_item_matrix: Forwarded unchanged to ``get_knn_recommendations``.
        n_neighbors: Used both as the number of neighbours fetched per movie and
            as the size limit of each user's recommendation set.
        verbose: When true (the default), print each user id and the time spent
            on that user. Pass False to silence the per-user output.

    Returns:
        Tuple ``(precision, recall, accuracy, f1, classification_rep,
        test_user_movie_pairs, user_recommendations)`` where
        ``test_user_movie_pairs`` is the array of test rows and
        ``user_recommendations`` maps user id -> set of recommended movie ids.

    Notes:
        ``knn_model`` is used as passed in and is not refit on the training
        part; if it was fit on data that includes the test rows, the metrics
        are optimistic.
        A later cell defines another function with this same name; whichever
        definition was executed last is the one in effect.
    """
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    test_user_movie_pairs = test_df[['userId', 'movieId', 'rating']].values

    # The neighbours of a movie do not depend on the user, so each movie is
    # looked up once and reused for every user who rated it.
    movie_neighbors = {}

    user_recommendations = {}
    for user_id in train_df['userId'].unique():
        if verbose:
            print(user_id)
        start_time = time.perf_counter()

        user_movies = train_df.loc[train_df['userId'] == user_id, 'movieId'].unique()
        candidates = []
        for user_movie_id in user_movies:
            if user_movie_id not in movie_neighbors:
                movie_neighbors[user_movie_id] = get_knn_recommendations(
                    user_movie_id, knn_model, user_item_matrix, n_neighbors=n_neighbors
                )
            candidates.extend(movie_neighbors[user_movie_id])

        # Rank candidates by how many of the user's movies they neighbour,
        # then keep the top n_neighbors as a set for fast membership tests.
        ranked = [movie for movie, _ in Counter(candidates).most_common()]
        user_recommendations[user_id] = set(ranked[:n_neighbors])

        if verbose:
            elapsed_time = time.perf_counter() - start_time
            print("Time elapsed:", elapsed_time, "seconds")

    y_true = []
    y_pred = []
    for user_id, movie_id, rating in test_user_movie_pairs:
        # Users that only occur in the test part have no recommendations to score.
        if user_id not in user_recommendations:
            continue
        top_recommended_movies = user_recommendations[user_id]
        # Relevance comes from the actual rating; the prediction is set membership.
        y_true.append(1 if rating >= 3.5 else 0)
        y_pred.append(1 if movie_id in top_recommended_movies else 0)

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    classification_rep = classification_report(y_true, y_pred, zero_division=0)
    return precision, recall, accuracy, f1, classification_rep, test_user_movie_pairs, user_recommendations



In [9]:
import multiprocessing as mp

def get_user_recommendations(args):
    """Build the recommendation set for one user (worker for ``Pool.map``).

    Args:
        args: Tuple ``(user_id, train_df, knn_model, user_item_matrix,
            n_neighbors)``. Everything is packed into a single tuple because
            ``Pool.map`` hands each worker call exactly one argument.

    Returns:
        Tuple ``(user_id, recommended)`` where ``recommended`` is a set holding
        at most ``n_neighbors`` movie ids: the movies that most often appear
        among the neighbours of the movies this user has in ``train_df``.
    """
    user_id, train_df, knn_model, user_item_matrix, n_neighbors = args
    user_movies = train_df.loc[train_df['userId'] == user_id, 'movieId'].unique()

    candidates = []
    for user_movie_id in user_movies:
        candidates.extend(
            get_knn_recommendations(user_movie_id, knn_model, user_item_matrix, n_neighbors=n_neighbors)
        )

    # Rank candidates by how many of the user's movies they neighbour.
    ranked = [movie for movie, _ in Counter(candidates).most_common()]
    return user_id, set(ranked[:n_neighbors])


# The worker was originally defined under this misspelled name while the caller
# used the correct spelling (a NameError on a fresh kernel). The old name is
# kept as an alias so existing references keep working.
get_user_recommendataions = get_user_recommendations


def evaluate_recommendations(df, knn_model, user_item_matrix, n_neighbors=30, n_processes=mp.cpu_count()):
    """Score the KNN recommender on a random 80/20 hold-out split, in parallel.

    One recommendation set per training user is built by
    ``get_user_recommendations``; every test row whose user has a set is then
    labelled relevant (rating >= 3.5) and predicted (movie is in the user's
    set), and the two label lists are scored.

    Args:
        df: DataFrame with 'userId', 'movieId' and 'rating' columns.
        knn_model: Neighbour model, forwarded unchanged to the worker.
        user_item_matrix: Forwarded unchanged to the worker.
        n_neighbors: Used both as the number of neighbours fetched per movie and
            as the size limit of each user's recommendation set.
        n_processes: Number of worker processes. ``1`` runs everything in the
            current process without creating a pool.

    Returns:
        Tuple ``(precision, recall, accuracy, f1, classification_rep,
        test_user_movie_pairs, user_recommendations)``.

    Notes:
        ``knn_model`` is used as passed in and is not refit on the training
        part; if it was fit on data that includes the test rows, the metrics
        are optimistic.
        NOTE(review): with the 'spawn' start method, worker processes must be
        able to import the worker function; a function defined in a notebook
        cell may not be importable there. Confirm on the target platform, or
        pass ``n_processes=1``.
    """
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    test_user_movie_pairs = test_df[['userId', 'movieId', 'rating']].values

    args = [(user_id, train_df, knn_model, user_item_matrix, n_neighbors) for user_id in train_df['userId'].unique()]
    if n_processes == 1:
        # Serial path: same results, no process start-up or argument pickling.
        user_recommendations = dict(map(get_user_recommendations, args))
    else:
        # The context manager tears the pool down even when a worker raises,
        # so no worker processes are left behind.
        with mp.Pool(processes=n_processes) as pool:
            user_recommendations = dict(pool.map(get_user_recommendations, args))

    y_true = []
    y_pred = []
    for user_id, movie_id, rating in test_user_movie_pairs:
        # Users that only occur in the test part have no recommendations to score.
        if user_id not in user_recommendations:
            continue
        top_recommended_movies = user_recommendations[user_id]
        y_true.append(1 if rating >= 3.5 else 0)
        y_pred.append(1 if movie_id in top_recommended_movies else 0)

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    classification_rep = classification_report(y_true, y_pred, zero_division=0)

    return precision, recall, accuracy, f1, classification_rep, test_user_movie_pairs, user_recommendations

In [10]:
# Run the evaluation with 150 neighbours; df, knn and user_item_matrix come from the cells above.
# NOTE(review): df is the raw frame, while an earlier cell builds df_agg to merge duplicate
# (userId, movieId) rows. Confirm that splitting the raw frame is intended.
(
    precision,
    recall,
    accuracy,
    f1,
    classification_rep,
    test_user_movie_pairs,
    user_recommendations,
) = evaluate_recommendations(df, knn, user_item_matrix, n_neighbors=150)

# One line per scalar metric, four decimals each.
print(
    f"Recommendation system precision: {precision:.4f}",
    f"Recommendation system recall: {recall:.4f}",
    f"Recommendation system Accuracy: {accuracy:.4f}",
    f"Recommendation system F1: {f1:.4f}",
    sep="\n",
)

# The per-class report is multi-line text, so it starts on its own line.
print("Recommendation system classification_rep: \n", classification_rep)


Recommendation system precision: 0.7249
Recommendation system recall: 0.4363
Recommendation system Accuracy: 0.5497
Recommendation system F1: 0.5447
Recommendation system classification_rep: 
               precision    recall  f1-score   support

           0       0.45      0.73      0.55      7637
           1       0.72      0.44      0.54     12327

    accuracy                           0.55     19964
   macro avg       0.59      0.58      0.55     19964
weighted avg       0.62      0.55      0.55     19964



In [25]:
# Recompute the metrics from test_user_movie_pairs and user_recommendations
# without re-running the evaluation itself.
y_true, y_pred = [], []
for user_id, movie_id, rating in test_user_movie_pairs:
    # Only rows whose user has a recommendation set are scored.
    if user_id in user_recommendations:
        top_recommended_movies = user_recommendations[user_id]
        # Relevant: actual rating of at least 3.5. Predicted: movie is in the user's set.
        y_true.append(int(rating >= 3.5))
        y_pred.append(int(movie_id in top_recommended_movies))

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
classification_rep = classification_report(y_true, y_pred, zero_division=0)

# One line per scalar metric, four decimals each.
print(
    f"Recommendation system precision: {precision:.4f}",
    f"Recommendation system recall: {recall:.4f}",
    f"Recommendation system Accuracy: {accuracy:.4f}",
    f"Recommendation system F1: {f1:.4f}",
    sep="\n",
)

# The per-class report is multi-line text, so it starts on its own line.
print("Recommendation system classification_rep: \n", classification_rep)

Recommendation system precision: 0.7249
Recommendation system recall: 0.4363
Recommendation system Accuracy: 0.5497
Recommendation system F1: 0.5447
Recommendation system classification_rep: 
               precision    recall  f1-score   support

           0       0.45      0.73      0.55      7637
           1       0.72      0.44      0.54     12327

    accuracy                           0.55     19964
   macro avg       0.59      0.58      0.55     19964
weighted avg       0.62      0.55      0.55     19964



In [13]:
import pickle


In [20]:
# Persist the per-user recommendation sets (binary pickle file)
with open('user_recommendations_150neighbours.pkl', 'wb') as out_file:
    pickle.dump(user_recommendations, out_file)

# Persist the array of test rows next to it (binary pickle file)
with open('test_user_movie_pairs.pkl', 'wb') as out_file:
    pickle.dump(test_user_movie_pairs, out_file)