In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

In [2]:
df = pd.read_csv('dataset.csv')
movie_similarity_based_on_users_rating = pd.read_csv('movie_similarity_based_on_users_rating.csv', index_col=0)
movie_similarity_based_on_objects = pd.read_csv('movie_similarity_based_on_objects.csv', index_col=0)


In [3]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81730 entries, 0 to 81729
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     81730 non-null  int64  
 1   movieId    81730 non-null  int64  
 2   rating     81730 non-null  float64
 3   timestamp  81730 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 2.5 MB


In [5]:
movie_similarity = movie_similarity_based_on_users_rating

In [6]:
# Ensure movie IDs in similarity matrix are integers
movie_similarity.index = movie_similarity.index.astype(int)
movie_similarity.columns = movie_similarity.columns.astype(int)

In [7]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


In [59]:
def predict_rating(user_id, movie_id, train_data, similarity_matrix, k=10):
    # Get the movies rated by the user
    user_ratings = train_data[train_data['userId'] == user_id]
    
    # If the user hasn't rated any movies, return a default rating (e.g., the global mean)
    if user_ratings.empty:
        return train_data['rating'].mean()

    # Check if the movie_id is in the similarity matrix
    if movie_id not in similarity_matrix.index:
        print(f"Movie ID {movie_id} not found in similarity matrix. Returning global mean.")
        return train_data['rating'].mean()
    
    # Get the similarity scores for the target movie with all other movies
    similar_movies = similarity_matrix[movie_id].dropna()
    
    # Merge with the user's ratings
    similar_movies = similar_movies.reset_index()
    similar_movies.columns = ['movieId', 'similarity']
    user_ratings = user_ratings.merge(similar_movies, on='movieId')
    
    if user_ratings.empty:
        print(f"No similar movies found for Movie ID {movie_id}. Returning global mean.")
        return train_data['rating'].mean()

    # Get the top k similar movies
    user_ratings = user_ratings.sort_values(by='similarity', ascending=False)
    user_ratings = user_ratings.head(k)
    
    # Calculate the weighted average of the ratings
    weighted_sum = sum(user_ratings['rating'] * user_ratings['similarity'])
    sum_of_weights = sum(user_ratings['similarity'])
    if sum_of_weights == 0:
        return train_data['rating'].mean()
    
    predicted_rating = weighted_sum / sum_of_weights
    return predicted_rating

# Predict ratings for the test set
test_data['predicted_rating'] = test_data.apply(
    lambda row: predict_rating(row['userId'], row['movieId'], train_data, movie_similarity), axis=1
)


In [60]:
# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))
print(f'RMSE: {rmse}')

# Assuming you have the actual and predicted ratings in test_data
mae = mean_absolute_error(test_data['rating'], test_data['predicted_rating'])
print(f'MAE: {mae}')



RMSE: 0.8825343817722014
MAE: 0.6675272384099467


In [50]:
movie_similarity = movie_similarity_based_on_objects

In [51]:
# Ensure movie IDs in similarity matrix are integers
movie_similarity.index = movie_similarity.index.astype(int)
movie_similarity.columns = movie_similarity.columns.astype(int)

In [52]:
def predict_rating(user_id, movie_id, train_data, similarity_matrix, k=10):
    # Get the movies rated by the user
    user_ratings = train_data[train_data['userId'] == user_id]
    
    # If the user hasn't rated any movies, return a default rating (e.g., the global mean)
    if user_ratings.empty:
        return train_data['rating'].mean()

    # Check if the movie_id is in the similarity matrix
    if movie_id not in similarity_matrix.index:
        print(f"Movie ID {movie_id} not found in similarity matrix. Returning global mean.")
        return train_data['rating'].mean()
    
    # Get the similarity scores for the target movie with all other movies
    similar_movies = similarity_matrix[movie_id].dropna()
    
    # Merge with the user's ratings
    similar_movies = similar_movies.reset_index()
    similar_movies.columns = ['movieId', 'similarity']
    user_ratings = user_ratings.merge(similar_movies, on='movieId')
    
    if user_ratings.empty:
        print(f"No similar movies found for Movie ID {movie_id}. Returning global mean.")
        return train_data['rating'].mean()

    # Get the top k similar movies
    user_ratings = user_ratings.sort_values(by='similarity', ascending=False)
    user_ratings = user_ratings.head(k)
    
    # Calculate the weighted average of the ratings
    weighted_sum = sum(user_ratings['rating'] * user_ratings['similarity'])
    sum_of_weights = sum(user_ratings['similarity'])
    if sum_of_weights == 0:
        return train_data['rating'].mean()
    
    predicted_rating = weighted_sum / sum_of_weights
    return predicted_rating

# Predict ratings for the test set
test_data['predicted_rating'] = test_data.apply(
    lambda row: predict_rating(row['userId'], row['movieId'], train_data, movie_similarity), axis=1
)


In [55]:
# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))
print(f'RMSE: {rmse}')

# Assuming you have the actual and predicted ratings in test_data
mae = mean_absolute_error(test_data['rating'], test_data['predicted_rating'])
print(f'MAE: {mae}')


RMSE: 1.1065603582559367
MAE: 0.7867073165288622


In [8]:
movie_similarity = movie_similarity_based_on_users_rating

In [9]:
# Ensure movie IDs in similarity matrix are integers
movie_similarity.index = movie_similarity.index.astype(int)
movie_similarity.columns = movie_similarity.columns.astype(int)

In [10]:
import pandas as pd
from tqdm import tqdm
import concurrent.futures

def precision_at_k(recommended_items, relevant_items, k):
    # Ensure only top K recommendations are considered
    recommended_items_at_k = recommended_items[:k]
    
    # Calculate the number of relevant items in the top K recommendations
    num_relevant_items = sum(1 for item in recommended_items_at_k if item in relevant_items)
    
    # Calculate precision@K
    precision = num_relevant_items / k
    return precision

def get_top_k_recommendations(user_id, user_train_data, similarity_matrix, k):
    user_rated_movies = user_train_data['movieId'].values
    movie_scores = {}

    for movie_id in user_rated_movies:
        if movie_id in similarity_matrix.index:
            similar_movies = similarity_matrix[movie_id].dropna()
            for similar_movie_id, similarity in similar_movies.items():
                if similar_movie_id not in user_rated_movies:
                    if similar_movie_id not in movie_scores:
                        movie_scores[similar_movie_id] = 0
                    movie_scores[similar_movie_id] += similarity
    
    # Sort the movies based on the scores and get the top K
    sorted_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)
    top_k_recommendations = [movie_id for movie_id, score in sorted_movies[:k]]
    return top_k_recommendations

def evaluate_precision_for_user(user_id, test_data, train_data, similarity_matrix, k):
    user_test_data = test_data[test_data['userId'] == user_id]
    user_relevant_items = set(user_test_data['movieId'])
    
    # Generate top K recommendations for this user based on the similarity matrix
    user_train_data = train_data[train_data['userId'] == user_id]
    recommended_items = get_top_k_recommendations(user_id, user_train_data, similarity_matrix, k)
    
    # Calculate precision@K for this user
    precision = precision_at_k(recommended_items, user_relevant_items, k)
    return precision

def evaluate_precision_at_k(test_data, train_data, similarity_matrix, k=10, num_workers=50):
    user_ids = test_data['userId'].unique()
    precision_scores = []

    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(evaluate_precision_for_user, user_id, test_data, train_data, similarity_matrix, k): user_id for user_id in user_ids}
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(user_ids), desc="Evaluating Precision@K"):
            try:
                precision_scores.append(future.result())
            except Exception as e:
                print(f"Error processing user {futures[future]}: {e}")
    
    # Calculate mean precision@K
    mean_precision_at_k = sum(precision_scores) / len(precision_scores)
    return mean_precision_at_k

# Example usage
precision_k = evaluate_precision_at_k(test_data, train_data, movie_similarity, k=10, num_workers=60)
print(f'Precision@10: {precision_k}')


Evaluating Precision@K: 100%|██████████| 668/668 [10:45<00:00,  1.03it/s]


Precision@10: 0.3742514970059872


In [17]:
movie_similarity = movie_similarity_based_on_objects

In [18]:
# Ensure movie IDs in similarity matrix are integers
movie_similarity.index = movie_similarity.index.astype(int)
movie_similarity.columns = movie_similarity.columns.astype(int)

In [19]:
import pandas as pd
from tqdm import tqdm
import concurrent.futures

def precision_at_k(recommended_items, relevant_items, k):
    # Ensure only top K recommendations are considered
    recommended_items_at_k = recommended_items[:k]
    
    # Calculate the number of relevant items in the top K recommendations
    num_relevant_items = sum(1 for item in recommended_items_at_k if item in relevant_items)
    
    # Calculate precision@K
    precision = num_relevant_items / k
    return precision

def get_top_k_recommendations(user_id, user_train_data, similarity_matrix, k):
    user_rated_movies = user_train_data['movieId'].values
    movie_scores = {}

    for movie_id in user_rated_movies:
        if movie_id in similarity_matrix.index:
            similar_movies = similarity_matrix[movie_id].dropna()
            for similar_movie_id, similarity in similar_movies.items():
                if similar_movie_id not in user_rated_movies:
                    if similar_movie_id not in movie_scores:
                        movie_scores[similar_movie_id] = 0
                    movie_scores[similar_movie_id] += similarity
    
    # Sort the movies based on the scores and get the top K
    sorted_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)
    top_k_recommendations = [movie_id for movie_id, score in sorted_movies[:k]]
    return top_k_recommendations

def evaluate_precision_for_user(user_id, test_data, train_data, similarity_matrix, k):
    user_test_data = test_data[test_data['userId'] == user_id]
    user_relevant_items = set(user_test_data['movieId'])
    
    # Generate top K recommendations for this user based on the similarity matrix
    user_train_data = train_data[train_data['userId'] == user_id]
    recommended_items = get_top_k_recommendations(user_id, user_train_data, similarity_matrix, k)
    
    # Calculate precision@K for this user
    precision = precision_at_k(recommended_items, user_relevant_items, k)
    return precision

def evaluate_precision_at_k(test_data, train_data, similarity_matrix, k=10, num_workers=50):
    user_ids = test_data['userId'].unique()
    precision_scores = []

    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(evaluate_precision_for_user, user_id, test_data, train_data, similarity_matrix, k): user_id for user_id in user_ids}
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(user_ids), desc="Evaluating Precision@K"):
            try:
                precision_scores.append(future.result())
            except Exception as e:
                print(f"Error processing user {futures[future]}: {e}")
    
    # Calculate mean precision@K
    mean_precision_at_k = sum(precision_scores) / len(precision_scores)
    return mean_precision_at_k

# Example usage
precision_k = evaluate_precision_at_k(test_data, train_data, movie_similarity, k=10, num_workers=60)
print(f'Precision@10: {precision_k}')


Evaluating Precision@K: 100%|██████████| 668/668 [12:56<00:00,  1.16s/it]


Precision@10: 0.005089820359281439


In [None]:
import sounddevice as sd

def play_sound():
    sample_rate = 44100  # Set the sample rate in Hz
    duration = 5  # Set the duration of the sound in seconds

    # Generate a simple sine wave signal
    t = np.linspace(0, duration, int(sample_rate * duration), False)
    frequency = 440  # Set the frequency of the sine wave in Hz
    wave = 0.5 * np.sin(2 * np.pi * frequency * t)

    sd.play(wave, sample_rate)
    sd.wait()

play_sound()