In [48]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [49]:
# Ratings file: tab-separated; column names are supplied here because the
# call passes `names` instead of reading them from the file.
ratings = pd.read_csv(
    'ml-100k/u.data',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
    sep='\t',
)

# Movie metadata file: pipe-separated, decoded as Latin-1, no header row.
# The first five names describe the movie; the remaining 19 are presumably
# per-genre indicator columns (inferred from the names -- TODO confirm).
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    header=None,
    encoding='latin-1',
    names=[
        'movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url',
    ] + [
        'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
        'Western',
    ],
)


In [60]:
# Reshape the long ratings table into a user x item matrix. Pairs with no
# rating come out of the pivot as NaN and are replaced by 0, so 0 acts as the
# "not rated" marker that the recommender functions test for.
user_item_matrix = (
    ratings
    .pivot_table(values='rating', index='user_id', columns='item_id')
    .fillna(0)
)

In [70]:
def item_based_collaborative_filtering(user_id, N=10, use_matrix=None, n_neighbors=20):
    """Recommend up to N unrated items using item-based collaborative filtering.

    For every item the user has not rated, the predicted rating is the
    similarity-weighted mean of the user's own ratings over the (at most)
    ``n_neighbors`` most similar items the user *has* rated. Similarity is the
    cosine similarity between item columns of the user x item matrix, and only
    strictly positive similarities are used.

    Note that an unrated item with a single positively-similar rated neighbour
    is predicted to have exactly that neighbour's rating, so sparsely rated
    items can easily reach the top of the ranking.

    Parameters
    ----------
    user_id : label
        Row label of the user in the matrix.
    N : int, default 10
        Maximum number of item ids to return. ``N <= 0`` returns ``[]``.
    use_matrix : pandas.DataFrame, optional
        User x item matrix where 0 means "not rated". Defaults to the
        module-level ``user_item_matrix``.
    n_neighbors : int, default 20
        How many of the most similar rated items contribute to a prediction.

    Returns
    -------
    list
        Item ids ordered by descending predicted rating; empty if the user is
        unknown, has rated nothing, or has rated everything.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is smaller than 1.
    """
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be at least 1")

    matrix_to_use = use_matrix if use_matrix is not None else user_item_matrix

    # A negative N would otherwise slice "all but the last |N|" predictions.
    if N <= 0 or user_id not in matrix_to_use.index:
        return []

    user_ratings = matrix_to_use.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0]
    unrated_movies = user_ratings[user_ratings == 0].index

    if len(rated_movies) == 0 or len(unrated_movies) == 0:
        return []

    # Only similarities between unrated candidates and the user's rated items
    # are ever read, so compute just that (n_unrated x n_rated) block instead
    # of the full item x item matrix.
    candidate_vectors = matrix_to_use.loc[:, unrated_movies].to_numpy().T
    rated_vectors = matrix_to_use.loc[:, rated_movies.index].to_numpy().T
    similarity_to_rated = pd.DataFrame(
        cosine_similarity(candidate_vectors, rated_vectors),
        index=unrated_movies,
        columns=rated_movies.index,
    )

    predictions = []
    for movie, movie_similarities in similarity_to_rated.iterrows():
        positive_sims = movie_similarities[movie_similarities > 0]
        if len(positive_sims) == 0:
            continue

        top_similar = positive_sims.nlargest(n_neighbors)
        similarity_sum = top_similar.sum()
        if similarity_sum > 0:
            weighted_sum = (top_similar * rated_movies.loc[top_similar.index]).sum()
            predictions.append((movie, weighted_sum / similarity_sum))

    # list.sort is stable, so equally-scored items keep matrix column order.
    predictions.sort(key=lambda pair: pair[1], reverse=True)
    return [movie_id for movie_id, _ in predictions[:N]]

In [71]:
def matrix_factorization_svd(user_id, N=10, k=50, use_matrix=None):
    """Recommend up to N unrated items from a truncated-SVD reconstruction.

    Each user's row is centred on its mean (zeros, i.e. unrated entries, are
    included in that mean), the centred matrix is factorised with a rank-``k``
    truncated SVD, and the user's row is reconstructed and de-centred to
    obtain predicted scores. Items the user has not rated (stored as 0) are
    ranked by predicted score.

    Parameters
    ----------
    user_id : label
        Row label of the user in the matrix.
    N : int, default 10
        Maximum number of item ids to return. ``N <= 0`` returns ``[]``.
    k : int, default 50
        Number of latent factors. Capped at ``min(n_users, n_items) - 1``
        because the default ARPACK solver of ``svds`` requires ``k`` to be
        strictly smaller than the smallest matrix dimension.
    use_matrix : pandas.DataFrame, optional
        User x item matrix where 0 means "not rated". Defaults to the
        module-level ``user_item_matrix``.

    Returns
    -------
    list
        Item ids ordered by descending predicted score; empty when the user is
        not in the matrix (matching ``item_based_collaborative_filtering``).
    """
    matrix_to_use = use_matrix if use_matrix is not None else user_item_matrix

    # A negative N would otherwise make head(N) return "all but the last |N|".
    if N <= 0 or user_id not in matrix_to_use.index:
        return []

    R = matrix_to_use.to_numpy(dtype=float)

    user_ratings_mean = R.mean(axis=1, keepdims=True)
    R_normalized = R - user_ratings_mean

    # Keep the requested rank within what svds accepts for this matrix.
    k = max(1, min(k, min(R_normalized.shape) - 1))
    U, sigma, Vt = svds(R_normalized, k=k)

    # Only this user's row of U @ diag(sigma) @ Vt is needed, so reconstruct
    # just that row rather than the full users x items prediction matrix.
    # NOTE(review): assumes user ids are unique in the index, so that get_loc
    # returns a single integer position.
    user_pos = matrix_to_use.index.get_loc(user_id)
    user_scores = (U[user_pos] * sigma) @ Vt + user_ratings_mean[user_pos, 0]
    user_predictions = pd.Series(user_scores, index=matrix_to_use.columns)

    user_rated = matrix_to_use.loc[user_id]
    candidates = user_predictions[user_rated == 0]

    top_movies = candidates.sort_values(ascending=False).head(N)
    return top_movies.index.tolist()

In [72]:
def recommend_movies(user_id, N=10, method='svd'):
    """Return up to N ``(movie_id, title)`` recommendations for a user.

    ``method`` selects the recommender: ``'item_based'``, ``'hybrid'`` or
    ``'svd'``; any other value falls back to the SVD recommender. Users that
    are absent from ``user_item_matrix`` get a printed notice and an empty
    list. Recommended ids that have no row in ``movies`` are dropped.
    """
    if user_id not in user_item_matrix.index:
        print(f"User {user_id} not found in the dataset")
        return []

    # Pick the recommender first, call it once afterwards. Only the chosen
    # branch looks up its function name.
    if method == 'item_based':
        recommender = item_based_collaborative_filtering
    elif method == 'hybrid':
        recommender = hybrid_recommendation
    else:
        # Covers 'svd' as well as any unrecognised method name.
        recommender = matrix_factorization_svd

    recommendations = []
    for movie_id in recommender(user_id, N):
        matching_titles = movies.loc[movies['movie_id'] == movie_id, 'title'].values
        if len(matching_titles) > 0:
            recommendations.append((movie_id, matching_titles[0]))

    return recommendations

In [73]:
def prepare_train_test_split(test_size=0.2):
    """Split the global ``ratings`` table chronologically.

    Rows are ordered by ``timestamp``; the earliest ``1 - test_size`` fraction
    (row count truncated to an integer) becomes the training set and the
    remaining, most recent rows become the test set.

    Returns a ``(train_data, test_data)`` tuple of DataFrames.
    """
    chronological = ratings.sort_values('timestamp')
    n_train = int(len(chronological) * (1 - test_size))
    return chronological.iloc[:n_train], chronological.iloc[n_train:]

In [74]:
def precision_at_k(user_id, recommended_movies, test_data, k=10):
    """Precision@k: share of the k recommendation slots that hit a liked item.

    An item is relevant when the user rated it 4 or higher in ``test_data``.
    The hit count is always divided by ``k``, even when fewer than ``k``
    recommendations are supplied. Returns 0 if the user has no relevant
    test items.
    """
    is_relevant = (test_data['user_id'] == user_id) & (test_data['rating'] >= 4)
    liked_items = set(test_data.loc[is_relevant, 'item_id'])

    if not liked_items:
        return 0

    n_hits = len(liked_items & set(recommended_movies[:k]))
    return n_hits / k

In [75]:
def recall_at_k(user_id, recommended_movies, test_data, k=10):
    """Recall@k: share of the user's liked test items found in the top k.

    An item is relevant when the user rated it 4 or higher in ``test_data``.
    The denominator is the number of relevant test rows for the user.
    Returns 0 if the user has no relevant test items.
    """
    user_rows = test_data[test_data['user_id'] == user_id]
    relevant = user_rows.loc[user_rows['rating'] >= 4, 'item_id'].tolist()

    if not relevant:
        return 0

    found = set(relevant).intersection(recommended_movies[:k])
    return len(found) / len(relevant)


In [76]:
def ndcg_at_k(user_id, recommended_movies, test_data, k=10):
    """NDCG@k of a recommendation list against the user's test ratings.

    Gain for a recommended item is ``2**rating - 1`` using the user's rating
    in ``test_data`` (items the user did not rate there contribute nothing),
    discounted by ``log2(position + 1)`` with positions starting at 1. The
    ideal DCG uses the user's k highest test ratings. Returns 0 when the user
    has no test ratings or the ideal DCG is 0.
    """
    held_out = test_data[test_data['user_id'] == user_id]
    if len(held_out) == 0:
        return 0

    ratings_seen = held_out['rating'].values

    # Map each item to its rating; if an item appears more than once the
    # first row wins.
    rating_of = {}
    for item, rel in zip(held_out['item_id'].values, ratings_seen):
        rating_of.setdefault(item, rel)

    dcg = sum(
        (2**rating_of[movie_id] - 1) / np.log2(position + 2)
        for position, movie_id in enumerate(recommended_movies[:k])
        if movie_id in rating_of
    )

    best_first = sorted(ratings_seen, reverse=True)[:k]
    idcg = sum(
        (2**rel - 1) / np.log2(position + 2)
        for position, rel in enumerate(best_first)
    )

    return 0 if idcg == 0 else dcg / idcg

In [None]:
def evaluate_model(method='svd', k=10, num_users=100):
    """Evaluate a recommender on a chronological hold-out split.

    A user x item matrix is rebuilt from the training rows only and padded
    with zeros to the users and items of ``user_item_matrix``. The first
    ``num_users`` users (in order of appearance in the test set) that have at
    least one training rating and at least one test rating of 4 or higher are
    scored. Precision@k, Recall@k and NDCG@k are averaged over the users for
    which the recommender returned a non-empty list.

    ``method`` is ``'item_based'``, ``'hybrid'`` or ``'svd'``; any other value
    falls back to SVD. Progress and results are printed.

    Returns a dict with keys ``'precision'``, ``'recall'`` and ``'ndcg'``
    (all 0 when no user could be evaluated).
    """
    train_data, test_data = prepare_train_test_split()

    train_matrix = (
        train_data
        .pivot_table(index='user_id', columns='item_id', values='rating')
        .fillna(0)
        .reindex(index=user_item_matrix.index,
                 columns=user_item_matrix.columns,
                 fill_value=0)
    )

    def has_training_history(uid):
        return uid in train_matrix.index and train_matrix.loc[uid].sum() > 0

    users_with_liked_items = set(test_data.loc[test_data['rating'] >= 4, 'user_id'])

    test_users_with_ratings = [
        uid
        for uid in test_data['user_id'].unique()
        if has_training_history(uid) and uid in users_with_liked_items
    ][:num_users]

    print(f"\nEvaluating {method} method on {len(test_users_with_ratings)} users...")

    def recommend_for(uid):
        # Recommender names are resolved here, i.e. only when a user is
        # actually scored with that method.
        if method == 'item_based':
            return item_based_collaborative_filtering(uid, k, use_matrix=train_matrix)
        if method == 'hybrid':
            return hybrid_recommendation(uid, k, use_matrix=train_matrix)
        return matrix_factorization_svd(uid, k, use_matrix=train_matrix)

    precision_scores, recall_scores, ndcg_scores = [], [], []
    for uid in test_users_with_ratings:
        movie_ids = recommend_for(uid)
        if len(movie_ids) == 0:
            continue
        precision_scores.append(precision_at_k(uid, movie_ids, test_data, k))
        recall_scores.append(recall_at_k(uid, movie_ids, test_data, k))
        ndcg_scores.append(ndcg_at_k(uid, movie_ids, test_data, k))

    evaluated_count = len(precision_scores)
    if evaluated_count == 0:
        print(f"WARNING: No recommendations generated for {method}")
        return {'precision': 0, 'recall': 0, 'ndcg': 0}

    print(f"Successfully evaluated {evaluated_count} users")
    print(f"\n{method.upper()} Results:")
    print(f"Precision@{k}: {np.mean(precision_scores):.4f}")
    print(f"Recall@{k}: {np.mean(recall_scores):.4f}")
    print(f"NDCG@{k}: {np.mean(ndcg_scores):.4f}")

    return {
        'precision': np.mean(precision_scores),
        'recall': np.mean(recall_scores),
        'ndcg': np.mean(ndcg_scores)
    }

In [77]:
# Demo: print top-10 recommendations for one user with each recommender, then
# evaluate both recommenders on the hold-out split and print a comparison.
test_user = 196

print(f"\nRecommendations for User {test_user}:\n")

# Item-based collaborative filtering recommendations (numbered from 1)
print("1. Item-Based Collaborative Filtering:")
recs_cf = recommend_movies(test_user, N=10, method='item_based')
for idx, (movie_id, title) in enumerate(recs_cf, 1):
    print(f"   {idx}. {title}")

# Truncated-SVD recommendations (numbered from 1)
print("\n2. Matrix Factorization (SVD):")
recs_svd = recommend_movies(test_user, N=10, method='svd')
for idx, (movie_id, title) in enumerate(recs_svd, 1):
    print(f"   {idx}. {title}")

print("MODEL EVALUATION")

# Evaluate the item-based and SVD recommenders on up to 50 users each;
# the optional hybrid method is not evaluated here.
results_cf = evaluate_model(method='item_based', k=10, num_users=50)
results_svd = evaluate_model(method='svd', k=10, num_users=50)

# Side-by-side comparison table. The "@10" in the header is hard-coded and
# matches the k=10 passed to evaluate_model above.

print("COMPARISON SUMMARY")

print(f"\n{'Method':<20} {'Precision@10':<15} {'Recall@10':<15} {'NDCG@10':<15}")
print("-"*65)
print(f"{'Item-Based CF':<20} {results_cf['precision']:<15.4f} {results_cf['recall']:<15.4f} {results_cf['ndcg']:<15.4f}")
print(f"{'SVD':<20} {results_svd['precision']:<15.4f} {results_svd['recall']:<15.4f} {results_svd['ndcg']:<15.4f}")



Recommendations for User 196:

1. Item-Based Collaborative Filtering:
   1. Very Natural Thing, A (1974)
   2. Walk in the Sun, A (1945)
   3. New York Cop (1996)
   4. Death in Brunswick (1991)
   5. Delta of Venus (1994)
   6. Sunchaser, The (1996)
   7. War at Home, The (1996)
   8. Dingo (1992)
   9. Body Snatchers (1993)
   10. Blue Sky (1994)

2. Matrix Factorization (SVD):
   1. When Harry Met Sally... (1989)
   2. Back to the Future (1985)
   3. Grease (1978)
   4. Dave (1993)
   5. Sleepless in Seattle (1993)
   6. Postino, Il (1994)
   7. Sabrina (1995)
   8. M*A*S*H (1970)
   9. Clueless (1995)
   10. Ulee's Gold (1997)
MODEL EVALUATION

Evaluating item_based method on 50 users...
Successfully evaluated 50 users

ITEM_BASED Results:
Precision@10: 0.0380
Recall@10: 0.0209
NDCG@10: 0.0367

Evaluating svd method on 50 users...
Successfully evaluated 50 users

SVD Results:
Precision@10: 0.1220
Recall@10: 0.1150
NDCG@10: 0.1578
COMPARISON SUMMARY

Method               Precision@