In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD, NMF as SklearnNMF
from sklearn.preprocessing import Normalizer
from surprise import SVD, Dataset, Reader
from surprise import KNNBasic, NMF as SurpriseNMF
from surprise.model_selection import cross_validate

In [14]:
# 1. Load and preprocess data
drive.mount('/content/drive')
ratings = pd.read_csv('/content/drive/MyDrive/ratings_small.csv')
movies = pd.read_csv('/content/drive/MyDrive/tmdb_5000_movies.csv')
movies.rename(columns={'id': 'movieId'}, inplace=True)
ratings_movies = pd.merge(ratings, movies[['movieId', 'title']], on='movieId')
data = ratings_movies.pivot_table('rating', index='userId', columns='title').fillna(0)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# 2. SVD-based recommendation function (modified to output only movie titles)
def svd_recommendations(user_id, ratings_df, movies_df, top_n=10):
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
    svd = SVD()
    trainset = data.build_full_trainset()
    svd.fit(trainset)

    # Generate predictions for all movies the user hasn't rated yet
    user_ratings = []
    for movie_id in ratings_df['movieId'].unique():
        pred = svd.predict(user_id, movie_id)
        user_ratings.append((movie_id, pred.est))

    # Sort movies by predicted rating in descending order and select the top_n recommendations
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_movie_predictions = user_ratings[:top_n]

    # Extract only the movie titles
    recommended_titles = [
        movies_df[movies_df['movieId'] == movie_id]['title'].values[0]
        for movie_id, rating in top_movie_predictions
        if not movies_df[movies_df['movieId'] == movie_id].empty
    ]

    return recommended_titles

In [16]:
# 3. KNN-based recommendation function
user_movie_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
knn = NearestNeighbors(n_neighbors=6, algorithm='auto')
knn.fit(user_movie_matrix.T)

def knn_recommendations(movie_id, user_movie_matrix, movies_df, num_recommendations=5):
    knn = NearestNeighbors(n_neighbors=num_recommendations+1, algorithm='auto')
    knn.fit(user_movie_matrix.T)
    movie_idx = user_movie_matrix.columns.get_loc(movie_id)
    distances, indices = knn.kneighbors(user_movie_matrix.iloc[:, movie_idx].values.reshape(1, -1), n_neighbors=num_recommendations+1)
    recommended_movie_ids = user_movie_matrix.columns[indices.flatten()][1:]
    recommended_titles = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]['title'].values

    return recommended_titles


In [17]:
# 4. NMF-based recommendation function
sklearn_nmf = SklearnNMF(n_components=20, init='random', random_state=42)
nmf_matrix = sklearn_nmf.fit_transform(user_movie_matrix)
nmf_matrix_normalized = Normalizer().fit_transform(nmf_matrix)

def nmf_recommendations(movie_id, user_movie_matrix, movies_df, num_recommendations=5):
    # Normalize the NMF matrix
    nmf_matrix_normalized = Normalizer().fit_transform(sklearn_nmf.fit_transform(user_movie_matrix))

    # Get the index of the movie
    movie_idx = user_movie_matrix.columns.get_loc(movie_id)

    # Extract the movie vector from the NMF matrix
    movie_vector = nmf_matrix_normalized[movie_idx, :]

    # Compute similarity scores with all movies
    similarity_scores = np.dot(nmf_matrix_normalized, movie_vector)

    # Get the indices of the most similar movies
    recommended_idx = similarity_scores.argsort()[-num_recommendations-1:-1]

    # Get the recommended movie ids and titles
    recommended_movie_ids = user_movie_matrix.columns[recommended_idx]
    recommended_titles = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]['title'].values

    return recommended_titles

In [18]:
# 5. Performance evaluation function
def evaluate_model(predictions, true_ratings):
    rmse = mean_squared_error(true_ratings, predictions, squared=False)
    mae = mean_absolute_error(true_ratings, predictions)
    return rmse, mae

In [19]:
# 6. Model evaluation and recommendation results
# SVD performance evaluation
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()

print("\nSVD 추천 결과")
print(svd_recommendations(1, ratings, movies))

# KNN recommendation results
print("\nKNN 추천 결과")
print(knn_recommendations(1, user_movie_matrix, movies))

# NMF recommendation results
print("\nNMF 추천 결과")
print(nmf_recommendations(1, user_movie_matrix, movies))


SVD 추천 결과
["Pandora's Box", 'Galaxy Quest']

KNN 추천 결과
['Bridge to Terabithia' 'Reign Over Me']

NMF 추천 결과
['Blade Runner' 'Aliens' 'Pulp Fiction' 'D.E.B.S.']


In [20]:
# 7. Model evaluation across different algorithms
def evaluate_models(ratings_df):
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

    # SVD model evaluation
    svd = SVD()
    svd_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # KNN model evaluation
    knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': True}, verbose=False)
    knn_results = cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # Surprise NMF model evaluation
    nmf = SurpriseNMF(n_factors=20, random_state=42)
    nmf_results = cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # Results comparison
    results_df = pd.DataFrame({
        'Model': ['SVD', 'KNN', 'NMF'],
        'Mean RMSE': [np.mean(svd_results['test_rmse']),
                      np.mean(knn_results['test_rmse']),
                      np.mean(nmf_results['test_rmse'])],
        'Mean MAE': [np.mean(svd_results['test_mae']),
                     np.mean(knn_results['test_mae']),
                     np.mean(nmf_results['test_mae'])],
        'Fit Time': [np.mean(svd_results['fit_time']),
                     np.mean(knn_results['fit_time']),
                     np.mean(nmf_results['fit_time'])],
        'Test Time': [np.mean(svd_results['test_time']),
                      np.mean(knn_results['test_time']),
                      np.mean(nmf_results['test_time'])]
    })

    return results_df

In [21]:
# 8. Execute evaluation
evaluation_results = evaluate_models(ratings)
print(evaluation_results)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9021  0.8984  0.8986  0.9013  0.8880  0.8977  0.0050  
MAE (testset)     0.6931  0.6927  0.6928  0.6933  0.6855  0.6915  0.0030  
Fit time          1.36    1.33    1.32    1.33    2.04    1.48    0.28    
Test time         0.19    0.11    0.11    0.15    0.18    0.15    0.03    
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0007  0.9866  0.9865  0.9970  0.9945  0.9931  0.0057  
MAE (testset)     0.7686  0.7656  0.7637  0.7701  0.7676  0.7671  0.0023  
Fit time          0.25    0.17    0.17    0.16    0.16    0.18    0.03    
Test time         1.56    1.29    1.29    1.22    1.23    1.32    0.12    
Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (te