<a href="https://colab.research.google.com/github/KevinTheRainmaker/Recommendation_Algorithms/blob/main/Compact_MF_using_module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/data/movielens'
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')

!pip install -q matrix-factorization
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split

def get_svd(s_matrix, k=300):
  u, s, vt = np.linalg.svd(s_matrix.transpose()) # left singular vector u / right singular vector vt
  S = s[:k] * np.identity(k, np.float)
  T = u[:, :k]
  Dt = vt[:k, :]

  item_factors = np.transpose(np.matmul(S,Dt))
  user_factors = np.transpose(T)

  return item_factors, user_factors

def evaluate(test_df, prediction_result_df):
  groups_with_movie_ids = test_df.groupby(by='movieId')
  groups_with_user_ids = test_df.groupby(by='userId')
  intersection_movie_ids = sorted(list(set(list(prediction_result_df.columns)).intersection(set(list(groups_with_movie_ids.indices.keys())))))
  intersection_user_ids = sorted(list(set(list(prediction_result_df.index)).intersection(set(groups_with_user_ids.indices.keys()))))

  print(len(intersection_movie_ids))
  print(len(intersection_user_ids))

  compressed_prediction_df = prediction_result_df.loc[intersection_user_ids][intersection_movie_ids]

  # test_df에 대해서 RMSE 계산
  grouped = test_df.groupby(by='userId')
  rmse_df = pd.DataFrame(columns=['rmse'])
  for userId, group in tqdm(grouped):
      if userId in intersection_user_ids:
          pred_ratings = compressed_prediction_df.loc[userId][compressed_prediction_df.loc[userId].index.intersection(list(group['movieId'].values))]
          pred_ratings = pred_ratings.to_frame(name='rating').reset_index().rename(columns={'index':'movieId','rating':'pred_rating'})
          actual_ratings = group[['rating', 'movieId']].rename(columns={'rating':'actual_rating'})

          final_df = pd.merge(actual_ratings, pred_ratings, how='inner', on=['movieId'])
          final_df = final_df.round(4) # 반올림
          
          if not final_df.empty:
            rmse = sqrt(mean_squared_error(final_df['actual_rating'], final_df['pred_rating']))
            rmse_df.loc[userId] = rmse

  return final_df, rmse_df

def find_best_k(sparse_matrix, maximum_k=100, minimum_k = 50, step = 10):
    print("\nFind best optimized k for Matrix Factorization")
    k_candidates = np.arange(minimum_k, maximum_k, step)
    final_df = pd.DataFrame(columns=['rmse'], index=k_candidates)
    for k in tqdm(k_candidates):
        item_factors, user_factors = get_svd(sparse_matrix, k)
        each_results_df = pd.DataFrame(np.matmul(item_factors, user_factors),
                                    columns=sparse_matrix.columns.values, index=sparse_matrix.index.values)
        each_results_df = each_results_df.transpose()
        
        result_df, _ = evaluate(test_df, each_results_df)
        each_rmse = sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))

        final_df.loc[k]['rmse'] = each_rmse
    
    return final_df.sort_values(by = 'rmse').index[0]

train_df, test_df = train_test_split(ratings_df, test_size=0.2)
train_df.rename(columns={"userId": "user_id", "movieId": "item_id"}, inplace = True)

sparse_matrix = train_df.groupby('item_id').apply(lambda x : pd.Series(x['rating'].values, index=x['user_id'])).unstack()
sparse_matrix.index.name = 'item_id'

# fill sparse matrix with average of movie ratings
# sparse_matrix_withmovie = sparse_matrix.apply(lambda x: x.fillna(x.mean()), axis=1)

# fill sparse matrix with average of user ratings
sparse_matrix_withuser = sparse_matrix.apply(lambda x: x.fillna(x.mean()), axis=0)

item_factors, user_factors = get_svd(sparse_matrix_withuser)
prediction_result_df = pd.DataFrame(np.matmul(item_factors, user_factors),
                                    columns=sparse_matrix_withuser.columns.values,
                                    index=sparse_matrix_withuser.index.values)

user_prediction_result_df = prediction_result_df.transpose()

best_k = find_best_k(sparse_matrix_withuser, maximum_k = 600, minimum_k = 100, step = 50)

print(f'\n----Best number of factors: {best_k}----\n')

(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update,
) = train_update_test_split(train_df, frac_new_users=0.2)

# Initial training
matrix_fact = KernelMF(n_epochs=20, n_factors=best_k, verbose=0, lr=0.001, reg=0.005)
matrix_fact.fit(X_train_initial, y_train_initial)

# Update model with new users
matrix_fact.update_users(
    X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1
)

# Get recommendations
user = 1
users = [x for x in range(2, 611)]
items_known = X_train_initial.query("user_id == @user")["item_id"]
result = matrix_fact.recommend(user=1, items_known=items_known)
for user in tqdm(users):
  items_known = X_train_initial.query("user_id == @user")["item_id"]
  temp = matrix_fact.recommend(user=user, items_known=items_known)
  result = pd.concat([result, temp], axis = 1)

# Save
result.to_csv('./drive/MyDrive/data/trial_result.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Find best optimized k for Matrix Factorization


  0%|          | 0/10 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]

4349
609


  0%|          | 0/609 [00:00<?, ?it/s]


----Best number of factors: 100----

Epoch  1 / 20  -  train_rmse: 0.9196027604063988
Epoch  2 / 20  -  train_rmse: 0.9065035594993474
Epoch  3 / 20  -  train_rmse: 0.8971463294224166
Epoch  4 / 20  -  train_rmse: 0.8901101305252521
Epoch  5 / 20  -  train_rmse: 0.8845719052973886
Epoch  6 / 20  -  train_rmse: 0.8800529381608354
Epoch  7 / 20  -  train_rmse: 0.8762569941178285
Epoch  8 / 20  -  train_rmse: 0.8729908847115703
Epoch  9 / 20  -  train_rmse: 0.8701246508332053
Epoch  10 / 20  -  train_rmse: 0.8675777572536201
Epoch  11 / 20  -  train_rmse: 0.8652751260053237
Epoch  12 / 20  -  train_rmse: 0.8631843580578711
Epoch  13 / 20  -  train_rmse: 0.8612587873330475
Epoch  14 / 20  -  train_rmse: 0.859463026886017
Epoch  15 / 20  -  train_rmse: 0.8577856138747881
Epoch  16 / 20  -  train_rmse: 0.8562091782241883
Epoch  17 / 20  -  train_rmse: 0.8547204358909141
Epoch  18 / 20  -  train_rmse: 0.8533062933909031
Epoch  19 / 20  -  train_rmse: 0.851956222552311
Epoch  20 / 20  -  trai

  0%|          | 0/609 [00:00<?, ?it/s]