# Collaborative Filtering

Memory-Based Algorithm
- Item based (더 많이 함) <-- dot production없이 유사도를 기반으로 주로 활용됨
- User based


Model-Based Algorithm
- Latent Factor 협업 필터링 방법 (Matrix Factorization)

# 구글 드라이브 연결

In [1]:
import os
from google.colab import drive
drive.mount('/content/gdrive')

path = "gdrive/MyDrive/추천시스템/수업/"
os.listdir(path)

Mounted at /content/gdrive


['ratings.csv',
 'links.csv',
 'tags.csv',
 'movies.csv',
 '무비렌즈 데이터 전처리.ipynb',
 'movies_refined.csv',
 'ratings_refined.csv',
 '협업 필터링 - User-based with dot production.ipynb',
 '협업 필터링 - Item-based.ipynb',
 'user_based_cf_prediction.csv',
 '협업 필터링 - User-based.ipynb']

In [2]:
import pandas as pd
import numpy as np

# 데이터 로드

In [3]:
ratings = pd.read_csv(path + "ratings_refined.csv", usecols=['userId', 'movieId', 'rating']) # 사용할 컬럼만 적용
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100784,610,166534,4.0
100785,610,168248,5.0
100786,610,168250,5.0
100787,610,168252,5.0


In [4]:
pd.read_csv(path + "movies_refined.csv", nrows = 3) # csv 파일 내용 확인

Unnamed: 0,movieId,title,year,genres
0,1,Toy Story,1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji,1995,Adventure Children Fantasy
2,3,Grumpier Old Men,1995,Comedy Romance


In [5]:
movies = pd.read_csv(path + "movies_refined.csv", usecols=['movieId', 'title'])
movies

Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji
2,3,Grumpier Old Men
3,4,Waiting to Exhale
4,5,Father of the Bride Part II
...,...,...
9698,193581,Black Butler: Book of the Atlantic
9699,193583,No Game No Life: Zero
9700,193585,Flint
9701,193587,Bungo Stray Dogs: Dead Apple


In [6]:
df = pd.merge(ratings, movies, on='movieId', how='left')
df

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story
1,1,3,4.0,Grumpier Old Men
2,1,6,4.0,Heat
3,1,47,5.0,Seven (a.k.a. Se7en)
4,1,50,5.0,"Usual Suspects, The"
...,...,...,...,...
100784,610,166534,4.0,Split
100785,610,168248,5.0,John Wick: Chapter Two
100786,610,168250,5.0,Get Out
100787,610,168252,5.0,Logan


null 값 체크

In [7]:
df.columns[df.isna().any()].tolist()

[]

영화명 결측치 체크

In [8]:
df[df['title'].isnull()]

Unnamed: 0,userId,movieId,rating,title


In [9]:
df['title'] = df['title'].str.replace("'", '')

# Item-based CF

## 영화 유사도 행렬 준비

In [10]:
movie_user = df.pivot_table(values='rating', index='title', columns='userId')
movie_user

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer,,,,,,,,,,,...,,,,,,,,,,3.5
*batteries not included,,,,,,,,,,,...,,,,,,,,,,
...All the Marbles,,,,,,,,,,,...,,,,,,,,,,
...And Justice for All,,,,,,,,,,,...,,,,,,,,,,
00 Schneider - Jagd auf Nihil Baxter,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
night Mother,,,,,,,,,,,...,,,,,,,,,,
xXx,,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union,,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos!,4.0,,,,,,,,,,...,,,,,,,,,,


In [11]:
# 9413 x 610 행렬
# 영화를 610차원의 벡터로 보려는 것
movie_user.shape

(9413, 610)

## 결측치 처리

null값이 있으면 cosine similarity 함수가 안돌아감

하지만, null값을 0으로 치환하고 계산할경우 결과가 달라짐

(마치 해당 영화를 보고 0점을 준것으로 계산)

In [12]:
movie_user_tmp = movie_user.copy().fillna(0)
movie_user_tmp

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
*batteries not included,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...All the Marbles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And Justice for All,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00 Schneider - Jagd auf Nihil Baxter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
night Mother,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5
¡Three Amigos!,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 유사도 행렬 계산
#### 유사도 검사는 NULL 값이 있을 시 사용불가

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

movie_similarity_matrix = cosine_similarity(movie_user_tmp)
movie_similarity_matrix.shape

(9413, 9413)

원하는 항목에 대한 dot_production 하는 곳

In [58]:
# 유저-유저 영화관람 내역에 대한 유사도
movie_ids = movie_user.index

item_similarity = pd.DataFrame(movie_similarity_matrix,
                                index=movie_ids, columns=movie_ids)
print(item_similarity.shape)
item_similarity.head()

(9413, 9413)


title,(500) Days of Summer,*batteries not included,...All the Marbles,...And Justice for All,00 Schneider - Jagd auf Nihil Baxter,1-900 (06),10,10 Cent Pistol,10 Cloverfield Lane,10 Items or Less,...,[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,"burbs, The",eXistenZ,night Mother,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer,1.0,0.155663,0.0,0.0,0.161889,0.0,0.05072,0.09816,0.173843,0.125232,...,0.076961,0.100164,0.0,0.099735,0.097935,0.0,0.276512,0.156764,0.169385,0.0
*batteries not included,0.155663,1.0,0.0,0.0,0.0,0.0,0.142816,0.276395,0.050778,0.0,...,0.0,0.0,0.0,0.248271,0.132489,0.0,0.019862,0.0,0.249586,0.0
...All the Marbles,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
...And Justice for All,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.031944,0.09673,0.447214,0.0,0.0,0.080502,0.0
00 Schneider - Jagd auf Nihil Baxter,0.161889,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# 데이터프레임 df에서 'title' 열의 중복 제거
unique_titles_df = df.drop_duplicates(subset='title')

# unique_titles_df의 'title'과 item_similarity의 인덱스를 비교하여 일치하는지 확인
matching_titles = unique_titles_df['title'].isin(item_similarity.index)

# 일치하지 않는 제목이 있는지 확인
non_matching_titles = unique_titles_df[~matching_titles]

# 일치하는 제목과 일치하지 않는 제목의 개수 출력
print(f"Matching titles count: {matching_titles.sum()}")
print(f"Non-matching titles count: {non_matching_titles.shape[0]}")

# 일치하지 않는 제목이 있다면, 그 제목들을 출력
if non_matching_titles.shape[0] > 0:
    print("Titles in df not in item_similarity index:")
    print(non_matching_titles['title'].tolist())


Matching titles count: 9413
Non-matching titles count: 0


In [16]:
df

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story
1,1,3,4.0,Grumpier Old Men
2,1,6,4.0,Heat
3,1,47,5.0,Seven (a.k.a. Se7en)
4,1,50,5.0,"Usual Suspects, The"
...,...,...,...,...
100784,610,166534,4.0,Split
100785,610,168248,5.0,John Wick: Chapter Two
100786,610,168250,5.0,Get Out
100787,610,168252,5.0,Logan


평점 데이터 기반

In [32]:
from tqdm.notebook import tqdm

def modeling(similarity_matrix, data):
    df_pred_all = pd.DataFrame()
    all_users = sorted(data['userId'].unique())
    titles = similarity_matrix.columns
    n_titles = len(titles)

    for user_id in tqdm(all_users):
        idx = data[data['userId'] == user_id].index

        # 유사도
        watched_movie = data.loc[idx, 'title'].tolist()
        sub_sim_mat = similarity_matrix.loc[watched_movie]
        sub_sim_mat = sub_sim_mat.T.to_numpy()
        sim_N = np.sum(sub_sim_mat, axis=1) + 1


        # 평점 예측
        watched_movie_y = data.loc[idx, 'rating']
        watched_movie_y = np.array(watched_movie_y.tolist()).reshape(-1, 1)

        pred_y = np.matmul(sub_sim_mat, watched_movie_y).flatten() / sim_N

        user_list = [user_id] * n_titles
        cur_pred = pd.DataFrame(zip(titles, user_list, pred_y),
                                columns=['title', 'userId', 'pred_rating'])

        # 결과 기록
        df_pred_all = pd.concat([df_pred_all, cur_pred], axis=0)
    return df_pred_all

# df_pred_all = modeling(item_similarity, df)
# df_pred_all

In [33]:
df_pred_all = modeling(item_similarity, df)
df_pred_all

  0%|          | 0/610 [00:00<?, ?it/s]

Unnamed: 0,title,userId,pred_rating
0,(500) Days of Summer,1,4.247019
1,*batteries not included,1,4.224252
2,...All the Marbles,1,2.701572
3,...And Justice for All,1,4.163832
4,00 Schneider - Jagd auf Nihil Baxter,1,3.891061
...,...,...,...
9408,night Mother,610,3.860681
9409,xXx,610,3.613897
9410,xXx: State of the Union,610,3.573156
9411,¡Three Amigos!,610,3.818830


In [36]:
from sklearn.model_selection import train_test_split

_, test_data = train_test_split(df[['title', 'userId', 'rating']],
                test_size=20000, random_state=1234, stratify=df['userId']) # 2만개 샘플링
test_data

Unnamed: 0,title,userId,rating
72039,Escape from L.A.,464,3.0
90528,Carlitos Way,590,4.0
20360,"Nightmare Before Christmas, The",135,4.0
42826,"Naked Gun 2 1/2: The Smell of Fear, The",288,4.0
37923,Guardians of the Galaxy 2,258,3.5
...,...,...,...
94958,Thor: Ragnarok,599,3.0
63609,Revenge of the Nerds II: Nerds in Paradise,414,2.0
68727,"Wild Bunch, The",448,5.0
38371,Gosford Park,263,4.5


In [37]:
print(test_data['title'].dtype, df_pred_all['title'].dtype)
print(test_data['userId'].dtype, df_pred_all['userId'].dtype)


object object
int64 int64


In [38]:
test_data = pd.merge(test_data, df_pred_all, on=['title', 'userId'], how='left') # userid와 title을 기준으로 합치기
test_data

Unnamed: 0,title,userId,rating,pred_rating
0,Escape from L.A.,464,3.0,3.625396
1,Carlitos Way,590,4.0,3.403446
2,"Nightmare Before Christmas, The",135,4.0,3.665502
3,"Naked Gun 2 1/2: The Smell of Fear, The",288,4.0,3.180508
4,Guardians of the Galaxy 2,258,3.5,3.517372
...,...,...,...,...
19995,Thor: Ragnarok,599,3.0,2.696064
19996,Revenge of the Nerds II: Nerds in Paradise,414,2.0,3.396107
19997,"Wild Bunch, The",448,5.0,2.850886
19998,Gosford Park,263,4.5,3.714964


In [39]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

true_y = np.array(test_data['rating'])
pred_y = np.array(test_data['pred_rating'])

mae = mean_absolute_error(y_true=true_y, y_pred=pred_y)
mse = mean_squared_error(y_true=true_y, y_pred=pred_y)
rmse = np.sqrt(mse)

print(f"MAE  : {str(round(mae, 2))}")
print(f"MSE  : {str(round(mse, 2))}")
print(f"RMSE : {str(round(rmse, 2))}")

MAE  : 0.7
MSE  : 0.8
RMSE : 0.89


## ALS 알고리즘을 이용한 추천

In [72]:
!pip install pyspark
!pip install graphframes

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=1e21f91a15a5202c016bd941bd093f85ea50f29407b3680e964859d75bab7030
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
!curl -L -o "/usr/local/lib/python3.6/dist-packages/pyspark/jars/graphframes-0.8.2-spark3.2-s_2.12.jar" http://dl.bintray.com/spark-packages/maven/graphframes/graphframes/0.8.2-spark3.2-s_2.12/graphframes-0.8.2-spark3.2-s_2.12.jar

In [None]:
!apt-get install openjdk-8-jdk-headless -qq

In [78]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from sklearn.model_selection import train_test_split
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ALS Example") \
    .getOrCreate()

# user_id, product_id, total_score 3개의 열만 추출하여 Dataframe 생성
df_data = spark.createDataFrame(df_pred_all)


In [82]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# 사용자 ID와 아이템 ID(예: 영화 제목)를 숫자형으로 변환
user_indexer = StringIndexer(inputCol="userId", outputCol="user_id")
item_indexer = StringIndexer(inputCol="title", outputCol="item_id")
# Pipeline 생성
pipeline = Pipeline(stages=[user_indexer, item_indexer])
# 데이터 변환 실행
transformed_data = pipeline.fit(df_data).transform(df_data)
train_spark, test_spark = transformed_data.randomSplit([0.8, 0.2])


In [87]:
train_spark

DataFrame[title: string, userId: bigint, pred_rating: double, user_id: double, item_id: double]

In [90]:
# 훈련 데이터와 테스트 데이터로 분할
rec = ALS(maxIter=10,
         regParam=0.01,
         userCol='user_id',
         itemCol='item_id',
         ratingCol='pred_rating',
         nonnegative=True,
         coldStartStrategy='drop')
# ALS모델 학습
rec_model = rec.fit(train_spark)

# transform을 이용해 예측
prediction = rec_model.transform(test_spark)

In [92]:
# 예측 성능 평가(rmse)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="pred_rating", predictionCol="prediction")

# evaluate 메소드에 예측값 담겨있는 dataframe 넣어주기
rmse = evaluator.evaluate(prediction)

print("RMSE:", rmse)

RMSE: 0.1766036223591793


Coverage

3점 이상인 경우 영화를 봤다고 가정 (혹은 추천했다고 가정)

In [40]:
# 모델이 추천한 영화 개수
n_recommends = sum(1 * (pred_y > 4.0))
n_recommends

2193

In [41]:
n_movies = df['title'].nunique()

In [42]:
# Coverage
n_recommends / n_movies

0.23297567194305746

In [43]:
def get_precision(true_y, pred_y, threshold): # 정확도 계산
    trues = 1 * (true_y >= threshold)
    n_trues = sum(trues)

    pred_trues = 1 * (pred_y >= threshold)

    true_positive = sum(trues + pred_trues == 2)

    precision = true_positive / n_trues

    return precision

In [44]:
get_precision(true_y, pred_y, 3)

0.9151100425374514

In [45]:
df_pred_all.to_csv(path + "user_based_cf_prediction.csv", index=False)

In [57]:
# 높은수록 비슷한 유저
# 가장 높은 유저는 본인 자신 [0] index 제외
movie_id = 'eXistenZ'
item_similarity[movie_id].sort_values(ascending=False)[1:10]

title
Happy Accidents                  0.499923
Dummy                            0.433494
Sleepy Hollow                    0.421749
Quills                           0.411991
Shape of Things, The             0.411982
Metropolis                       0.408068
Vanishing, The                   0.406521
Amores Perros (Loves a Bitch)    0.402072
Doom Generation, The             0.401398
Name: eXistenZ, dtype: float64

## 데이터 프레임화

In [50]:
movie_titles = movie_user.index
movie_titles

Index(['(500) Days of Summer', '*batteries not included', '...All the Marbles',
       '...And Justice for All', '00 Schneider - Jagd auf Nihil Baxter',
       '1-900 (06)', '10', '10 Cent Pistol', '10 Cloverfield Lane',
       '10 Items or Less',
       ...
       '[REC]²', '[REC]³ 3 Génesis',
       'anohana: The Flower We Saw That Day - The Movie', 'burbs, The',
       'eXistenZ', 'night Mother', 'xXx', 'xXx: State of the Union',
       '¡Three Amigos!', 'À nous la liberté (Freedom for Us)'],
      dtype='object', name='title', length=9413)

Arr을 dataframe으로 변경

# 영화 유사도 기반 추천

In [60]:
# 샘플 사용자
user = 1
# 샘플 사용자가 본 영화 중 임의로 한 개 선택
movie_title = df.loc[df['userId'] == user, 'title'][0] # 해당 유저가 본 영화를 한 개 선택
movie_title

'Toy Story'

In [61]:
# 사용자가 본 영화와 유사도가 높은 영화 10개 추천 함수
def get_recomendation(title):
    return item_similarity[title].sort_values(ascending=False)[1:11] # 본인을 포함하지 않았기에 1부터

In [62]:
get_recomendation(movie_title)

title
Toy Story 2                                   0.572601
Jurassic Park                                 0.565637
Independence Day (a.k.a. ID4)                 0.564262
Star Wars: Episode IV - A New Hope            0.557388
Forrest Gump                                  0.547096
Lion King, The                                0.541145
Star Wars: Episode VI - Return of the Jedi    0.541089
Mission: Impossible                           0.538913
Groundhog Day                                 0.534169
Back to the Future                            0.530381
Name: Toy Story, dtype: float64

In [66]:
# 사용자가 안 본 영화 중 예측 평점 높은 10개 추천 (아이템 기반)
def get_recomendation_item_based(user_id):
    # 사용자가 평가한 영화
    user_rated_movies = df[df['userId'] == user_id]['title'].tolist()

    # 사용자에 대한 모든 영화의 예측 평점
    user_pred_ratings = df_pred_all[df_pred_all['userId'] == user_id]

    # 사용자가 평가하지 않은 영화만 필터링
    unrated_movies = user_pred_ratings[~user_pred_ratings['title'].isin(user_rated_movies)]

    # 예측 평점이 높은 상위 10개 영화 추천
    recommended_movies = unrated_movies.sort_values(by='pred_rating', ascending=False).head(10)

    return recommended_movies['title'].tolist()


In [71]:
user_id = 1
get_recomendation_item_based(user_id)

['Unforgiven',
 'Batman Begins',
 'Lord of the Rings: The Two Towers, The',
 'Dark Knight, The',
 'Lord of the Rings: The Return of the King, The',
 'Ferris Buellers Day Off',
 'This Is Spinal Tap',
 'Star Trek',
 'Lord of the Rings: The Fellowship of the Ring, The',
 'Royal Tenenbaums, The']

In [None]:
from tqdm.notebook import tqdm

def modeling_item_based(item_similarity_matrix, data):
    df_pred_all = pd.DataFrame()
    all_users = sorted(data['userId'].unique())
    titles = item_similarity_matrix.columns
    n_titles = len(titles)
    for user_id in tqdm(all_users):
        # 사용자가 평가한 영화
        watched_movies = data[data['userId'] == user_id]['movieId'].tolist()

        # 사용자가 평가하지 않은 영화
        unwatched_movies = item_similarity_matrix.columns.difference(watched_movies)

        for movie_id in unwatched_movies:
            # 영화 간 유사도
            movie_similarity = item_similarity_matrix.loc[watched_movies, movie_id]
            # 사용자의 평점
            user_ratings = data[(data['userId'] == user_id) & (data['movieId'].isin(watched_movies))]['rating']

            # 평점 예측
            pred_rating = np.dot(movie_similarity, user_ratings) / movie_similarity.sum()

            # 결과 기록
            df_pred_all = pd.concat([df_pred_all, pd.DataFrame({'title': movie_id, 'userId': user_id, 'pred_rating': pred_rating}, index=[0])], axis=0)

    return df_pred_all

In [None]:
df_pred_all = modeling_item_based(item_similarity, df)
df_pred_all