### 잠재요인 협업 필터링

In [1]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095450 sha256=7c338431020e94c90fb192568000107c10d82227c5f120ee635742f8472327fa
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
# Upload the data files (ratings.csv, movies.csv, ratings_noh.csv) into the Colab runtime
from google.colab import files
up = files.upload()

Saving ratings.csv to ratings.csv
Saving movies.csv to movies.csv
Saving ratings_noh.csv to ratings_noh.csv


In [3]:
import pandas as pd
# Load the ratings table and preview its first rows
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Total number of ratings, number of distinct users, number of distinct movies
len(ratings), ratings['userId'].nunique(), ratings['movieId'].nunique()

(100836, 610, 9724)

#### 모델 생성 및 학습

In [7]:
from surprise import SVD, Reader
from surprise.dataset import DatasetAutoFolds
# Describe the layout of each line in the ratings file: four comma-separated
# fields, with ratings on a 0.5-5 scale.
reader = Reader(
    line_format='user item rating timestamp',
    sep=',',
    rating_scale=(0.5, 5),
)
# Read the ratings file through the reader defined above
data_folds = DatasetAutoFolds(ratings_file='ratings_noh.csv', reader=reader)

In [8]:
# Build the full training set from data_folds
trainset = data_folds.build_full_trainset()

In [10]:
# SVD model with 50 latent factors, trained for 20 epochs with a fixed random_state
model = SVD(n_epochs=20, n_factors=50, random_state=2023)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f3a22b5eda0>

In [12]:
# Load the movie metadata and show the row whose movieId is 42
mdf = pd.read_csv('movies.csv')
mdf.loc[mdf['movieId'] == 42]

Unnamed: 0,movieId,title,genres
38,42,Dead Presidents (1995),Action|Crime|Drama


In [14]:
# Check whether the user with userId 9 has seen the movie with movieId 42
movieIds = ratings.loc[ratings['userId'] == 9, 'movieId']
movieIds.loc[movieIds == 42].count()

0

In [15]:
# movieIds is indexed by the row labels of `ratings`, not by movie id, so
# `movieIds[42]` would look up row label 42 rather than movie 42. Test the
# values instead, and avoid a bare `except` that would hide unrelated errors.
if not (movieIds == 42).any():
  print('42번 영화를 보지 않았습니다.')

42번 영화를 보지 않았습니다.


In [16]:
# Predict the rating of user 9 for movie 42; the ids are passed as strings and
# verbose=True prints the prediction
pred = model.predict(uid='9',iid='42',verbose=True)

user: 9          item: 42         r_ui = None   est = 2.96   {'was_impossible': False}


In [17]:
# Display the Prediction object held in `pred`
pred

Prediction(uid='9', iid='42', r_ui=None, est=2.958615033183178, details={'was_impossible': False})

- 사용자 id 9번이 보지 않은 영화 중에서 예상 평점이 가장 높은 Top 10

In [18]:
# Split the catalogue into the movies user 9 has already rated and the rest.
total_movies = mdf.movieId.tolist()
seen_movies = ratings[ratings.userId == 9]['movieId'].tolist()
# isin() performs a hashed membership test instead of scanning the seen_movies
# list once per catalogue entry; the catalogue order is preserved.
unseen_movies = mdf.loc[~mdf.movieId.isin(seen_movies), 'movieId'].tolist()
len(seen_movies), len(unseen_movies)

(46, 9696)

In [19]:
# Predict user 9's rating for every unseen movie. A comprehension keeps the
# loop variables local, so the notebook-level `pred` from the single-movie
# prediction cell is not overwritten.
predictions = [model.predict(uid='9', iid=str(mid)) for mid in unseen_movies]
predictions[:5]

[Prediction(uid='9', iid='1', r_ui=None, est=3.4776519829512216, details={'was_impossible': False}),
 Prediction(uid='9', iid='2', r_ui=None, est=3.2112541390330303, details={'was_impossible': False}),
 Prediction(uid='9', iid='3', r_ui=None, est=2.8472928958780246, details={'was_impossible': False}),
 Prediction(uid='9', iid='4', r_ui=None, est=2.729956004026319, details={'was_impossible': False}),
 Prediction(uid='9', iid='5', r_ui=None, est=2.610270730107314, details={'was_impossible': False})]

In [20]:
def sortkey_est(pred):
    """Sort key: return the estimated rating (``est``) of a prediction."""
    estimated_rating = pred.est
    return estimated_rating

In [21]:
# Order the predictions from highest to lowest estimated rating
predictions = sorted(predictions, key=sortkey_est, reverse=True)
predictions[:5]

[Prediction(uid='9', iid='2959', r_ui=None, est=4.103662366606615, details={'was_impossible': False}),
 Prediction(uid='9', iid='7361', r_ui=None, est=4.095549306544156, details={'was_impossible': False}),
 Prediction(uid='9', iid='858', r_ui=None, est=4.0944254123065456, details={'was_impossible': False}),
 Prediction(uid='9', iid='260', r_ui=None, est=4.093231006166612, details={'was_impossible': False}),
 Prediction(uid='9', iid='246', r_ui=None, est=4.062247437031265, details={'was_impossible': False})]

In [22]:
# Take the ten highest-rated predictions and collect their id, estimate and title
top_movie_ids = [int(p.iid) for p in predictions[:10]]
top_movie_ratings = [p.est for p in predictions[:10]]
top_movie_titles = [
    mdf.loc[mdf['movieId'] == movie_id, 'title'].values[0]
    for movie_id in top_movie_ids
]

In [24]:
# Tabulate the top-10 recommendations, indexed by movieId
df = pd.DataFrame(
    {'영화명': top_movie_titles, '예상평점': top_movie_ratings},
    index=pd.Index(top_movie_ids, name='movieId'),
)
df

Unnamed: 0_level_0,영화명,예상평점
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2959,Fight Club (1999),4.103662
7361,Eternal Sunshine of the Spotless Mind (2004),4.095549
858,"Godfather, The (1972)",4.094425
260,Star Wars: Episode IV - A New Hope (1977),4.093231
246,Hoop Dreams (1994),4.062247
1196,Star Wars: Episode V - The Empire Strikes Back...,4.059472
7153,"Lord of the Rings: The Return of the King, The...",4.058896
1208,Apocalypse Now (1979),4.051314
6711,Lost in Translation (2003),4.044588
318,"Shawshank Redemption, The (1994)",4.039564


In [27]:
# Look up a movie by its exact title
mdf.loc[mdf['title'] == 'Godfather, The (1972)']

Unnamed: 0,movieId,title,genres
659,858,"Godfather, The (1972)",Crime|Drama


- 사용자 ID를 제공하면 추천 영화를 반환하는 함수

In [28]:
def get_top10_movies(uid):      # uid: integer user id
  """Return the 10 unrated movies with the highest predicted rating for a user.

  Parameters
  ----------
  uid : int
      User id as it appears in ``ratings.userId`` (a string of digits is
      also accepted and converted with ``int``).

  Returns
  -------
  pandas.DataFrame
      Indexed by ``movieId`` with the columns '영화명' (title) and '예상평점'
      (estimated rating), ordered by estimated rating, highest first.

  Notes
  -----
  Reads the notebook-level ``mdf``, ``ratings``, ``model`` and ``sortkey_est``.
  """
  # ratings.userId holds integers, so compare against an int. Comparing against
  # str(uid) never matches, which left seen_movies empty and allowed movies
  # the user had already rated to be recommended.
  seen_movies = set(ratings.loc[ratings.userId == int(uid), 'movieId'].tolist())
  unseen_movies = [movie for movie in mdf.movieId.tolist() if movie not in seen_movies]

  # The model is queried with string ids, hence str() on both ids.
  predictions = [model.predict(uid=str(uid), iid=str(mid)) for mid in unseen_movies]
  predictions.sort(key=sortkey_est, reverse=True)

  top_predictions = predictions[:10]
  top_movie_ids = [int(pred.iid) for pred in top_predictions]
  top_movie_ratings = [pred.est for pred in top_predictions]
  top_movie_titles = [mdf[mdf.movieId == mid]['title'].values[0] for mid in top_movie_ids]

  df = pd.DataFrame({
      '영화명': top_movie_titles,
      '예상평점': top_movie_ratings
  }, index=top_movie_ids)
  df.index.name = 'movieId'
  return df


In [29]:
# Top-10 recommendations for the user with id 100
get_top10_movies(100)

Unnamed: 0_level_0,영화명,예상평점
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
904,Rear Window (1954),4.694956
1258,"Shining, The (1980)",4.66172
898,"Philadelphia Story, The (1940)",4.661173
475,In the Name of the Father (1993),4.647624
1945,On the Waterfront (1954),4.634245
3275,"Boondock Saints, The (2000)",4.629319
5690,Grave of the Fireflies (Hotaru no haka) (1988),4.622983
58559,"Dark Knight, The (2008)",4.61544
2324,Life Is Beautiful (La Vita è bella) (1997),4.613965
1204,Lawrence of Arabia (1962),4.605477
