surprise 모듈을 설치


In [None]:
# %conda install -c conda-forge numpy pandas seaborn scikit-learn scikit-surprise



# 데이터 로딩


In [2]:
from surprise import Dataset
from surprise import Reader

from collections import defaultdict
import numpy as np
import pandas as pd

In [3]:
movies = pd.read_csv("data/movies.csv")
ratings = pd.read_csv("data/ratings.csv")

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
movie_ratings = pd.merge(movies, ratings, left_on='movieId', right_on='movieId')

In [7]:
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0,851866703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0,938629179
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0,1331380058
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0,997938310
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,855190091


In [8]:
movies.shape

(9125, 3)

In [9]:
ratings.shape

(100004, 4)

In [10]:
movie_ratings.shape

(100004, 6)

In [11]:
def getMovieName(movie_ratings, movieID):
    return movie_ratings[movie_ratings["movieId"] == movieID][["title", "genres"]].values[0]

def getMovieID(movie_ratings, movieName):
    return movie_ratings[movie_ratings["title"] == movieName][["movieId", "genres"]].values[0]

## 영화 데이터를 surprise 모듈을 통해 로딩


In [13]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file("data/ratings.csv", reader=reader)

In [14]:
from surprise import KNNBasic
import heapq
from collections import defaultdict

In [15]:
# 데이터를 훈련용과 테스트용으로 나누지 말고 모두 리턴
trainSet = data.build_full_trainset()

In [16]:
sim_options = {
    'name': 'cosine',
    'user_based': False
}

# 아이템 기반과 코사인 유사도 기반으로 동작하는 모델 생성
model = KNNBasic(sim_options=sim_options)
# 데이터로부터 아이템 벡터 생성
model.fit(trainSet)
# 아이템간의 유사도 행렬 계산
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [17]:
testUserId = '85'
k = 10

In [21]:
pd.set_option('display.max_colwidth',1)
display(movie_ratings[movie_ratings["userId"] == int(testUserId)].sort_values("rating", ascending=False))

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
263,2,Jumanji (1995),Adventure|Children|Fantasy,85,5.0,837511784
7443,255,"Jerky Boys, The (1995)",Comedy,85,5.0,837512635
19363,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,85,5.0,837511448
17405,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,85,5.0,837507044
14386,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,85,5.0,837511743
...,...,...,...,...,...,...
16429,546,Super Mario Bros. (1993),Action|Adventure|Children|Comedy|Fantasy|Sci-Fi,85,1.0,837512482
11769,366,"Wes Craven's New Nightmare (Nightmare on Elm Street Part 7: Freddy's Finale, A) (1994)",Drama|Horror|Mystery|Thriller,85,1.0,837512580
12156,374,Richie Rich (1994),Children|Comedy,85,1.0,837512723
8420,291,Poison Ivy II (1996),Drama|Thriller,85,1.0,837512611


In [24]:
testUserInnerID = trainSet.to_inner_uid(testUserId)

# 이 사용자가 좋아한 영화 k개를 읽어서 kNeighbors에 저장
testUserRatings = trainSet.ur[testUserInnerID]
kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])

In [25]:
kNeighbors

[(650, 5.0),
 (20, 5.0),
 (27, 5.0),
 (4206, 5.0),
 (387, 5.0),
 (49, 5.0),
 (423, 5.0),
 (99, 5.0),
 (145, 5.0),
 (55, 5.0)]

In [28]:
# 이 사용자 좋아한 영화들과 비슷한 영화를 찾아서 candidates에 유사도 가중치를 곱해서 저장
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        candidates[innerID] += score * (rating)

In [29]:
# 사용자가 이미 본 아이템들을 기록
watched = {}
for itemID, rating in trainSet.ur[testUserInnerID]:
    watched[itemID] = 1

In [30]:
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=lambda t: t[1], reverse=True):
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        print(movieID, getMovieName(movie_ratings, int(movieID)), ratingSum)
        pos += 1
        if (pos > 10):
            break

3136 ['James Dean Story, The (1957)' 'Documentary'] 50.0
2607 ['Get Real (1998)' 'Drama|Romance'] 49.93620560356324
259 ['Kiss of Death (1995)' 'Crime|Drama|Thriller'] 49.83440938875971
998 ['Set It Off (1996)' 'Action|Crime'] 49.818661078285594
1935 ['How Green Was My Valley (1941)' 'Drama|Musical|Romance'] 49.71992040532635
1440 ['Amos & Andrew (1993)' 'Comedy'] 49.69868472501265
269 ['My Crazy Life (Mi vida loca) (1993)' 'Drama'] 49.691452437730206
988 ['Grace of My Heart (1996)' 'Comedy|Drama'] 49.631279483226095
2068 ['Fanny and Alexander (Fanny och Alexander) (1982)'
 'Drama|Fantasy|Mystery'] 49.628498357279526
896 ['Wild Reeds (Les roseaux sauvages) (1994)' 'Drama'] 49.58113202209386
2626 ['Edge of Seventeen (1998)' 'Comedy|Drama|Romance'] 49.565143823458385


In [33]:
def recommendForItem(userID):
    testUserInnerID = trainSet.to_inner_uid(userID)
    testUserRatings = trainSet.ur[testUserInnerID]

    kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])

    candidates = defaultdict(float)
    for itemID, rating in kNeighbors:
        similarityRow = simsMatrix[itemID]
        for innerID, score in enumerate(similarityRow):
            candidates[innerID] += score * (rating)

    watched = {}
    for itemID, rating in trainSet.ur[testUserInnerID]:
        watched[itemID] = 1

    pos = 0
    for itemID, ratingSum in sorted(candidates.items(), key=lambda t: t[1], reverse=True):
        if not itemID in watched:
            movieID = trainSet.to_raw_iid(itemID)
            print(movieID, getMovieName(movie_ratings, int(movieID)), ratingSum)
            pos += 1
            if (pos > 10):
                break

In [34]:
recommendForItem('85')

3136 ['James Dean Story, The (1957)' 'Documentary'] 50.0
2607 ['Get Real (1998)' 'Drama|Romance'] 49.93620560356324
259 ['Kiss of Death (1995)' 'Crime|Drama|Thriller'] 49.83440938875971
998 ['Set It Off (1996)' 'Action|Crime'] 49.818661078285594
1935 ['How Green Was My Valley (1941)' 'Drama|Musical|Romance'] 49.71992040532635
1440 ['Amos & Andrew (1993)' 'Comedy'] 49.69868472501265
269 ['My Crazy Life (Mi vida loca) (1993)' 'Drama'] 49.691452437730206
988 ['Grace of My Heart (1996)' 'Comedy|Drama'] 49.631279483226095
2068 ['Fanny and Alexander (Fanny och Alexander) (1982)'
 'Drama|Fantasy|Mystery'] 49.628498357279526
896 ['Wild Reeds (Les roseaux sauvages) (1994)' 'Drama'] 49.58113202209386
2626 ['Edge of Seventeen (1998)' 'Comedy|Drama|Romance'] 49.565143823458385


### 한방에 협업 필터링 아이템


In [1]:
from surprise import Dataset, Reader
from collections import defaultdict
import numpy as np
import pandas as pd
from surprise import KNNBasic
import heapq

# 영화와 평점 데이터 로드
movies = pd.read_csv("data/movies.csv")
ratings = pd.read_csv("data/ratings.csv")

# 영화와 평점 데이터를 병합하여 영화 이름과 장르를 쉽게 조회할 수 있도록 준비
movie_ratings = pd.merge(movies, ratings, left_on='movieId', right_on='movieId')

# 영화 ID를 입력받아 영화 이름과 장르를 반환하는 함수
def getMovieName(movie_ratings, movieID):
    return movie_ratings[movie_ratings["movieId"] == movieID][["title", "genres"]].values[0]

# 영화 이름을 입력받아 영화 ID와 장르를 반환하는 함수
def getMovieID(movie_ratings, movieName):
    return movie_ratings[movie_ratings["title"] == movieName][["movieId", "genres"]].values[0]

# Surprise Reader를 통해 ratings.csv 파일을 Surprise 데이터 형식으로 로드
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file("data/ratings.csv", reader=reader)

# 데이터셋을 훈련 세트로 변환 (모든 데이터를 훈련에 사용)
trainSet = data.build_full_trainset()

# 유사도 계산 옵션 설정 - 아이템 기반과 코사인 유사도 사용
sim_options = {
    'name': 'cosine',    # 유사도 측정 방식: 코사인 유사도
    'user_based': False  # False로 설정하여 아이템 기반 협업 필터링 사용 (사용자 기반을 사용하려면 True로 설정)
}

# 아이템 기반 코사인 유사도 모델 생성 및 학습
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)

# 아이템 간 유사도 행렬 계산
simsMatrix = model.compute_similarities()

# 추천을 받을 사용자 ID와 고려할 이웃 수 설정
testUserId = '85'
k = 10

# Surprise 내부 ID로 사용자 ID 변환
testUserInnerID = trainSet.to_inner_uid(testUserId)

# 해당 사용자가 평가한 영화 중 평점이 높은 상위 k개 영화를 선택하여 kNeighbors에 저장
testUserRatings = trainSet.ur[testUserInnerID]
kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])

# 사용자가 좋아한 영화와 유사한 영화를 찾기 위해 가중치 합산
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    # 각 이웃 영화의 유사도 행을 가져와서 유사도 점수와 사용자의 평점을 곱해 가중치 합산
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        candidates[innerID] += score * (rating)

# 사용자가 이미 본 영화를 기록하여 중복 추천 방지
watched = {}
for itemID, rating in trainSet.ur[testUserInnerID]:
    watched[itemID] = 1

# candidates에서 가중치가 높은 순으로 정렬하고, 본 적 없는 영화 중 상위 10개를 추천
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=lambda t: t[1], reverse=True):
    if itemID not in watched:  # 이미 평가한 영화는 제외
        movieID = trainSet.to_raw_iid(itemID)  # 내부 ID를 원래 영화 ID로 변환
        print(movieID, getMovieName(movie_ratings, int(movieID)), ratingSum)
        pos += 1
        if pos > 10:
            break


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
3136 ['James Dean Story, The (1957)' 'Documentary'] 50.0
2607 ['Get Real (1998)' 'Drama|Romance'] 49.93620560356324
259 ['Kiss of Death (1995)' 'Crime|Drama|Thriller'] 49.83440938875971
998 ['Set It Off (1996)' 'Action|Crime'] 49.818661078285594
1935 ['How Green Was My Valley (1941)' 'Drama|Musical|Romance'] 49.71992040532635
1440 ['Amos & Andrew (1993)' 'Comedy'] 49.69868472501265
269 ['My Crazy Life (Mi vida loca) (1993)' 'Drama'] 49.691452437730206
988 ['Grace of My Heart (1996)' 'Comedy|Drama'] 49.631279483226095
2068 ['Fanny and Alexander (Fanny och Alexander) (1982)'
 'Drama|Fantasy|Mystery'] 49.628498357279526
896 ['Wild Reeds (Les roseaux sauvages) (1994)' 'Drama'] 49.58113202209386
2626 ['Edge of Seventeen (1998)' 'Comedy|Drama|Romance'] 49.565143823458385
