# Movie 추천

# Surprise 모듈

```
pip install scikit-surprise 
```

TMDB 데이터 이용

In [2]:
import pandas as pd 
from surprise import SVD, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

- SVD 말그대로 SVD를 통한 행렬 분해를 시키는 함수 -> 객체는 predict, fit, test

- Dataset은 surprise 모듈에서 사용가능하도록 데이터를 불러오는 함수

- accuracy는 정확도를 구하는 함수

- Reader는 데이터를 읽을 때 어떻게 읽을지를 명시하는 함수

In [8]:
ratings = pd.read_csv('./data/ratings.csv')
print(ratings.head())

reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df=ratings[['userId', 'movieId', 'rating']], reader=reader)

print(data)

userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182
3       1     1129     2.0  1260759185
4       1     1172     4.0  1260759205
<surprise.dataset.DatasetAutoFolds object at 0x000001D8B822F5C8>


In [13]:
train, test = train_test_split(data, test_size=0.2, shuffle=True)

In [14]:
# 분해한 행렬을 연산했을 때 trainset 데이터 (평점) 에 가깝도록 행렬을 분해
algo = SVD(n_factors=50, n_epochs=20)

algo.fit(trainset=train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d8beec3f08>

In [16]:
pred = algo.test(testset=test)

print(pred[:3])

[Prediction(uid=175, iid=4700, r_ui=4.0, est=3.657637245360091, details={'was_impossible': False}), Prediction(uid=435, iid=1230, r_ui=3.0, est=4.533953280519794, details={'was_impossible': False}), Prediction(uid=405, iid=1674, r_ui=4.5, est=4.236288266959995, details={'was_impossible': False})]


In [18]:
# 분해된 행렬을 연산을 해서 예측값을 도출해냄
user = '10'
movie = '200'

pred = algo.predict(user, movie)
print(pred)

user: 10         item: 200        r_ui = None   est = 3.55   {'was_impossible': False}


In [24]:
# 10번 유저에 대해서 모든 영화에 부여할 평점을 예측하고,
# 그 평점을 높은 순으로 정렬
user = '10'
preds = []
for i in ratings['movieId'].unique():
    pred = algo.predict(user, i)
    preds.append((pred.est, pred.iid))

preds.sort(reverse=True)
print(preds[:10])

[(4.48519329799742, 318), (4.466696153737801, 858), (4.4220786102041165, 969), (4.406235416106388, 1221), (4.38875940663805, 1228), (4.367694275997463, 2019), (4.364049040410524, 898), (4.363049942332039, 904), (4.363018554526314, 1203), (4.354704821077713, 501)]


## 찾은 영화 아이디(movieId)를 이용해서 그 영화가 무엇인지 확인하기

In [25]:
movies = pd.read_csv('./data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [27]:
print(movies[movies['movieId'] == 858])

movieId                  title       genres
695      858  Godfather, The (1972)  Crime|Drama
