# Ex13. 아이유팬이 좋아할 만한 다른 아티스트 찾기

In [1]:
import numpy as np
import scipy
import implicit
import pandas as pd

# Report the version of each numerical library in use, one per line.
for library in (np, scipy, implicit):
    print(library.__version__)

1.21.4
1.7.1
0.4.8


## Data Preprocessing

In [2]:
import os
# Load the ml-1m ratings file; fields are separated by '::' and the file has
# no header row, so the column names are supplied explicitly.
# The path is built with expanduser('~') rather than os.getenv('HOME') + ...,
# because getenv returns None when HOME is unset and `None + str` raises TypeError.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'ratings.dat'
)
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
# A multi-character separator needs the python parsing engine.
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',
    encoding='ISO-8859-1',
)
# Row count before any filtering (the spelling is kept because a later cell reads this name).
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Keep only the rows whose rating is 3 or higher.
keep_mask = ratings['ratings'] >= 3
ratings = ratings[keep_mask]
filtered_data_size = len(ratings)

# Report how much of the data survived the filter.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# Rename the 'ratings' column to 'count'.
ratings = ratings.rename(columns={'ratings': 'count'})

In [5]:
# Display the renamed 'count' column (only ratings of 3 or more remain after the filter).
ratings['count']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: count, Length: 836478, dtype: int64

In [6]:
# Preview the first rows to confirm the column is now named 'count'.
ratings.head()

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
# Keep only the user, movie and rating-value columns (the timestamp is dropped).
kept_columns = ['user_id', 'movie_id', 'count']
ratings = ratings[kept_columns]
ratings.head()

Unnamed: 0,user_id,movie_id,count
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [8]:
# Count the distinct movies and users present in the filtered ratings.
n_unique_movies = ratings['movie_id'].nunique()
n_unique_users = ratings['user_id'].nunique()
print('rating에 있는 유니크한 영화 수: ', n_unique_movies)
print('rating에 있는 유니크한 사용자 수: ', n_unique_users)

rating에 있는 유니크한 영화 수:  3628
rating에 있는 유니크한 사용자 수:  6039


In [9]:
# Load the movie metadata so that movie ids can be shown as titles.
# The path is built with expanduser('~') rather than os.getenv('HOME') + ...,
# because getenv returns None when HOME is unset and `None + str` raises TypeError.
movie_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'movies.dat'
)
cols = ['movie_id', 'title', 'genre']
# A multi-character separator ('::') needs the python parsing engine.
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# The 30 most popular movies, measured by how many rating rows each title has.
movie_data = ratings.merge(movies)
ratings_by_title = movie_data.groupby('title')['count']
movie_count = ratings_by_title.count()
movie_count.sort_values(ascending=False).head(30)

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

### Adding Ratings

In [11]:
# Add five favourite movies as ratings from a new user (id 6041), each scored 5.
my_favorite = [
    'Terminator, The (1984)',
    'Back to the Future (1985)',
    'Matrix, The (1999)',
    'Men in Black (1997)',
    'Jurassic Park (1993)',
]
favorite_movie_id = movies[movies['title'].isin(my_favorite)]
# Scalars are broadcast over the matched rows, so the frame is still built
# correctly when fewer than five titles are found (the fixed `[..]*5` lists
# would not match the Series length in that case).
my_movie = pd.DataFrame({
    'user_id': 6041,
    'movie_id': favorite_movie_id['movie_id'],
    'count': 5,
})

# Only append once, so re-running the cell does not duplicate the rows.
if not (ratings['user_id'] == 6041).any():
    # DataFrame.append is deprecated and removed in pandas 2.0; pd.concat is
    # the supported equivalent and keeps the original row labels.
    ratings = pd.concat([ratings, my_movie])
ratings.tail(10)

Unnamed: 0,user_id,movie_id,count
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4
476,6041,480,5
1220,6041,1240,5
1250,6041,1270,5
1539,6041,1580,5
2502,6041,2571,5


In [12]:
# Attach title and genre to every rating row by joining on the movie id.
movie_data = ratings.merge(movies, on='movie_id')
movie_data

Unnamed: 0,user_id,movie_id,count,title,genre
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...
836478,5851,3607,5,One Little Indian (1973),Comedy|Drama|Western
836479,5854,3026,4,Slaughterhouse (1987),Horror
836480,5854,690,3,"Promise, The (Versprechen, Das) (1994)",Romance
836481,5938,2909,4,"Five Wives, Three Secretaries and Me (1998)",Documentary


In [13]:
# Number of rated titles per user, followed by summary statistics of those counts.
titles_by_user = movie_data.groupby('user_id')['title']
user_count = titles_by_user.count()
user_count.describe()

count    6040.000000
mean      138.490563
std       156.238108
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: title, dtype: float64

### Making CSR Matrix

In [14]:
# Smallest and largest user / movie ids in the merged data.
# Series.min()/max() are vectorised, unlike the builtins which iterate every
# row in Python; int() keeps the results as plain Python ints as before.
min_u = int(movie_data['user_id'].min())
max_u = int(movie_data['user_id'].max())
min_m = int(movie_data['movie_id'].min())
max_m = int(movie_data['movie_id'].max())
print(min_u,',',max_u)
print(min_m,',',max_m)

1 , 6041
1 , 3952


In [15]:
from scipy.sparse import csr_matrix
# Build the user x movie sparse matrix. Raw ids are used directly as row and
# column indices, hence the "+ 1" on each dimension.
# NOTE(review): the bounds come from movie_data while the entries come from ratings - confirm they always agree.
row_idx = ratings['user_id']
col_idx = ratings['movie_id']
matrix_shape = (max_u + 1, max_m + 1)
csr_data = csr_matrix((ratings['count'], (row_idx, col_idx)), shape=matrix_shape)
csr_data

<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### Design A Model and Training

In [16]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Settings recommended by the implicit library; unrelated to the learning content itself.
# NOTE(review): numpy/implicit are already imported when this runs - confirm the
# thread-count variables still take effect at this point.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [17]:
# Configure the ALS model: 100 latent factors, 15 iterations, CPU only.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

In [18]:
# The ALS model takes an (item x user) matrix as input, so transpose the
# (user x item) matrix before fitting.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [19]:
# Train the ALS model on csr_data_transpose, the (item x user) form of the ratings matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

### Finding Similar Movies with My Favorites

In [20]:
# All rating rows whose title contains "Terminator".
is_terminator = movie_data['title'].str.contains("Terminator")
df = movie_data[is_terminator]
df

Unnamed: 0,user_id,movie_id,count,title,genre
89416,2,589,4,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
89417,7,589,5,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
89418,8,589,5,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
89419,10,589,4,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
89420,13,589,5,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
...,...,...,...,...,...
193098,6035,1240,3,"Terminator, The (1984)",Action|Sci-Fi|Thriller
193099,6036,1240,5,"Terminator, The (1984)",Action|Sci-Fi|Thriller
193100,6037,1240,4,"Terminator, The (1984)",Action|Sci-Fi|Thriller
193101,6040,1240,3,"Terminator, The (1984)",Action|Sci-Fi|Thriller


In [21]:
# Learned latent vectors for user 6041 and for movie 1240
# (movie_id 1240 is 'Terminator, The (1984)' in the movie metadata).
my_vector = als_model.user_factors[6041]
terminator_vector = als_model.item_factors[1240]

In [22]:
# Dot product of the user vector and the movie vector; it is printed under the
# label '유사도' ("similarity").
affinity = np.dot(my_vector, terminator_vector)
print('my_vector: ', my_vector)
print('terminator: ', terminator_vector)
print('유사도: ', affinity)

my_vector:  [-1.84164613e-01  3.91451299e-01 -4.01384801e-01  4.84457836e-02
 -1.40619826e+00 -7.14237273e-01  1.88788697e-01  1.53337017e-01
  3.55165094e-01  4.40868200e-04 -8.13089192e-01  3.01763028e-01
  8.63341331e-01  4.70613241e-01  1.00413866e-01 -1.23790443e+00
  7.46914625e-01 -2.77576059e-01  1.16243911e+00 -6.27220154e-01
  6.18505001e-01 -1.56877577e-01  1.40834779e-01 -9.13428485e-01
 -8.56731176e-01 -2.72471309e-01 -4.03173000e-01 -5.35651088e-01
  2.34724611e-01 -3.08653653e-01  6.26013637e-01  4.37462889e-02
 -8.86042476e-01 -4.38125432e-01 -6.38849914e-01 -2.25898683e-01
 -4.08023506e-01  2.21326604e-01  2.42539883e-01 -8.48560452e-01
 -7.18044210e-03  3.17291200e-01 -2.67096072e-01  1.70913115e-01
  4.83450741e-01  1.11075807e+00 -2.69407183e-01 -4.41315264e-01
  4.20210302e-01  3.97920161e-01  5.54737389e-01  3.33415240e-01
 -6.98359832e-02  2.74872839e-01  9.52430546e-01 -1.01332903e-01
 -5.89526057e-01  6.07987761e-01 -2.36201569e-01  1.26168919e+00
 -6.87598765e

### Movies That I Seem to Like

In [23]:
# Find the 15 items most similar to one favourite movie.
favorite_movie = 'Terminator, The (1984)'
# Look the id up through the variable so that changing `favorite_movie` is enough
# (previously the title was repeated as a literal and the variable went unused).
movie_id = movies[movies['title'] == favorite_movie]['movie_id']
# values[0] takes the first matching id from the lookup result.
similar_movie = als_model.similar_items(movie_id.values[0], N=15)
similar_movie

[(1240, 1.0000001),
 (1200, 0.7831338),
 (3527, 0.6868445),
 (1036, 0.6806122),
 (1214, 0.6790221),
 (589, 0.6359807),
 (2571, 0.5678695),
 (2916, 0.54568106),
 (541, 0.54251134),
 (3703, 0.52071834),
 (1196, 0.51920503),
 (2985, 0.5079465),
 (1291, 0.48079813),
 (3702, 0.45848703),
 (260, 0.45093262)]

In [24]:
# Metadata rows for the similar movies; the first element of each result entry is the movie id.
# The filter keeps the row order of `movies`, not the similarity ranking.
similar_ids = [entry[0] for entry in similar_movie]
is_similar = movies['movie_id'].isin(similar_ids)
movies[is_similar]

Unnamed: 0,movie_id,title,genre
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
537,541,Blade Runner (1982),Film-Noir|Sci-Fi
585,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
1023,1036,Die Hard (1988),Action|Thriller
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1182,1200,Aliens (1986),Action|Sci-Fi|Thriller|War
1196,1214,Alien (1979),Action|Horror|Sci-Fi|Thriller
1220,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
1271,1291,Indiana Jones and the Last Crusade (1989),Action|Adventure
2502,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller


### The Movies Recommended

In [25]:
# Ask the model for 20 recommendations for user 6041, excluding movies that
# user has already rated.
user = 6041
movie_recommended = als_model.recommend(
    user,
    csr_data,
    N=20,
    filter_already_liked_items=True,
)
movie_recommended

[(589, 0.7460135),
 (2916, 0.50789046),
 (260, 0.39014906),
 (110, 0.37571567),
 (1196, 0.35346812),
 (1573, 0.31376824),
 (1200, 0.3094172),
 (1214, 0.3087144),
 (1210, 0.30687362),
 (3175, 0.29853278),
 (457, 0.28937313),
 (1197, 0.28109008),
 (1527, 0.27736014),
 (780, 0.2720428),
 (1097, 0.26191786),
 (541, 0.25611365),
 (1544, 0.24620688),
 (1198, 0.2402896),
 (32, 0.2335844),
 (2529, 0.23324537)]

In [26]:
# Metadata rows for the recommended movies; the first element of each result entry is the movie id.
# The filter keeps the row order of `movies`, not the recommendation ranking.
recommended_ids = [entry[0] for entry in movie_recommended]
is_recommended = movies['movie_id'].isin(recommended_ids)
movies[is_recommended]

Unnamed: 0,movie_id,title,genre
31,32,Twelve Monkeys (1995),Drama|Sci-Fi
108,110,Braveheart (1995),Action|Drama|War
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
453,457,"Fugitive, The (1993)",Action|Thriller
537,541,Blade Runner (1982),Film-Noir|Sci-Fi
585,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
770,780,Independence Day (ID4) (1996),Action|Sci-Fi|War
1081,1097,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1179,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance


## 고객 중심의 협력적 정화방법(user-based collaborative filtering)의 선호도 계산

# $\hat{r}_{ui}=\frac{\sum_{v \in N_{i}(u)}r_{vi}}{|N_{i}(u)|}$ 
$N_{i}(u)$ : 영화 i에 대해 유저 u와 선호도가 유사한 유저들의 집합,
$r_{vi}$ : $N_{i}(u)$에 속하는 유저 v의 영화 i에 대한 선호도,  
$\hat{r}_{ui}$ : 영화 i에 대한 유저 u의 선호도

In [27]:
# Number of users who have a rating for "Terminator, The (1984)" (movie i).
# One vectorised comparison replaces the Python-level loop over every row;
# int() keeps the result a plain Python int as before.
cnt = int((movie_data['title'] == 'Terminator, The (1984)').sum())
cnt

2020

In [28]:
# Total of the ratings given to "Terminator, The (1984)" by the users who rated it.
# A boolean-mask selection and a vectorised sum replace the Python-level loop;
# int() keeps the result a plain Python int as before.
is_terminator = movie_data['title'] == 'Terminator, The (1984)'
score = int(movie_data.loc[is_terminator, 'count'].sum())
score

8576

In [29]:
# Mean rating for the movie: total score divided by the number of raters.
# The computed `score` and `cnt` are used instead of hard-coded copies of their
# values, so the result stays correct when the data changes.
print('Preference of the persons who like "Terminator, The (1984)":', score / cnt)

Preference of the persons who like "Terminator, The (1984)": 4.245544554455446


## 품목 중심의 협력적 정화방법(item-based collaborative filtering)의 선호도 계산

# $\hat{r}_{iu}=\frac{\sum_{j \in N_{u}(i)}r_{ju}}{|N_{u}(i)|}$ 
$N_{u}(i)$ : 고객 u의 품목 i에 대한 선호도가 유사한 다른 품목의 집합,
$r_{ju}$ : $N_{u}(i)$에 속하는 품목 j에 대한 고객 u의 선호도,  
$\hat{r}_{iu}$ : 품목 i에 대한 고객 u의 선호도

In [30]:
# Ratings given by the user with id 2022; any other user id can be substituted
# to explore. The rows are filtered once and reused for both numbers.
user_rows = movie_data[movie_data['user_id'] == 2022]
print(user_rows['count'].sum())  # total of the ratings this user gave to their movies (= j)
print(len(user_rows))  # number of movies this user rated

712
175


In [31]:
# Mean rating given by user 2022, computed from the data instead of from
# hard-coded copies of the two totals.
user_rows = movie_data[movie_data['user_id'] == 2022]
print('Preference of the movies which user(id=2022) like is:', int(user_rows['count'].sum()) / len(user_rows))

Preference of the movies which user(id=2022) like is: 4.0685714285714285


<회고>
* 데이터 전처리 시 데이터 프레임을 만들 때는 그 후의 사용을 위해 주의를 요함을 알았다.
* 서로 다른 데이터셋의 열들을 관계시키기 위해 merge 메소드가 필요함을 알았다.
* my_vector와 terminator_vector 사이의 유사도가 0.5 내외로 적당히 이루어졌고  
  그 결과 추천된 영화들은 비슷한 장르의 액션물, 스릴러물, SF물이었다.
* 빅데이터에서 나오는 추천 시스템 이론을 이용하여 협력적 정화방법에 의한 선호도 계산을  
  해보았다. 
* "Terminator, The (1984)" 영화에 대한 유저들의 선호도와  
  id가 2022인 특정 고객이 좋아하는 영화들의 선호도가 모두  
  5 초과로는 안 나오는 걸로 봐서 정상치인 듯하다.  