#### 1) 데이터 준비와 전처리

MovieLens 데이터는 ratings.dat 안에 이미 인덱싱까지 완료된 사용자-영화-평점 데이터가 깔끔하게 정리되어 있습니다.

In [1]:
import pandas as pd
import os
from pathlib import Path

# Load the MovieLens-1M ratings from ./data/ml-1m under the current working
# directory.
# (Course-environment alternative:
#  os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat')
rating_file_path = Path.cwd() / 'data' / 'ml-1m' / 'ratings.dat'

# Column names are supplied explicitly, and fields are split on '::'.
# engine='python' is requested because of the multi-character separator.
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',
    encoding="ISO-8859-1",
)
orginal_data_size = len(ratings)  # row count before any filtering
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
# Select the rows rated 3 or higher and report how much data that keeps.
# NOTE(review): the filtered frame is only used for this size report; the
# cells visible below keep working on the unfiltered `ratings` -- confirm
# whether the filter was meant to be applied to `ratings` itself.
ratings_over3 = ratings.loc[ratings['rating'].ge(3)]
filtered_data_size = len(ratings_over3)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# Rename the 'rating' column to 'counts'.
ratings = ratings.rename(columns={'rating': 'counts'})

In [4]:
# Drop rows containing missing values (the recorded output below still shows
# 1000209 rows, i.e. nothing was removed for this data).
ratings = ratings.dropna()

In [5]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000204    1
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 1000209, dtype: int64

In [6]:
# Read the movie metadata so that movie titles can be looked up.
# (Course-environment alternative:
#  os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat')
movie_file_path = Path.cwd() / 'data' / 'ml-1m' / 'movies.dat'

# Same '::'-separated layout as the ratings file; column names are supplied
# explicitly.
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


여기까지가 전처리입니다. 이후에는 이전 스텝에 소개했던 것과 동일한 방식으로 MF model을 구성하여 내가 좋아할 만한 영화를 추천해 볼 수 있습니다.

In [7]:
movies['title'] = movies['title'].str.lower() # Lower-case the movie titles to make searching by title easier.
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),Animation|Children's|Comedy
1,2,jumanji (1995),Adventure|Children's|Fantasy
2,3,grumpier old men (1995),Comedy|Romance
3,4,waiting to exhale (1995),Comedy|Drama
4,5,father of the bride part ii (1995),Comedy


#### 2) 분석해 봅시다.  

ratings에 있는 유니크한 영화 개수

In [8]:
ratings['movie_id'].nunique()

3706

In [9]:
ratings['movie_id']

0          1193
1           661
2           914
3          3408
4          2355
           ... 
1000204    1091
1000205    1094
1000206     562
1000207    1096
1000208    1097
Name: movie_id, Length: 1000209, dtype: int64

ratings에 있는 유니크한 사용자 수

In [10]:
ratings['user_id'].nunique()

6040

In [11]:
ratings.columns

Index(['user_id', 'movie_id', 'counts', 'timestamp'], dtype='object')

In [14]:
# Remove fully duplicated rows, keeping the first occurrence (pandas defaults).
movies = movies.drop_duplicates()

In [15]:
movies['movie_id']

0          1
1          2
2          3
3          4
4          5
        ... 
3878    3948
3879    3949
3880    3950
3881    3951
3882    3952
Name: movie_id, Length: 3883, dtype: int64

In [16]:
ratings.sort_values('movie_id')

Unnamed: 0,user_id,movie_id,counts,timestamp
427702,2599,1,4,973796689
1966,18,1,4,978154768
683688,4089,1,5,965428947
596207,3626,1,4,966594018
465902,2873,1,5,972784317
...,...,...,...,...
84701,551,3952,4,976067330
253845,1544,3952,4,974742620
180689,1130,3952,3,975593522
35180,238,3952,4,976760112


가장 인기 있는 영화 30개(인기순)

In [17]:
# Count the rating rows per movie, then show the 30 movies with the most ratings.
movies_count = ratings.groupby('movie_id')['user_id'].count()
movies_count.sort_values(ascending=False).head(30)

movie_id
2858    3428
260     2991
1196    2990
1210    2883
480     2672
2028    2653
589     2649
2571    2590
1270    2583
593     2578
1580    2538
1198    2514
608     2513
2762    2459
110     2443
2396    2369
1197    2318
527     2304
1617    2288
1265    2278
1097    2269
2628    2250
2997    2241
318     2227
858     2223
356     2194
2716    2181
296     2171
1240    2098
1       2077
Name: user_id, dtype: int64

#### 3) 내가 선호하는 영화를 5가지 골라서 ratings에 추가해 줍시다.

In [18]:
# Lookup table from (lower-cased) movie title to its movie_id.
movie_title_to_id = {
    title: movie_id
    for title, movie_id in zip(movies['title'], movies['movie_id'])
}

In [19]:
movie_title_to_id['toy story (1995)']

1

In [21]:
# Five favourite movies, written as they appear in movies.dat. They are
# lower-cased before the lookup because the keys of movie_title_to_id were
# built from lower-cased titles.
favorite_titles = [
    'Sabrina (1954)',
    'Waiting to Exhale (1995)',
    'cutthroat island (1995)',
    'Jumanji (1995)',
    'Toy Story (1995)',
]
my_favorite = [movie_title_to_id[title.lower()] for title in favorite_titles]

# One row per favourite movie for the new user 'zimin', each with counts=4.
my_playlist = pd.DataFrame({
    'user_id': ['zimin'] * len(my_favorite),
    'movie_id': my_favorite,
    'counts': [4] * len(my_favorite),
})

# Only add the rows when 'zimin' is not in the data yet, so re-running the
# cell does not duplicate them.
if not ratings['user_id'].isin(['zimin']).any():
    # DataFrame.append was removed in pandas 2.0; pd.concat takes a *list*
    # of frames. The index is deliberately not reset here, which keeps the
    # appended rows labelled 0..4 exactly as DataFrame.append did.
    ratings = pd.concat([ratings, my_playlist])

ratings.tail(20)       # Check that the rows were added.


Unnamed: 0,user_id,movie_id,counts,timestamp
1000194,6040,1077,5,964828799.0
1000195,6040,1079,2,956715648.0
1000196,6040,549,4,956704746.0
1000197,6040,2020,3,956715288.0
1000198,6040,2021,3,956716374.0
1000199,6040,2022,5,956716207.0
1000200,6040,2028,5,956704519.0
1000201,6040,1080,4,957717322.0
1000202,6040,1089,4,956704996.0
1000203,6040,1090,3,956715518.0


In [22]:
user_unique = ratings['user_id'].unique()
user_unique

array([1, 2, 3, ..., 6039, 6040, 'zimin'], dtype=object)

In [23]:
# Collect the distinct user ids and movie ids present in ratings.
user_unique = ratings['user_id'].unique()
movie_id_unique = ratings['movie_id'].unique()

# Assign every user / movie id its position in the unique array as a dense
# index ("idx" is short for index).
user_to_idx = {user: idx for idx, user in enumerate(user_unique)}
movie_id_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_id_unique)}

In [24]:
movie_id_unique

array([1193,  661,  914, ..., 2845, 3607, 2909])

In [26]:
user_to_idx[1]

0

In [27]:
# Check that the indexing worked.
print(user_to_idx['zimin'])    # Expected to be 6040: 'zimin' was added after the 6040 existing users.
# print(movie_id_to_idx[1])
print(movie_title_to_id['sabrina (1954)'])
print(movie_id_to_idx[movie_title_to_id['sabrina (1954)']])

6040
915
645


In [28]:
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
1000204,6040,1091,1,956716541.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,zimin,915,4,
1,zimin,4,4,
2,zimin,15,4,
3,zimin,2,4,
4,zimin,1,4,


In [29]:
# Replace the raw ids stored in `ratings` with the dense indices built above.
# For dict.get see https://wikidocs.net/16.

# Map every user_id through user_to_idx.get. An id missing from the mapping
# becomes a missing value, which dropna() removes, so a shorter result means
# at least one row failed to map.
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # every row was indexed
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # overwrite the column with the indices
else:
    print('user_id column indexing Fail!!')

# Same procedure for the movie_id column, using movie_id_to_idx.
temp_movie_id_data = ratings['movie_id'].map(movie_id_to_idx.get).dropna()
if len(temp_movie_id_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_id_data
else:
    print('movie_id column indexing Fail!!')

# The row index is intentionally left untouched in this cell: reset_index()
# returns a new frame, so calling it without assigning the result (as this
# cell used to do) has no effect.
ratings

user_id column indexing OK!!
movie_id column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp
0,0,0,5,978300760.0
1,0,1,3,978302109.0
2,0,2,3,978301968.0
3,0,3,4,978300275.0
4,0,4,5,978824291.0
...,...,...,...,...
0,6040,645,4,
1,6040,450,4,
2,6040,1850,4,
3,6040,573,4,


#### 4) CSR matrix를 직접 만들어 봅시다.

In [30]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

In [31]:
# Matrix dimensions: the number of distinct user / movie indices.
# NOTE(review): this equals (max index + 1) only when the indices are the
# dense 0..n-1 values produced by the enumerate-based mapping -- confirm both
# indexing steps printed OK before relying on it.
num_user  = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

In [32]:
# Renumber the rows; the previous row labels are kept as a new 'index' column
# (visible in the output of the next cell).
ratings = ratings.reset_index()

In [33]:
ratings

Unnamed: 0,index,user_id,movie_id,counts,timestamp
0,0,0,0,5,978300760.0
1,1,0,1,3,978302109.0
2,2,0,2,3,978301968.0
3,3,0,3,4,978300275.0
4,4,0,4,5,978824291.0
...,...,...,...,...,...
1000209,0,6040,645,4,
1000210,1,6040,450,4,
1000211,2,6040,1850,4,
1000212,3,6040,573,4,


In [34]:
# Remove the 'index' column that reset_index() added.
del ratings['index']

In [35]:
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,0,0,5,978300760.0
1,0,1,3,978302109.0
2,0,2,3,978301968.0
3,0,3,4,978300275.0
4,0,4,5,978824291.0
...,...,...,...,...
1000209,6040,645,4,
1000210,6040,450,4,
1000211,6040,1850,4,
1000212,6040,573,4,


In [36]:
ratings.user_id

0             0
1             0
2             0
3             0
4             0
           ... 
1000209    6040
1000210    6040
1000211    6040
1000212    6040
1000213    6040
Name: user_id, Length: 1000214, dtype: int64

In [37]:
num_user

6041

In [38]:
ratings.movie_id

0             0
1             1
2             2
3             3
4             4
           ... 
1000209     645
1000210     450
1000211    1850
1000212     573
1000213      40
Name: movie_id, Length: 1000214, dtype: int64

In [39]:
num_movie

3706

In [40]:
ratings['movie_id'].nunique()

3706

In [41]:
ratings.user_id

0             0
1             0
2             0
3             0
4             0
           ... 
1000209    6040
1000210    6040
1000211    6040
1000212    6040
1000213    6040
Name: user_id, Length: 1000214, dtype: int64

In [42]:
ratings.counts

0          5
1          3
2          3
3          4
4          5
          ..
1000209    4
1000210    4
1000211    4
1000212    4
1000213    4
Name: counts, Length: 1000214, dtype: int64

In [43]:
# Build the user x movie sparse matrix: each row of `ratings` contributes
# its `counts` value at position (user_id, movie_id).
csr_data = csr_matrix(
    (ratings['counts'], (ratings['user_id'], ratings['movie_id'])),
    shape=(num_user, num_movie),
)
csr_data

<6041x3706 sparse matrix of type '<class 'numpy.int64'>'
	with 1000214 stored elements in Compressed Sparse Row format>

#### 5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

In [44]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Settings recommended by the implicit library; unrelated to the learning content.
# NOTE(review): pandas/numpy are imported before these variables are set --
# confirm the thread limits still take effect when set this late.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [45]:
# Declare the implicit AlternatingLeastSquares model
# (100 latent factors, 15 training iterations, CPU only).
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [46]:
# Per the original author's note, the ALS model takes an item x user matrix
# as input, so the user x item matrix is transposed here.
# NOTE(review): the expected orientation depends on the installed implicit
# version -- confirm against its documentation.
csr_data_transpose = csr_data.T
csr_data_transpose

<3706x6041 sparse matrix of type '<class 'numpy.int64'>'
	with 1000214 stored elements in Compressed Sparse Column format>

In [47]:
# Train the model on the transposed (item x user) matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [48]:
# Dense indices of the user 'zimin' and the movie 'sabrina (1954)'.
zimin = user_to_idx['zimin']
sabrina = movie_id_to_idx[movie_title_to_id['sabrina (1954)']]

# Their learned latent vectors from the trained model.
zimin_vector = als_model.user_factors[zimin]
sabrina_vector = als_model.item_factors[sabrina]

In [51]:
als_model.user_factors[zimin]

array([-0.415579  ,  0.09914095, -0.07285866,  0.2159752 , -0.8448624 ,
        0.04693242, -0.46531796, -0.02885746,  0.27903765, -0.2209445 ,
        0.24412142, -0.24816915,  0.24534489,  0.1153121 ,  0.22311722,
        0.25533685,  0.6214079 , -0.6064442 ,  0.19093452,  0.25429815,
        0.05938752, -0.17672414, -0.00663657,  0.10558166, -0.19026534,
       -0.6020131 , -0.11732005,  0.20597613,  0.3339805 , -0.17033839,
       -0.10380894,  0.17454806,  0.18547224,  0.48091567,  0.38145623,
       -0.01669823,  0.24340007, -0.29088518, -0.40843174, -0.100341  ,
       -0.61481154,  0.49914074, -0.23938826,  0.189193  ,  0.127922  ,
       -0.29909292,  0.3101445 , -0.07295581, -0.44404623,  0.648082  ,
        0.03869737,  0.17993745, -0.27172282,  0.01816912,  0.1628603 ,
       -0.1398629 ,  0.7975456 , -0.47294417,  0.6445618 ,  0.13334069,
       -0.11095844,  0.4980758 , -0.35643283,  0.3977574 , -0.40405074,
        0.13124399, -0.05187433,  0.29531762,  0.3556215 ,  0.23

In [53]:
zimin_vector

array([-0.415579  ,  0.09914095, -0.07285866,  0.2159752 , -0.8448624 ,
        0.04693242, -0.46531796, -0.02885746,  0.27903765, -0.2209445 ,
        0.24412142, -0.24816915,  0.24534489,  0.1153121 ,  0.22311722,
        0.25533685,  0.6214079 , -0.6064442 ,  0.19093452,  0.25429815,
        0.05938752, -0.17672414, -0.00663657,  0.10558166, -0.19026534,
       -0.6020131 , -0.11732005,  0.20597613,  0.3339805 , -0.17033839,
       -0.10380894,  0.17454806,  0.18547224,  0.48091567,  0.38145623,
       -0.01669823,  0.24340007, -0.29088518, -0.40843174, -0.100341  ,
       -0.61481154,  0.49914074, -0.23938826,  0.189193  ,  0.127922  ,
       -0.29909292,  0.3101445 , -0.07295581, -0.44404623,  0.648082  ,
        0.03869737,  0.17993745, -0.27172282,  0.01816912,  0.1628603 ,
       -0.1398629 ,  0.7975456 , -0.47294417,  0.6445618 ,  0.13334069,
       -0.11095844,  0.4980758 , -0.35643283,  0.3977574 , -0.40405074,
        0.13124399, -0.05187433,  0.29531762,  0.3556215 ,  0.23

In [54]:
sabrina_vector

array([ 6.39231689e-03,  9.36268084e-03,  9.55918524e-03,  3.13794496e-03,
       -4.07870300e-03, -3.70096695e-03, -2.46114731e-02,  6.61582965e-03,
        4.19848179e-03, -2.01949179e-02,  1.46958753e-02, -5.45873959e-03,
        5.10229787e-04,  1.52479811e-02,  7.42600719e-03,  4.99919523e-03,
        3.19432118e-03, -1.01847472e-02,  2.70094224e-05,  1.17001114e-02,
        6.48251362e-03, -2.14509778e-02,  3.49606969e-03,  2.06191987e-02,
       -3.25437961e-03,  1.88744012e-02, -7.65702594e-03,  1.73578144e-03,
        2.60403496e-03, -8.72497540e-03,  8.29102192e-03,  4.62244963e-03,
        3.86796985e-03,  4.30863304e-03,  2.97019240e-02,  3.84082529e-03,
        1.07453447e-02,  2.13359166e-02,  5.96482074e-03,  1.04520358e-02,
       -2.02097953e-03,  2.25033369e-02,  5.97243896e-03,  5.39185712e-03,
        1.31504303e-02, -5.91707462e-03,  7.70954462e-03, -1.41103221e-02,
       -7.30432943e-03,  1.08669447e-02, -6.09872513e-04, -2.94989208e-03,
        2.12020278e-02, -

In [55]:
# Dot product of zimin's user vector and sabrina's item vector.
np.dot(zimin_vector, sabrina_vector)

0.14354412

#### 6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

선호하는 5가지 영화 중 하나. 훈련된 모델이 예측한 나의 선호도

In [58]:
# Predicted preference of 'zimin' for one of the five favourites added above.
zimin = user_to_idx['zimin']
toy_story = movie_id_to_idx[movie_title_to_id['toy story (1995)']]
zimin_vector = als_model.user_factors[zimin]
toy_story_vector = als_model.item_factors[toy_story]
np.dot(zimin_vector, toy_story_vector)


0.38123405

 그 외의 영화 하나. 훈련된 모델이 예측한 나의 선호도

In [59]:
# Predicted preference of 'zimin' for a movie that is not among the favourites.
zimin = user_to_idx['zimin']
father_of_the_bride = movie_id_to_idx[movie_title_to_id['father of the bride part ii (1995)']]
zimin_vector = als_model.user_factors[zimin]
father_of_the_bride_vector = als_model.item_factors[father_of_the_bride]
np.dot(zimin_vector, father_of_the_bride_vector)

0.0029235017

#### 7) 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

In [60]:
# Dense matrix index of the query movie. It has to be defined here: no other
# cell in this notebook assigns `cutthroat_island`, so the call below would
# raise NameError on a clean top-to-bottom run.
cutthroat_island = movie_id_to_idx[movie_title_to_id['cutthroat island (1995)']]

# Ask the trained model for the 15 items most similar to that movie.
similar_movies = als_model.similar_items(cutthroat_island, N=15)
similar_movies

[(1850, 1.0),
 (1074, 0.77192384),
 (1044, 0.7331026),
 (1870, 0.7309706),
 (1829, 0.7261599),
 (2811, 0.7242216),
 (2634, 0.7220191),
 (3352, 0.7163322),
 (1863, 0.7153462),
 (2650, 0.7122095),
 (3186, 0.7108939),
 (2643, 0.70742434),
 (1350, 0.70126784),
 (2945, 0.69717515),
 (3060, 0.69674367)]

In [61]:
# Invert both lookup tables so that a matrix index can be turned back into a
# movie title: index -> movie_id -> title.
idx_to_movie_id = {idx: movie_id for movie_id, idx in movie_id_to_idx.items()}
id_to_movie_title = {movie_id: title for title, movie_id in movie_title_to_id.items()}

# Each entry of similar_movies starts with the movie's matrix index.
[id_to_movie_title[idx_to_movie_id[hit[0]]] for hit in similar_movies]


['cutthroat island (1995)',
 'kull the conqueror (1997)',
 'avengers, the (1998)',
 'shadow, the (1994)',
 'chain reaction (1996)',
 'tarzan and the lost city (1998)',
 'fled (1996)',
 'in the line of duty 2 (1987)',
 'radioland murders (1994)',
 'black dog (1998)',
 'knightriders (1981)',
 'firestorm (1998)',
 'substitute, the (1996)',
 'firewalker (1986)',
 'hush (1998)']

#### 8) 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.

In [62]:
user = user_to_idx['zimin']
# recommend() is given the user x item CSR matrix here (not the transposed one).
# NOTE(review): this call form and the list of (index, score) pairs recorded
# below may be specific to the installed implicit version -- confirm before
# upgrading the library.
movies_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movies_recommended

[(50, 0.35083348),
 (4, 0.24842732),
 (514, 0.23030873),
 (82, 0.17934075),
 (852, 0.1778692),
 (381, 0.17560995),
 (741, 0.16094482),
 (187, 0.15413883),
 (606, 0.15281506),
 (648, 0.15055445),
 (656, 0.14401403),
 (116, 0.14001471),
 (642, 0.13986404),
 (33, 0.13952255),
 (723, 0.13489154),
 (939, 0.12676008),
 (19, 0.124416724),
 (568, 0.12364999),
 (1066, 0.12136168),
 (808, 0.11824116)]

In [63]:
# Translate each recommended matrix index back into its movie title.
[id_to_movie_title[idx_to_movie_id[i[0]]] for i in movies_recommended]

['toy story 2 (1999)',
 "bug's life, a (1998)",
 'mask, the (1994)',
 "breakfast at tiffany's (1961)",
 'indian in the cupboard, the (1995)',
 'babe (1995)',
 'willy wonka and the chocolate factory (1971)',
 'dragonheart (1996)',
 'santa clause, the (1994)',
 'roman holiday (1953)',
 'hook (1991)',
 'groundhog day (1993)',
 'beetlejuice (1988)',
 'aladdin (1992)',
 'philadelphia story, the (1940)',
 'iron giant, the (1999)',
 'big (1988)',
 'teenage mutant ninja turtles (1990)',
 'toys (1992)',
 'nutty professor, the (1996)']

#### 후기  
집에 있는 파이썬이 오동작을 일으켜서 시간이 많이 걸렸다. 다 밀어버릴거다.  
데이터에 있는 인덱스를 사용하려고 해봤는데 오류가 떴다. 인덱스에 인덱스를 매겨서 돌려야 했다.
