## 데이터 준비와 전처리

In [1]:
import os
import pandas as pd
import numpy as np
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix


# Resolve the ratings file under the current user's home directory.
# os.path.expanduser('~') still resolves when $HOME is unset, whereas
# os.getenv('HOME') + '...' raises TypeError in that case.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel/recommendata_iu/data/ml-1m/ratings.dat'
)
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
# Column names are supplied explicitly via `names`; the multi-character '::'
# separator is why the python parsing engine is requested.
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',
    encoding='ISO-8859-1',
)
# Row count before any filtering, kept for the retained-ratio report.
orginal_data_size = len(ratings)
ratings.head(10)

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [2]:
# Keep only the rows whose rating is 3 or higher.
keep_mask = ratings['ratings'] >= 3
ratings = ratings[keep_mask]
filtered_data_size = len(ratings)

# Report how much of the original data survived the filter.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# Rename the 'ratings' column to 'counts'.
ratings = ratings.rename(columns={'ratings': 'counts'})

In [4]:
# Keep only the three columns used from here on.
ratings = ratings.loc[:, ['user_id', 'movie_id', 'counts']]
ratings

Unnamed: 0,user_id,movie_id,counts
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [5]:
# Load the movie metadata so that titles can be shown for movie ids.
# os.path.expanduser('~') still resolves when $HOME is unset, whereas
# os.getenv('HOME') + '...' raises TypeError in that case.
movie_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel/recommendata_iu/data/ml-1m/movies.dat'
)
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 데이터 분석

In [6]:
# Number of unique movies present in ratings.
ratings['movie_id'].nunique()

3628

In [7]:
# Number of unique users present in ratings.
ratings['user_id'].nunique()

6039

In [8]:
# The 30 most popular movies, ordered by how many rating rows each title has.
# merge() without `on` joins on the columns the two frames share
# (presumably just movie_id here).
movie_data = ratings.merge(movies)
per_title = movie_data.groupby('title')
movie_count = per_title['counts'].count()
movie_count.sort_values(ascending=False).iloc[:30]

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

## 선호하는 영화를 골라서 ratings에 추가

In [9]:
# Hand-picked movie_ids, added to ratings as five-point rows for user id 6041.
my_favorite = [2008, 2016, 2013, 2010, 2014]
my_favorite_movies = pd.DataFrame({
    'user_id': [6041] * len(my_favorite),
    'movie_id': my_favorite,
    'counts': [5] * len(my_favorite),
})

# Guard so that re-running this cell does not add the same rows twice.
if not (ratings['user_id'] == 6041).any():
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and, like append, keeps each frame's own index labels.
    ratings = pd.concat([ratings, my_favorite_movies])

ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4
0,6041,2008,5
1,6041,2016,5
2,6041,2013,5
3,6041,2010,5
4,6041,2014,5


## CSR matrix 만들기

In [10]:
# Distinct user and movie ids currently present in ratings.
num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

In [11]:
# Build the sparse matrix in (data, (row, col)) form: rows are user ids and
# columns are movie ids. The raw ids are used directly as indices and no shape
# is passed, so the matrix dimensions are inferred from the largest ids.
csr_data = csr_matrix(
    (
        ratings['counts'].to_numpy(),
        (ratings['user_id'].to_numpy(), ratings['movie_id'].to_numpy()),
    )
)
csr_data

<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## als_model = AlternatingLeastSquares 모델 구성

In [12]:
# Settings recommended by the implicit library; unrelated to the modelling itself.
# NOTE(review): these are set after numpy/implicit were already imported at the
# top of the notebook - confirm they still take effect at this point.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [13]:
# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

In [14]:
# The ALS model is given an (item x user) matrix as input here, so the
# (user x item) matrix is transposed first.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [15]:
# Train the model on the transposed matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [16]:
# Pull one row out of each factor matrix of the trained model.
# NOTE(review): the names look swapped - `movie_vector` is taken from
# user_factors (index 6041) and `my_favorite_vector` from item_factors
# (index 2008). Kept as-is because later cells refer to these names.
movie_vector = als_model.user_factors[6041]
my_favorite_vector = als_model.item_factors[2008]

In [18]:
movie_vector

array([ 3.37716863e-02,  2.61264414e-01, -2.73952395e-01,  1.23338744e-01,
        2.50962377e-01, -2.35927686e-01,  2.00090423e-01,  7.53617659e-02,
       -4.16356981e-01, -4.57327396e-01,  5.61361194e-01, -4.39360321e-01,
        5.12849987e-01,  1.44983426e-01, -2.13726968e-01, -1.97979197e-01,
       -1.78530544e-01, -9.92362380e-01,  1.18893869e-01, -2.93677181e-01,
        4.72320080e-01,  9.36350346e-01,  1.07040547e-01,  2.28509307e-01,
        4.55254549e-03, -2.27900684e-01, -2.68549044e-02,  1.90464556e-01,
       -5.50203063e-02,  3.24142545e-01,  1.43561125e-01, -3.06461126e-01,
       -2.14945018e-01, -2.54977018e-01, -1.06103919e-01,  1.26707450e-01,
       -2.77957499e-01,  2.05013618e-01, -2.39986256e-01, -3.87127310e-01,
       -3.13668728e-01,  1.63824096e-01, -3.63538444e-01,  7.23175332e-02,
       -8.70858207e-02,  1.78091377e-01, -9.71752629e-02, -2.12971509e-01,
       -1.82012673e-02,  1.42404184e-01, -5.43593094e-02,  1.86905190e-02,
        6.27141744e-02,  

In [19]:
my_favorite_vector

array([ 2.3277488e-03,  3.1935235e-03,  2.9893229e-03,  5.9872158e-03,
        3.3087861e-03,  4.1799522e-03,  3.2672025e-03,  6.9321417e-03,
        2.8887419e-03,  1.5428233e-03,  1.1865296e-03,  9.2720281e-04,
        1.4255100e-03,  3.4862366e-03,  5.3158263e-04,  3.6278232e-03,
        2.1813417e-04, -6.2201777e-04,  8.1766935e-05,  6.8063667e-04,
        2.4055343e-03,  3.9189570e-03,  3.7747924e-03, -4.0868274e-04,
        4.9327607e-03,  3.1416665e-03,  1.2136921e-03,  3.8634492e-03,
       -2.2503866e-03,  3.6916931e-03,  2.9942701e-03,  3.0273541e-05,
        4.2926981e-03,  2.4201616e-03,  2.2503876e-03, -6.6465494e-05,
        4.8909318e-03,  5.1922668e-03,  4.2951941e-03,  3.9980081e-03,
       -3.1798547e-03, -2.7590713e-03, -6.8193396e-05,  6.1872597e-03,
        4.4645965e-03,  2.8178990e-03, -5.5998648e-06,  4.2444267e-03,
        1.4545746e-03,  3.1109792e-03,  1.9851462e-03,  2.6996247e-03,
        3.4983503e-03, -4.9276825e-04,  7.6908799e-04,  1.0412930e-03,
      

In [20]:
# Dot product of the user_factors row and the item_factors row taken earlier.
np.dot(movie_vector, my_favorite_vector)

0.0098220045

In [23]:
# Find movies similar to a given movie.
my_favorite_movie_id = 2010
# N=15 asks for 15 results; in the output shown, the query movie itself is
# the first entry.
similar_movie = als_model.similar_items(my_favorite_movie_id, N=15)
similar_movie

[(2010, 1.0000002),
 (1348, 0.65514374),
 (3503, 0.62403494),
 (3658, 0.60382265),
 (3340, 0.59451926),
 (2660, 0.5925284),
 (2661, 0.56625235),
 (1199, 0.56321794),
 (1260, 0.5557135),
 (1301, 0.5452981),
 (1924, 0.5436785),
 (2363, 0.5417392),
 (3659, 0.5358834),
 (2666, 0.53568625),
 (1306, 0.52338)]

In [42]:
# Lookup tables between movie_id and title, built from the movies table.
# NOTE: an earlier attempt derived idx_to_movie by inverting movie_to_idx and
# was reported to be missing keys, hence the direct build here.
# zip() pairs the two columns positionally, so this does not depend on the
# frame's index labels the way movies['movie_id'][i] does, and it avoids one
# label lookup per row.
idx_to_movie = dict(zip(movies['movie_id'], movies['title']))

# Reverse lookup (title -> movie_id). If two rows shared a title, the later
# row would win.
movie_to_idx = {title: movie_id for movie_id, title in idx_to_movie.items()}

# Titles for the similar-movie results; each entry is a (movie_id, score) pair.
[idx_to_movie[movie_id] for movie_id, _score in similar_movie]

['Metropolis (1926)',
 'Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922)',
 'Solaris (Solyaris) (1972)',
 'Quatermass and the Pit (1967)',
 'Bride of the Monster (1956)',
 'Thing From Another World, The (1951)',
 'It Came from Outer Space (1953)',
 'Brazil (1985)',
 'M (1931)',
 'Forbidden Planet (1956)',
 'Plan 9 from Outer Space (1958)',
 'Godzilla (Gojira) (1954)',
 'Quatermass II (1957)',
 'It Conquered the World (1956)',
 'Until the End of the World (Bis ans Ende der Welt) (1991)']

In [43]:
def get_similar_movie(movie_name: str, n: int = 10) -> list:
    """Return the titles of the movies most similar to *movie_name*.

    Args:
        movie_name: Exact title used as a key of ``movie_to_idx``,
            e.g. ``'Toy Story (1995)'``.
        n: Number of titles to request from the model. Defaults to 10, which
            is how many results the call returned before this parameter
            existed.

    Returns:
        A list of titles in the order ``als_model.similar_items`` returns
        them.

    Raises:
        KeyError: If *movie_name* is not a key of ``movie_to_idx``, or a
            returned id is not a key of ``idx_to_movie``.
    """
    movie_id = movie_to_idx[movie_name]
    # A distinct local name avoids shadowing the notebook-level `similar_movie`.
    neighbours = als_model.similar_items(movie_id, N=n)
    return [idx_to_movie[item[0]] for item in neighbours]

In [44]:
get_similar_movie('Toy Story (1995)')

['Toy Story (1995)',
 'Toy Story 2 (1999)',
 'Aladdin (1992)',
 'Groundhog Day (1993)',
 "Bug's Life, A (1998)",
 'Babe (1995)',
 'Lion King, The (1994)',
 'Beauty and the Beast (1991)',
 "There's Something About Mary (1998)",
 'Pleasantville (1998)']

## 회고

비슷한 영화를 찾아서 추천해주는 부분을 빼면 크게 어려운 부분은 없었다. 배운 것을 잘 활용하지 못해서 그런지 해결하는 데 시간이 꽤 걸렸다. 좋아하는 영화를 추가하는 부분에서 위에서처럼 movie_id로 추가하지 않고 영화 제목과 연도로 추가해 보려 했는데 하지 못했다. 일단 완성은 시켜야 하니까. 그 부분에서 거의 하루를 날린 것 같다. 구조에 대한 이해도가 부족해서인지 활용하는 데 어려움이 많다.