# 1) 데이터 준비와 전처리

In [1]:
import os
import pandas as pd

# Path of the MovieLens-1M ratings file, relative to the HOME directory.
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

# Fields are separated by '::'; column names are supplied explicitly via
# `names`. The multi-character separator is parsed with the python engine.
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',
    encoding="ISO-8859-1",
)

# Row count before any filtering.
orginal_data_size = ratings.shape[0]
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
# Keep only the rows whose rating is 3 or higher.
ratings = ratings.loc[ratings['rating'] >= 3]
filtered_data_size = ratings.shape[0]

# Report how much of the data survived the filter.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# Rename the rating column to count.
# Rebinding the renamed frame avoids mutating a filtered frame in place.
ratings = ratings.rename(columns={'rating': 'count'})
ratings['count']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: count, Length: 836478, dtype: int64

In [4]:
# Load the movie metadata so that titles can be shown alongside ratings.
movie_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre']

# Same '::'-separated layout as the ratings file, with explicit column names.
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


데이터 결합 및 삭제

In [5]:
# Join each rating with its movie metadata on movie_id, then drop the
# timestamp and genre columns.
movie_ratings = ratings.merge(movies, on='movie_id')
movie_ratings = movie_ratings.drop(columns=['timestamp', 'genre'])
movie_ratings

Unnamed: 0,user_id,movie_id,count,title
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975)
...,...,...,...,...
836473,5851,3607,5,One Little Indian (1973)
836474,5854,3026,4,Slaughterhouse (1987)
836475,5854,690,3,"Promise, The (Versprechen, Das) (1994)"
836476,5938,2909,4,"Five Wives, Three Secretaries and Me (1998)"


# 2) 분석해보기
+ movie_ratings에 있는 유니크한 영화 개수
+ movie_ratings에 있는 유니크한 사용자 수
+ 가장 인기 있는 영화 30개(인기순)

In [6]:
# Number of unique movies (counted by distinct title).
movie_ratings['title'].nunique()

3628

In [7]:
# Number of unique users (distinct user_id values).
movie_ratings['user_id'].nunique()

6039

In [8]:
# The 30 most popular movies, ordered by popularity.
# Popularity is the number of rating rows recorded for each title.
popular_movie = movie_ratings.groupby('title')['movie_id'].count()
popular_movie.sort_values(ascending=False).iloc[:30]

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

# 3) 내가 선호하는 영화를 다섯가지 골라서 추가

In [9]:
# Choose the favourite titles, then look up their movie_id values.
my_favorite_title = [
    'Men in Black (1997)',
    'Toy Story (1995)',
    'Jumanji (1995)',
    'Jurassic Park (1993)',
    'Fargo (1996)',
]

# Boolean mask selecting the rows whose title is one of the favourites.
mft = movie_ratings['title'].isin(my_favorite_title)
# One row per title is enough to read off its movie_id.
movie_ratings[mft].drop_duplicates(subset=['title'])

Unnamed: 0,user_id,movie_id,count,title
38620,1,1,5,Toy Story (1995)
53172,1,608,4,Fargo (1996)
101881,2,480,5,Jurassic Park (1993)
166207,3,1580,3,Men in Black (1997)
362365,10,2,5,Jumanji (1995)


In [10]:
# movie_id of each entry in my_favorite_title, in the same order as that list.
# Integers are used so the new rows match the integer movie_id values already
# in movie_ratings instead of mixing strings into that column.
my_favorite_id = [1580, 1, 2, 480, 608]

# Assume a user with user_id 'Uikyeong' gave each of these movies a count of 5.
my_movielist = pd.DataFrame({
    'user_id': ['Uikyeong'] * len(my_favorite_id),
    'movie_id': my_favorite_id,
    'count': [5] * len(my_favorite_id),
    'title': my_favorite_title,
})

# Add the rows only if 'Uikyeong' is not present yet, so re-running the cell
# does not duplicate them.
if not movie_ratings['user_id'].isin(['Uikyeong']).any():
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    movie_ratings = pd.concat([movie_ratings, my_movielist], ignore_index=True)

movie_ratings.tail(10)

Unnamed: 0,user_id,movie_id,count,title
836473,5851,3607,5,One Little Indian (1973)
836474,5854,3026,4,Slaughterhouse (1987)
836475,5854,690,3,"Promise, The (Versprechen, Das) (1994)"
836476,5938,2909,4,"Five Wives, Three Secretaries and Me (1998)"
836477,5948,1360,5,Identification of a Woman (Identificazione di ...
836478,Uikyeong,1580,5,Men in Black (1997)
836479,Uikyeong,1,5,Toy Story (1995)
836480,Uikyeong,2,5,Jumanji (1995)
836481,Uikyeong,480,5,Jurassic Park (1993)
836482,Uikyeong,608,5,Fargo (1996)


# 4) CSR matrix를 직접 만들어보기

In [11]:
# Collect the distinct users and the distinct movie titles.
user_unique = movie_ratings['user_id'].unique()
movie_unique = movie_ratings['title'].unique()

# Assign every user and every title a consecutive integer index from 0.
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
movie_to_idx = dict(zip(movie_unique, range(len(movie_unique))))

In [12]:
# Adding 'Uikyeong' to the 6039 existing users brings the total to 6040,
# so the new user gets the last index.
print(user_to_idx['Uikyeong']) 
print(movie_to_idx['Jumanji (1995)']) 

6039
513


In [13]:
# Replace the raw user_id / title values with their integer indices.
# For the dict get method, see https://wikidocs.net/16.

# Map every user_id through user_to_idx.get. A value absent from the mapping
# yields a missing entry, which dropna() removes, so a result shorter than
# movie_ratings means that some rows could not be indexed.
temp_user_data = movie_ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) != len(movie_ratings):
    print('user_id column indexing Fail!!')
else:
    # Every row was indexed, so it is safe to overwrite the column.
    print('user_id column indexing OK!!')
    movie_ratings['user_id'] = temp_user_data

# Same procedure for the title column.
temp_movie_data = movie_ratings['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) != len(movie_ratings):
    print('title column indexing Fail!!')
else:
    print('movie column indexing OK!!')
    movie_ratings['title'] = temp_movie_data

movie_ratings

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,count,title
0,0,1193,5,0
1,1,1193,5,0
2,2,1193,4,0
3,3,1193,4,0
4,4,1193,5,0
...,...,...,...,...
836478,6039,1580,5,175
836479,6039,1,5,40
836480,6039,2,5,513
836481,6039,480,5,107


In [14]:
from scipy.sparse import csr_matrix

# Matrix dimensions: one row per distinct user_id value, one column per
# distinct title value.
num_user = movie_ratings['user_id'].nunique()
num_movie = movie_ratings['title'].nunique()

# Build the matrix from (value, (row, column)) triples: count is the value,
# user_id the row and title the column.
csr_data = csr_matrix(
    (movie_ratings['count'], (movie_ratings['user_id'], movie_ratings['title'])),
    shape=(num_user, num_movie),
)
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

# 5) AlternatingLeastSquares모델 구성하여 훈련

In [15]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Settings recommended by the implicit library; unrelated to the lesson content.
# NOTE(review): these are set after numpy/pandas were already imported, so they
# may not affect BLAS threading for this session - consider moving them to the
# first cell; confirm.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [16]:
# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

In [17]:
# The ALS model takes an (item x user) matrix as input, so transpose the
# (user x item) matrix first.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [18]:
# Train the model on the transposed (item x user) matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

# 6)  훈련된 모델이 예측한 나의 영화선호도 파악

In [19]:
# Look up the indices of user 'Uikyeong' and of the movie Toy Story ...
Uikyeong = user_to_idx['Uikyeong']
Toy = movie_to_idx['Toy Story (1995)']

# ... and fetch the corresponding rows of the trained factor matrices.
Uikyeong_vector = als_model.user_factors[Uikyeong]
Toy_vector = als_model.item_factors[Toy]

In [20]:
Uikyeong_vector  # Uikyeong's vector (row of user_factors)

array([ 0.2129324 , -0.9449933 ,  0.139504  , -0.26335484,  0.10063476,
        0.3295815 ,  0.36810467, -0.43090755, -0.26291597, -0.12169398,
       -0.4769159 , -0.52907413,  0.53738004, -0.7082889 ,  0.12659998,
       -0.76759946, -1.1045858 , -0.05516865,  0.6255158 ,  0.43811247,
        0.6750725 , -0.02206143, -0.5914389 , -0.41442832,  0.08746058,
        0.9757322 ,  0.6504608 , -0.4702472 , -0.16817969,  0.5728641 ,
       -0.0191352 ,  0.9210207 , -0.0695239 , -0.00258685,  0.28629047,
       -0.40200442,  0.04001674,  0.36028278, -0.12614095,  0.16412953,
        0.55532426, -0.43557167,  0.32468882, -0.2235744 ,  0.6018467 ,
        0.10512763, -0.7490809 ,  0.60921276,  0.393733  ,  1.0650882 ,
       -0.7234057 ,  1.5207807 ,  0.12221246,  0.9396158 ,  0.20771047,
        0.5135881 ,  0.01149686,  0.04660965, -0.10656265, -0.2551693 ,
       -0.22379126, -0.05089863, -0.673612  ,  0.3221029 ,  1.0069587 ,
        0.25478074,  0.5201407 , -0.49541223, -0.71047455, -1.05

In [21]:
Toy_vector  # Toy Story's vector (row of item_factors), not Jumanji's

array([ 7.4998164e-03,  1.0720575e-02,  5.6822654e-03, -9.3449606e-03,
        3.3057384e-02, -1.1743978e-02,  5.1335134e-03, -6.6163251e-03,
        2.7521512e-02, -5.0634039e-03, -8.4656738e-03,  4.3155602e-03,
        3.0297976e-02,  7.3721656e-04,  1.1395493e-02,  6.1713196e-03,
       -2.3622118e-02, -2.1500172e-02,  3.8308967e-03, -9.5130286e-05,
        2.5010515e-02,  5.2053230e-03, -2.9737471e-02, -1.8009640e-02,
        1.5578728e-02,  1.3696864e-02,  2.3523770e-02, -2.6457363e-03,
       -4.9307668e-03,  1.3504117e-02,  1.7750237e-02,  4.1346319e-02,
       -8.0028223e-03,  6.3034790e-03, -3.6451470e-02,  1.4033308e-02,
       -8.4028328e-03,  1.4969508e-02,  1.1439127e-02,  1.6389888e-02,
        1.4670242e-02,  6.3030100e-03,  2.5662631e-02, -1.4610202e-02,
        4.5640790e-03, -1.9787939e-02, -8.2760816e-03,  3.8474962e-02,
       -2.6078651e-02,  3.0509297e-02, -3.6622051e-02,  2.1732459e-02,
        3.0863514e-02,  3.1725515e-02,  2.9788053e-02,  4.9015988e-02,
      

In [22]:
# Dot product of the Uikyeong vector and the Toy Story vector.
Uikyeong_vector.dot(Toy_vector)

0.4656087

In [23]:
# Same dot product, this time against a movie that is not among the
# favourites added for 'Uikyeong'.
Slaughterhouse = movie_to_idx['Slaughterhouse (1987)']
Slaughterhouse_vector = als_model.item_factors[Slaughterhouse]
Uikyeong_vector.dot(Slaughterhouse_vector)

0.0044746003

선호 리스트에 없는 영화인 Slaughterhouse와의 내적을 구해 선호도를 확인해 보니
0에 가까운 값이 나오는 것을 볼 수 있다.

# 7) 내가 좋아하는 영화와 비슷한 영화 추천

In [24]:
# Ask the model for the 15 items most similar to one favourite movie.
favorite_movie = 'Jumanji (1995)'
# Despite its name, movie_id holds the index from movie_to_idx here.
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(513, 0.9999998),
 (596, 0.83228236),
 (1130, 0.8202917),
 (173, 0.7407964),
 (1985, 0.6719596),
 (828, 0.6648201),
 (1736, 0.65127045),
 (1982, 0.6501774),
 (545, 0.6256638),
 (1733, 0.6192034),
 (561, 0.5898166),
 (458, 0.57193726),
 (2017, 0.5690134),
 (2009, 0.5526923),
 (576, 0.55183053)]

In [25]:
# Reverse lookup table: integer index -> movie title.
idx_to_movie = {index: title for title, index in movie_to_idx.items()}
# Turn the (index, score) pairs into titles, dropping the scores.
[idx_to_movie[pair[0]] for pair in similar_movie]

['Jumanji (1995)',
 'Hook (1991)',
 'Indian in the Cupboard, The (1995)',
 'Dragonheart (1996)',
 'NeverEnding Story II: The Next Chapter, The (1990)',
 'Flubber (1997)',
 'Small Soldiers (1998)',
 'Space Jam (1996)',
 'Santa Clause, The (1994)',
 'Borrowers, The (1997)',
 'NeverEnding Story, The (1984)',
 'Mask, The (1994)',
 'Pagemaster, The (1994)',
 "Kid in King Arthur's Court, A (1995)",
 'Escape to Witch Mountain (1975)']

In [26]:
# Find similar movies.
def get_similar_movie(movie_title: str, n: int = 10) -> list:
    """Return the titles of the movies most similar to *movie_title*.

    Args:
        movie_title: A title that is a key of ``movie_to_idx``.
        n: How many titles to request from ``als_model.similar_items``.
            The default of 10 matches the library default that the original
            call relied on.

    Returns:
        A list of titles in the order returned by
        ``als_model.similar_items``. In the sample outputs of this notebook
        the queried movie itself comes first.

    Raises:
        KeyError: If *movie_title* is not a key of ``movie_to_idx``.
    """
    movie_id = movie_to_idx[movie_title]
    similar_movie = als_model.similar_items(movie_id, N=n)
    # Each result is an (index, score) pair; keep only the title.
    return [idx_to_movie[item[0]] for item in similar_movie]

In [27]:
# Movies most similar to Toy Story.
get_similar_movie('Toy Story (1995)')

['Toy Story (1995)',
 'Toy Story 2 (1999)',
 "Bug's Life, A (1998)",
 'Aladdin (1992)',
 'Babe (1995)',
 'Groundhog Day (1993)',
 'Beauty and the Beast (1991)',
 'Lion King, The (1994)',
 'Pleasantville (1998)',
 "There's Something About Mary (1998)"]

In [28]:
# Movies most similar to Men in Black.
get_similar_movie('Men in Black (1997)')

['Men in Black (1997)',
 'Jurassic Park (1993)',
 'Terminator 2: Judgment Day (1991)',
 'Total Recall (1990)',
 'Independence Day (ID4) (1996)',
 'Fifth Element, The (1997)',
 'Matrix, The (1999)',
 'Face/Off (1997)',
 'Lost World: Jurassic Park, The (1997)',
 'Schlafes Bruder (Brother of Sleep) (1995)']

# 8) 내가 가장 좋아할 만한 영화 추천

In [29]:
# Index of the user 'Uikyeong'.
user = user_to_idx['Uikyeong']

# Request 20 recommendations from the (user x item) matrix, with
# filter_already_liked_items enabled.
movie_recommended = als_model.recommend(
    user,
    csr_data,
    N=20,
    filter_already_liked_items=True,
)
movie_recommended

[(92, 0.44610578),
 (50, 0.3988515),
 (322, 0.38010246),
 (124, 0.35039103),
 (4, 0.30947676),
 (121, 0.30873263),
 (62, 0.29841632),
 (60, 0.27050322),
 (99, 0.2613218),
 (150, 0.24235979),
 (276, 0.24048169),
 (80, 0.24020445),
 (58, 0.23295853),
 (284, 0.23100579),
 (157, 0.2293838),
 (82, 0.22461137),
 (458, 0.22263977),
 (222, 0.22236514),
 (38, 0.21682799),
 (110, 0.21524069)]

추천받은 인덱스를 영화제목으로 변환

In [30]:
# Swap each recommended index for its movie title, keeping the score.
[(idx_to_movie[rec[0]], rec[1]) for rec in movie_recommended]

[('Terminator 2: Judgment Day (1991)', 0.44610578),
 ('Toy Story 2 (1999)', 0.3988515),
 ('Babe (1995)', 0.38010246),
 ('Matrix, The (1999)', 0.35039103),
 ("Bug's Life, A (1998)", 0.30947676),
 ('Silence of the Lambs, The (1991)', 0.30873263),
 ('Total Recall (1990)', 0.29841632),
 ('Star Wars: Episode I - The Phantom Menace (1999)', 0.27050322),
 ('American Beauty (1999)', 0.2613218),
 ('Independence Day (ID4) (1996)', 0.24235979),
 ('North by Northwest (1959)', 0.24048169),
 ('Stand by Me (1986)', 0.24020445),
 ('Mission: Impossible (1996)', 0.23295853),
 ('Nightmare Before Christmas, The (1993)', 0.23100579),
 ('Shawshank Redemption, The (1994)', 0.2293838),
 ('Lost World: Jurassic Park, The (1997)', 0.22461137),
 ('Mask, The (1994)', 0.22263977),
 ('Pulp Fiction (1994)', 0.22236514),
 ('Sixth Sense, The (1999)', 0.21682799),
 ('Groundhog Day (1993)', 0.21524069)]

내가 기록을 남긴 데이터 중 첫번째 추천영화에 기여한 정도를 확인

In [31]:
# Index of the top recommendation, then ask the model to explain why it was
# recommended to this user.
Firstrecommend = movie_to_idx['Terminator 2: Judgment Day (1991)']
explain = als_model.explain(user, csr_data, itemid=Firstrecommend)

In [32]:
# explain[1] holds (index, contribution) pairs; show them with movie titles.
[(idx_to_movie[entry[0]], entry[1]) for entry in explain[1]]

[('Jurassic Park (1993)', 0.21973645130138955),
 ('Men in Black (1997)', 0.164503749060072),
 ('Toy Story (1995)', 0.0358996864764883),
 ('Fargo (1996)', 0.026563288565943057),
 ('Jumanji (1995)', -0.008282107922197014)]

---
1. 데이터는 평점과 제목을 같이 볼 수 있게 merge 함수를 써서 결합하였고,
   좋아하는 영화를 5개 고르고 movie_id까지 확인을 거쳐서 알맞게 추가하였다.
   추가한 것까지 user_id와 title을 바탕으로 CSR matrix가 잘 생성되었다.

2. 토이스토리와 나의 아이디의 벡터 내적값을 확인해 보니 약 0.47이 나왔고
   전혀 관련이 없는 영화와 나의 아이디의 벡터 내적값을 보니 0에 가깝게 나왔다.
   비교를 해 보니 생각보다 내적 수치가 유의미하게 형성되었다.

3. 비슷한 영화를 찾아보니 토이스토리는 토이스토리2, 라이온 킹 등이 나오는 것을 보니 유사도가 잘 나오는 것을 볼 수 있었다. 맨 인 블랙, 쥬만지도 확인해 본 결과 모든 영화가 비슷해 보이진 않아도 어느 정도 비슷한 영화처럼 보이는 것을 확인하였다.

   추천받은 영화들을 나열해 보니 비슷한 영화라고 나왔던 영화들이 주로 속해 있었다.
   Uikyeong 벡터와의 내적 수치가 좋아하는 영화와 많이 차이가 나지 않는 영화들인 것 같았다.
   추천받은 영화 중 1등인 터미네이터2에 어떤 영화가 기여하였는지 목록을 보니
   정확하게 내가 좋아하는 영화들로만 구성되어 있는 것을 보니 유의미하게 측정이 되는 것을 느꼈다.