## 필요 패키지 import

In [57]:
import pandas as pd
import os

## 파일 load 및 전처리
- user_id : 사용자 아이디  
- move_id : 영화 아이디
- rating : 별점
- timestap : 별점 준 시간

In [58]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


- 3점 미만은 선호 하지 않는것으로 생각해 제외하기

In [59]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [60]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [61]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### 소문자로 치환하기

In [62]:
movies['title'] = movies['title'].str.lower()
movies['genre'] = movies['genre'].str.lower()

movies.head(10)

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),animation|children's|comedy
1,2,jumanji (1995),adventure|children's|fantasy
2,3,grumpier old men (1995),comedy|romance
3,4,waiting to exhale (1995),comedy|drama
4,5,father of the bride part ii (1995),comedy
5,6,heat (1995),action|crime|thriller
6,7,sabrina (1995),comedy|romance
7,8,tom and huck (1995),adventure|children's
8,9,sudden death (1995),action
9,10,goldeneye (1995),action|adventure|thriller


## 분석

- ratings에 있는 유니크한 영화 개수

In [63]:
# 유니크한 영화 개수
ratings['movie_id'].nunique()

3628

- ratings에 있는 유니크한 사용자 수

In [64]:
# 유니크한 사용자 수
ratings['user_id'].nunique()

6039

- 가장 인기있는 영화 30개

In [65]:
# 가장 인기 있는 영화 30개
movies_count = ratings.groupby('movie_id')['user_id'].count().sort_values(ascending=False)
for movie_id in movies_count.head(30):
    print(movies.loc[movies['movie_id']==movie_id]['title'])


3142    cry in the dark, a (1988)
Name: title, dtype: object
2841    ennui, l' (1998)
Name: title, dtype: object
2816    guinevere (1999)
Name: title, dtype: object
2647    ghostbusters (1984)
Name: title, dtype: object
2492    true crime (1999)
Name: title, dtype: object
2440    eight days a week (1997)
Name: title, dtype: object
2429    my favorite martian (1999)
Name: title, dtype: object
2404    soul man (1986)
Name: title, dtype: object
2391    texas chainsaw massacre 2, the (1986)
Name: title, dtype: object
2365    down in the delta (1998)
Name: title, dtype: object
2344    clue (1985)
Name: title, dtype: object
2316    home fries (1998)
Name: title, dtype: object
2302    fletch (1985)
Name: title, dtype: object
2245    beloved (1998)
Name: title, dtype: object
2228    what dreams may come (1998)
Name: title, dtype: object
2188    no small affair (1984)
Name: title, dtype: object
2183    hero (1992)
Name: title, dtype: object
2144    waltzes from vienna (1933)
Name: title, dtype:

## 검증을 위한 선호 영화 5가지 넣기
 - 옛날 영화들이라 아무거나 5개 넣자.

In [66]:
# 내가 선호하는 영화 5개
my_favorite = ['little mermaid, the (1989)' , 'cinderella (1950)' ,'sound of music, the (1965)' ,'indiana jones and the temple of doom (1984)' ,'toy story (1995)']

for i in range(5):
    temp = movies[movies['title'] == my_favorite[i]]
    print(temp)

      movie_id                       title  \
2012      2081  little mermaid, the (1989)   

                                            genre  
2012  animation|children's|comedy|musical|romance  
      movie_id              title                         genre
1009      1022  cinderella (1950)  animation|children's|musical
      movie_id                       title    genre
1022      1035  sound of music, the (1965)  musical
      movie_id                                        title             genre
2046      2115  indiana jones and the temple of doom (1984)  action|adventure
   movie_id             title                        genre
0         1  toy story (1995)  animation|children's|comedy


In [67]:
my_favorite = [2081, 1022, 1035, 2115, 1]
my_count = [4.0, 4.0, 5.0, 5.0, 4.0]
my_movielist = pd.DataFrame({'user_id': ['TH']*5, 'movie_id': my_favorite, 'counts': my_count})

if not ratings.isin({'user_id':['sehui']})['user_id'].any():  # user_id에 'sehui'이라는 데이터가 없다면
    ratings = ratings.append(my_movielist)                           # 위에 임의로 만든 my_movielist 데이터를 추가해 줍니다. 

ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3.0,956715518.0
1000205,6040,1094,5.0,956704887.0
1000206,6040,562,5.0,956704746.0
1000207,6040,1096,4.0,956715648.0
1000208,6040,1097,4.0,956715569.0
0,TH,2081,4.0,
1,TH,1022,4.0,
2,TH,1035,5.0,
3,TH,2115,5.0,
4,TH,1,4.0,


## 전처리

In [75]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [77]:
user_to_idx['TH']

6039

In [78]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

ratings

user_id column indexing OK!!
movie_id column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp
0,0,0,5.0,978300760.0
1,0,1,3.0,978302109.0
2,0,2,3.0,978301968.0
3,0,3,4.0,978300275.0
4,0,4,5.0,978824291.0
...,...,...,...,...
0,6039,191,4.0,
1,6039,37,4.0,
2,6039,14,5.0,
3,6039,188,5.0,


In [79]:
ratings.drop(['timestamp'], axis=1, inplace=True)

In [81]:
ratings

Unnamed: 0,user_id,movie_id,counts
0,0,0,5.0
1,0,1,3.0
2,0,2,3.0
3,0,3,4.0
4,0,4,5.0
...,...,...,...
0,6039,191,4.0
1,6039,37,4.0
2,6039,14,5.0
3,6039,188,5.0


## CSR matrix

In [85]:
ratings['counts']

0    5.0
1    3.0
2    3.0
3    4.0
4    5.0
    ... 
0    4.0
1    4.0
2    5.0
3    5.0
4    4.0
Name: counts, Length: 836483, dtype: float64

In [83]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings['counts'], (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.float64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

In [86]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 설정. 학습내용과 무관.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [None]:
# Implicit AlternatingLeastSquares 모델의 선언

als_model = AlternatingLeastSquares(factors=640, regularization=0.01, use_gpu=False, iterations=30, dtype=np.float64)

# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]