### 협업 필터링(Collaborative Filtering)

In [1]:
import pandas as pd
import numpy as np

u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
users = pd.read_csv("u.user", sep = "|", names = u_cols, encoding = 'latin-1')

users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [2]:
i_cols = ['movie_id', 'title', 'release date', 'video release date', 
            'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation',
            'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',
            'Fantasy', 'Film-noir', 'Horror', 'Musical', 'Mistery', 'Romance',
            'Sci-Fi', 'Thriller', 'war', 'westurn']
movies = pd.read_csv("u.item", sep = "|", names = i_cols, encoding = 'latin-1')

movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-noir,Horror,Musical,Mistery,Romance,Sci-Fi,Thriller,war,westurn
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv("u.data", sep = "\t", names = r_cols, encoding = 'latin-1')

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)

# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

In [5]:
from sklearn.model_selection import train_test_split

x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, stratify = y)


In [6]:
# 정확도(RMSE)
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

In [7]:
rating_matrix = x_train.pivot(index = 'user_id', columns = 'movie_id', values = 'rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1667,1668,1669,1671,1673,1674,1675,1676,1680,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,,1.0,,3.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


- 협업 필터링

1) 대상과 취향이 비슷한 사용자를 찾음. -> 각 사용자의 유사성(similarity)을 계산해 비슷한 사용자 그룹 neighbor로 분류  
2) neighbor 사용자가 가장 좋게 평가한 영화를 찾음.  
3) 대상이 아직 보지 않은 아이템을 neighbor 기준으로 평점, 평균을 내어 예측  
4) 해당 아이템을 대상에게 추천  
<br>

#### - 기본 CF 알고리즘

In [8]:
# trian dataset의 모든 가능한 사용자 pair의 Cosine similarities 계산

from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = rating_matrix.copy().fillna(0) # rating값을 matrix_dummy에 복사, NaN은 error를 발생시켜 0으로 변경
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy) # 모든 사용자간의 유사도 계산; 943X943
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index, columns = rating_matrix.index) # user_similarity에 index 지정
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.090496,0.042861,0.034665,0.258852,0.292140,0.336797,0.225246,0.025756,0.227712,...,0.296071,0.120853,0.232871,0.120753,0.157563,0.081103,0.212482,0.077521,0.141614,0.261746
2,0.090496,1.000000,0.092486,0.079108,0.000000,0.165545,0.078515,0.079010,0.052193,0.082401,...,0.115150,0.227151,0.246743,0.289394,0.216860,0.136960,0.194441,0.053799,0.151783,0.104630
3,0.042861,0.092486,1.000000,0.233548,0.000000,0.076301,0.052300,0.041163,0.024171,0.055120,...,0.043963,0.058261,0.150726,0.050590,0.081245,0.020682,0.155785,0.087201,0.156142,0.035703
4,0.034665,0.079108,0.233548,1.000000,0.041247,0.065404,0.063203,0.102190,0.059439,0.079243,...,0.068471,0.000000,0.135141,0.183337,0.064932,0.040689,0.176061,0.073521,0.185257,0.035119
5,0.258852,0.000000,0.000000,0.041247,1.000000,0.189023,0.298576,0.173510,0.039100,0.166260,...,0.268036,0.000000,0.054547,0.084077,0.038552,0.012848,0.212759,0.078994,0.135824,0.213200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.081103,0.136960,0.020682,0.040689,0.012848,0.092814,0.123364,0.065311,0.055927,0.083195,...,0.045211,0.359488,0.241969,0.201092,0.384751,1.000000,0.098169,0.131438,0.042908,0.124625
940,0.212482,0.194441,0.155785,0.176061,0.212759,0.250926,0.256217,0.238411,0.100385,0.240244,...,0.232582,0.072015,0.179645,0.187992,0.170721,0.098169,1.000000,0.105322,0.230169,0.137689
941,0.077521,0.053799,0.087201,0.073521,0.078994,0.054688,0.046648,0.096159,0.084214,0.056728,...,0.027231,0.192164,0.227534,0.139525,0.273634,0.131438,0.105322,1.000000,0.100504,0.102359
942,0.141614,0.151783,0.156142,0.185257,0.135824,0.209803,0.207960,0.121996,0.048751,0.166153,...,0.156234,0.050362,0.106845,0.112012,0.069254,0.042908,0.230169,0.100504,1.000000,0.139615


In [9]:
## 주어진 영화의 가중평균 rating 계산
# 가중치는 주어진 사용자와 다른 사용자간의 유사도(user_similarity)

def CF_simple(user_id, movie_id):
    if movie_id in rating_matrix: #주어진 영화가 matrix에 있는가 확인
        
        sim_scores = user_similarity[user_id].copy() # 주어진 사용자와 다른 사용자의 유사도
        movie_ratings = rating_matrix[movie_id].copy() # 주어진 사용자에 대한 모든 사용자의 평점
        
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index # 주어진 영화를 평가하지 않은 사용자의 위치 확인
                                                                      # (가중평균 계산에서 제외)
            
        movie_ratings = movie_ratings.dropna() # 주어진 영화에 대한 평점 중 평가하지 않은 사람의 평점(NaN) 제거
        sim_scores = sim_scores.drop(none_rating_idx) # 주어진 영화를 평가하지 않은 사용자의 유사도 제거(가중평균 계산 필요x)
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum() 
        # 주어진 영화에 대해 평가한 각 사용자에 대해 평점을 유사도로 가중평균한 예측치
    else:
        mean_rating = 3.0 #주어진 영화가 matrix에 없는 경우: 3.0
    return mean_rating

score(CF_simple)

1.0214684154196552

-----------------

#### - 이웃을 고려한 CF

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index, columns = rating_matrix.index)

# 정확도(RMSE)
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)


# ----------------------------------------------------------------------------------------------------
## Neighbor size를 정해 예측치를 계산
def cf_knn(user_id, movie_id, neighbor_size = 0):
    if movie_id in rating_matrix: #주어진 영화가 matrix에 있는가 확인
        
        sim_scores = user_similarity[user_id].copy() 
        movie_ratings = rating_matrix[movie_id].copy() 
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index        
        
        movie_ratings = movie_ratings.drop(none_rating_idx) 
        sim_scores = sim_scores.drop(none_rating_idx)
        
        # Neigjbor size가 지정되지 않은 경우
        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        
        # Neigjbor size가 지정된 경우
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0 
    return mean_rating


score(cf_knn, neighbor_size = 30)

1.0137715168365204

<br>
<br>

- 주어진 사용자에 대해 추천받기  



In [11]:
# 전체 데이터로 full matrix와 cosine similarity 구하기

rating_matrix = ratings.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id')

from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index, 
                              columns = rating_matrix.index)

def recom_movie(user_id, n_items, neighbor_size = 30): # 주어진 사용자에 대한 아이템 반환
    user_movie = rating_matrix.loc[user_id].copy() # 전체 데이터에서 현재 사용자의 평점 데이터만 복사
    
    for movie in rating_matrix:
        if pd.notnull(user_movie.loc[movie]): # 이미 평가한 경우
            user_movie.loc[movie] = 0 # 추천리스트에서 제외
        else:
            user_movie.loc[movie] = cf_knn(user_id, movie, neighbor_size)
            
    movie_sort = user_movie.sort_values(ascending = False)[:n_items]
    # 예상평점을 기준으로 내림차순 정렬 후, 상위 영화를 추천 아이템 수만큼 뽑아 저장
    recom_movies = movies.loc[movie_sort.index] # 골라진 영화의 정보 추출
    recommendations = recom_movies['title'] # 제목 반환
    return recommendations

recom_movie(user_id = 2, n_items = 5, neighbor_size = 30)

movie_id
1293                     Ayn Rand: A Sense of Life (1997)
1189                              That Old Feeling (1997)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
1467                                     Cure, The (1995)
318                       Everyone Says I Love You (1996)
Name: title, dtype: object

-----------------

#### - 최적의 이웃 크기 결정

In [12]:
# 최적의 neighbor size 구하기
## train set으로 full matrix와 cosine similarity 구하기

rating_matrix = x_train.pivot_table(values = 'rating', index = 'user_id',
                                   columns = 'movie_id')
from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy) 
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index,
                              columns = rating_matrix.index)
for neighbor_size in [10,20,30,40,50,60]:
    print("Neighbor size = %d : RMSE = %.4f" %(neighbor_size, score(cf_knn, neighbor_size)))


Neighbor size = 10 : RMSE = 1.0287
Neighbor size = 20 : RMSE = 1.0156
Neighbor size = 30 : RMSE = 1.0138
Neighbor size = 40 : RMSE = 1.0133
Neighbor size = 50 : RMSE = 1.0134
Neighbor size = 60 : RMSE = 1.0142


-----------------

#### - 사용자의 평가경향을 고려한 CF

In [13]:
# user bias 고려하기

rating_mean = rating_matrix.mean(axis = 1) # full matrix에서 각 사용자의 평점 평균 구하기
rating_bias = (rating_matrix.T - rating_mean).T # 영화 평점과 각 사용자의 평균과의 차이(평점편차) 구하기

# ----------------------------------------------------------------------------------------------------
## 모든 user의 rating 평균과 영화의 평점편차 계산

def cf_knn_bias(user_id, movie_id, neighbor_size = 0):
    if movie_id in rating_bias: #주어진 영화가 rating_bias에 있는가 확인
        
        sim_scores = user_similarity[user_id].copy() 
        movie_ratings = rating_bias[movie_id].copy() # 해당 영화의 평점(rating_matrix) 대신 평점편차(rating_bias)를 복사해 가져오기
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index        
        
        movie_ratings = movie_ratings.drop(none_rating_idx) 
        sim_scores = sim_scores.drop(none_rating_idx)
        
        # Neigjbor size가 지정되지 않은 경우
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]
        
        # Neigjbor size가 지정된 경우
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction + rating_mean[user_id] # 예측치를 평점편차로 구했기 때문에 최종 예측값을 위해서 해당 사용자의 평균을 더하기
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    return prediction


score(cf_knn_bias, 30)

0.9424594051042088

-----------

#### - CF 정확도 개선 방안

<br>  

- 신뢰도 가중(significance Weighting)  
<br>
  

In [14]:
## 사용자별 공통 평가 수 계산

rating_binary1 = np.array((rating_matrix > 0).astype(float))
rating_binary2 = rating_binary1.T

counts = np.dot(rating_binary1, rating_binary2)
counts = pd.DataFrame(counts, index = rating_matrix.index, columns = rating_matrix.index).fillna(0)

# ----------------------------------------------------------------------------------------------------
def cf_knn_bias_sig(user_id, movie_id, neighbor_size = 0):
    if movie_id in rating_bias: #주어진 영화가 rating_bias에 있는가 확인
        
        sim_scores = user_similarity[user_id] # 현 user와 다른 사용자 간 유사도
        movie_ratings = rating_bias[movie_id] # 현 movie의 평점 편차 가져오기
        
        no_rating = movie_ratings.isnull() # 현 movie에 대한 rating없는 사용자 표시
        common_counts = counts[user_id] # 현 사용자와 다른 사용자간 공통 평가 아이템 수
        low_significance = common_counts < SIG_LEVEL # 공통으로 평가한 영화 수가 SIG_LEVEL보다 낮은 사용자 표시
        
        # 평가를 안했거나 SIG_LEVEL이 기준 이하인 user 제거
        none_rating_idx = movie_ratings[no_rating | low_significance].index 
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        
        # Neigjbor size가 지정되지 않은 경우
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]
        
        # Neigjbor size가 지정된 경우
        else:
            if len(sim_scores) > MIN_RATINGS:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction + rating_mean[user_id] # 예측치를 평점편차로 구했기 때문에 최종 예측값을 위해서 해당 사용자의 평균을 더하기
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    return prediction


SIG_LEVEL = 3
MIN_RATINGS = 2
score(cf_knn_bias_sig, 30)

0.9424450344829609

--------------

#### - 사용자 기반 CF와 아이템 기반 CF

In [15]:
# trian dataset의 모든 item pair의 Cosine similarities 계산

from sklearn.metrics.pairwise import cosine_similarity

rating_matrix_t = np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy().fillna(0) # rating값을 matrix_dummy에 복사, NaN은 error를 발생시키므로 0으로 변경
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy) # 모든 사용자간의 유사도 계산; 943X943
item_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index, columns = rating_matrix.index) # user_similarity에 index 지정
item_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.090496,0.042861,0.034665,0.258852,0.292140,0.336797,0.225246,0.025756,0.227712,...,0.296071,0.120853,0.232871,0.120753,0.157563,0.081103,0.212482,0.077521,0.141614,0.261746
2,0.090496,1.000000,0.092486,0.079108,0.000000,0.165545,0.078515,0.079010,0.052193,0.082401,...,0.115150,0.227151,0.246743,0.289394,0.216860,0.136960,0.194441,0.053799,0.151783,0.104630
3,0.042861,0.092486,1.000000,0.233548,0.000000,0.076301,0.052300,0.041163,0.024171,0.055120,...,0.043963,0.058261,0.150726,0.050590,0.081245,0.020682,0.155785,0.087201,0.156142,0.035703
4,0.034665,0.079108,0.233548,1.000000,0.041247,0.065404,0.063203,0.102190,0.059439,0.079243,...,0.068471,0.000000,0.135141,0.183337,0.064932,0.040689,0.176061,0.073521,0.185257,0.035119
5,0.258852,0.000000,0.000000,0.041247,1.000000,0.189023,0.298576,0.173510,0.039100,0.166260,...,0.268036,0.000000,0.054547,0.084077,0.038552,0.012848,0.212759,0.078994,0.135824,0.213200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.081103,0.136960,0.020682,0.040689,0.012848,0.092814,0.123364,0.065311,0.055927,0.083195,...,0.045211,0.359488,0.241969,0.201092,0.384751,1.000000,0.098169,0.131438,0.042908,0.124625
940,0.212482,0.194441,0.155785,0.176061,0.212759,0.250926,0.256217,0.238411,0.100385,0.240244,...,0.232582,0.072015,0.179645,0.187992,0.170721,0.098169,1.000000,0.105322,0.230169,0.137689
941,0.077521,0.053799,0.087201,0.073521,0.078994,0.054688,0.046648,0.096159,0.084214,0.056728,...,0.027231,0.192164,0.227534,0.139525,0.273634,0.131438,0.105322,1.000000,0.100504,0.102359
942,0.141614,0.151783,0.156142,0.185257,0.135824,0.209803,0.207960,0.121996,0.048751,0.166153,...,0.156234,0.050362,0.106845,0.112012,0.069254,0.042908,0.230169,0.100504,1.000000,0.139615


In [16]:
import numpy as np
import pandas as pd

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

# train, test 데이터 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# 정확도(RMSE)
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

# train 데이터로 Full matrix 구하기  
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')


# train set의 모든 가능한 아이템 pair의 Cosine similarities 계산

from sklearn.metrics.pairwise import cosine_similarity
rating_matrix_t = np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity, index=rating_matrix_t.index, columns=rating_matrix_t.index)

# 주어진 영화의 (movie_id) 가중평균 rating을 계산하는 함수, 
# 가중치는 주어진 아이템과 다른 아이템 간의 유사도(item_similarity)
def CF_IBCF(user_id, movie_id):
    if movie_id in item_similarity:      # 현재 영화가 train set에 있는지 확인

        sim_scores = item_similarity[movie_id]  # 현재 영화와 다른 영화의 similarity 값 가져오기
        user_rating = rating_matrix_t[user_id] # 현 사용자의 모든 rating 값 가져오기
        
        non_rating_idx = user_rating[user_rating.isnull()].index # 사용자가 평가하지 않은 영화 index 가져오기
        user_rating = user_rating.dropna() # 사용자가 평가하지 않은 영화 제거
        sim_scores = sim_scores.drop(non_rating_idx) # 사용자가 평가하지 않은 영화의 similarity 값 제거
        
        mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
        # 현 영화에 대한 예상 rating 계산, 가중치는 현 영화와 사용자가 평가한 영화의 유사도
    else:
        mean_rating = 3.0
    return mean_rating

# 정확도
score(CF_IBCF)

1.011683943745874

--------------