## 1. Dataset

In [1]:
from scipy import spatial

In [2]:
rating_df = pd.read_csv('ratings_small.csv')
rating_df.drop('timestamp', axis=1,inplace=True)
rating_df.tail()

Unnamed: 0,userId,movieId,rating
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5
100003,671,6565,3.5


## 2. check dataset

In [3]:
unique_user = rating_df['userId'].unique()
len(rating_df['userId'].unique())

671

In [4]:
unique_movie = rating_df['movieId'].unique()
len(rating_df['movieId'].unique())

9066

In [5]:
unique_rating = rating_df['rating'].unique()
len(unique_rating), sorted(unique_rating)

(10, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])

In [6]:
# rating 분포
rating_count_df = rating_df.groupby("rating").size().reset_index(name='user_rating_count')
rating_count_df

Unnamed: 0,rating,user_rating_count
0,0.5,1101
1,1.0,3326
2,1.5,1687
3,2.0,7271
4,2.5,4449
5,3.0,20064
6,3.5,10538
7,4.0,28750
8,4.5,7723
9,5.0,15095


In [7]:
# user 분포
user_count_df = rating_df.groupby('userId').size().reset_index(name='user_rating_count')
user_count_df = user_count_df.sort_values(by=['user_rating_count'], ascending=False)
user_count_df.head(3)

Unnamed: 0,userId,user_rating_count
546,547,2391
563,564,1868
623,624,1735


In [8]:
#movie 분포
movie_count_df = rating_df.groupby('movieId').size().reset_index(name='movie_rating_count')
movie_count_df = movie_count_df.sort_values(by=['movie_rating_count'], ascending=False)
movie_count_df.head(3)

Unnamed: 0,movieId,movie_rating_count
321,356,341
266,296,324
284,318,311


### 3. Preprocessing 

In [9]:
# user의 최소평가 수, movie 최소 평가수
user_limit, movie_limit = 100, 100

In [10]:
filtered_userId = list(user_count_df[user_count_df['user_rating_count'] > user_limit]['userId'])
len(filtered_userId), filtered_userId[:5]
# user data count :671 -> 258

(258, [547, 564, 624, 15, 73])

In [11]:
filtered_movieId = list(movie_count_df[movie_count_df['movie_rating_count'] > movie_limit]['movieId'])
len(filtered_movieId), filtered_movieId[:5]
# movie data count: 9066 -> 149

(149, [356, 296, 318, 593, 260])

In [12]:
filtered_df = rating_df[rating_df['userId'].isin(filtered_userId)]
filtered_df = filtered_df[filtered_df['movieId'].isin(filtered_movieId)]
print(len(filtered_df))
filtered_df.tail(3)

15567


Unnamed: 0,userId,movieId,rating
99987,671,5349,4.0
99989,671,5445,4.5
99994,671,5952,5.0


### 4.pivot

In [13]:
user_df = filtered_df.pivot_table(values = 'rating', index=['userId'], columns=["movieId"], \
                        aggfunc = np.average, fill_value = 0, dropna = False)
user_df.tail()

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
659,0.0,0.0,3.0,0.0,5.0,4.0,0.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
664,3.5,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.5,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.5,5.0
665,0.0,3.0,0.0,0.0,0.0,4.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
671,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 5. Function

#### (2) cosine_similarity

In [14]:
def cosine_similarity(vector_1, vector_2):
    
    idx = vector_1.nonzero()[0]
    if len(idx) == 0:
        return 0
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    idx = vector_2.nonzero()[0]
    if len(idx) == 0:
        return 0
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    #print(vector_1, vector_2)
    return 1 - spatial.distance.cosine(vector_1, vector_2) 

In [15]:
vector_1 = np.array([1,0,5,0,1,2])
vector_2 = np.array([1,2,3,4,0,0])
cosine_similarity(vector_1, vector_2)

0.9922778767136677

In [16]:
def similarity_matrix(df, similarity_func):
    idx = df.index
    df = df.T
    # df.loc[1]
    matrix = []
    for idx_1, value_1 in df.items():
        row = []
        for idx_2, value_2 in df.items():
            row.append(similarity_func(value_1, value_2))
        matrix.append(row)
        
    return pd.DataFrame(matrix, columns = idx, index = idx)
        
    

In [17]:
#test code - similarity
sm_df = similarity_matrix(user_df, cosine_similarity)
sm_df.head(3)

userId,4,8,15,17,19,21,22,23,26,30,...,647,648,652,654,655,656,659,664,665,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.0,0.991116,0.956762,0.948457,0.985932,0.980286,0.981591,0.982744,0.986789,0.979119,...,0.979131,0.951088,0.986368,0.991149,0.983037,0.997707,0.970241,0.994377,0.968998,0.985579
8,0.991116,1.0,0.914253,0.966828,0.972568,0.985269,0.964117,0.98201,0.984022,0.971471,...,0.974777,0.947942,0.970261,0.988689,0.979823,0.998645,0.972875,0.990196,0.974638,0.982713
15,0.956762,0.914253,1.0,0.914953,0.950125,0.950927,0.906975,0.923247,0.888292,0.920392,...,0.957841,0.856947,0.893839,0.917356,0.900642,0.873927,0.938017,0.930106,0.903008,0.892096


#### (2) mean score

In [25]:
def mean_score(df, sm_df, target, closer_count = 10):
    
    sms_df = sm_df.drop(target)  # .drop - > row
    sms_df = sms_df.sort_values(target, ascending=False)
    sms_df = sms_df[target][:closer_count]
    
    smsw_df = df.loc[sms_df.index]
    
    ms_df = pd.DataFrame(columns = df.columns)
    ms_df.loc["user"] = df.loc[target]
    ms_df.loc["mean"] = smsw_df.mean()
    
    return ms_df

## test cod - mean_score
ms_df = mean_score(user_df, sm_df, 4, 5)
ms_df

### (4) recommend

In [27]:
def recommend(ms_df):
    recommend_df = ms_df.T
    recommend_df = recommend_df[recommend_df['user'] == 0]
    recommend_df = recommend_df.sort_values("mean", ascending = False)
    return recommend_df, list(recommend_df.index)

In [28]:
# test code - recommend

recommend_df, recommend_list = recommend(ms_df)

In [31]:
recommend_list[:10]


[4226, 2858, 2959, 4973, 912, 50, 5952, 4306, 3996, 4993]

In [37]:
recommend_df.head(7)

Unnamed: 0_level_0,user,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4226,0.0,3.0
2858,0.0,2.8
2959,0.0,2.7
4973,0.0,2.7
912,0.0,2.5
50,0.0,2.4
5952,0.0,2.1


### (5) MAE

In [40]:
def mae(value, pred):
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    idx = pred.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    return np.absolute(sum(value - pred))/len(idx)
    

In [41]:
### test code - mae
mae(ms_df.loc['user'], ms_df.loc['mean'])

3.5749999999999997

In [43]:
def evaluate(df, sm_df, algorithm, closer_count=10):
    
    users = df.index
    evaluate_list = []
    
    for target in users:
        result_df = mean_score(df, sm_df, target, closer_count)
        evaluate_list.append(algorithm(result_df.loc['user'], ms_df.loc["mean"]))
    return np.average(evaluate_list)

In [44]:
# test code - evaluate
evaluate(user_df, sm_df, mae, 10)

2.607068090943497

In [47]:
def euclidean_similarity(vector_1, vector_2):
    idx = vector_1.nonzero()[0]
    if len(idx) == 0:
        return 0
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    idx = vector_2.nonzero()[0]
    if len(idx) == 0:
        return 0
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    #print(vector_1, vector_2)
    return np.linalg.norm(vector_1 - vector_2) 

In [48]:
# test code -euclidean_similarity
vector_1 = np.array([1,0,5,0,1,2])
vector_2 = np.array([1,2,3,4,0,0])
euclidean_similarity(vector_1, vector_2)

2.0

In [49]:
def find_best(user_df, similarity, closer_count):
    sm_df = similarity_matrix(user_df, similarity)
    
    return evaluate(user_df, sm_df, mae, closer_count)

In [51]:
find_best(user_df, euclidean_similarity, 5)


2.607068090943497

In [69]:
similarity_str = ['euclidean_similarity', 'cosine_similarity']
similarity_list = [euclidean_similarity, cosine_similarity]
closer_start, closer_end =20, 25

for idx, similarity in enumerate(similarity_list):
    print("similarity : ", similarity)
    for closer_count in range(closer_start, closer_end+1):
        print(closer_count, find_best(user_df, similarity, closer_count))

similarity :  <function euclidean_similarity at 0x10d454268>
20 2.607068090943497
21 2.607068090943497
22 2.607068090943497
23 2.607068090943497
24 2.607068090943497
25 2.607068090943497
similarity :  <function cosine_similarity at 0x104ff5d90>
20 2.607068090943497
21 2.607068090943497
22 2.607068090943497
23 2.607068090943497
24 2.607068090943497
25 2.607068090943497
