In [1]:
import numpy as np
import pandas as pd
import os

# Walk every directory under /kaggle/input and join each file name to its
# directory. The joined path is neither printed nor stored, so this loop
# produces no output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        os.path.join(dirname, filename)

In [2]:
# Load the ratings CSV of the MovieLens 20M dataset from the Kaggle input directory.

df = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')
# Last expression of the cell: previews the first rows of the table.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [3]:
import sklearn
from typing import List, Set, Dict, Tuple
from numpy.linalg import norm
from numpy import dot
from sklearn.model_selection import train_test_split

# Split the rating rows into a train set (70%) and a test set (30%).
# random_state is fixed so the same split is produced on every run.

train, test = train_test_split(df, test_size = 0.3, random_state = 25)

# calculate weights using train set
# userId 1, userId 2 => cosine similarity

def cosine_similarity(df:pd.DataFrame, id_1:int, id_2:int):
    """Return the cosine similarity between the rating vectors of two users.

    Each user's vector has one entry per movie rated by either user; a movie
    the user did not rate counts as 0. The result is rounded to 3 decimals.

    Args:
        df: Ratings table with 'userId', 'movieId' and 'rating' columns.
        id_1: userId of the first user.
        id_2: userId of the second user.

    Returns:
        The rounded cosine similarity, or 0.0 when either user has no
        non-zero rating in ``df`` (the similarity is undefined in that case).
    """
    target = df[df['userId'].isin([id_1, id_2])]
    if target.empty:
        # Neither user appears in df: there is nothing to compare.
        return 0.0

    pivot = pd.pivot_table(target, values = 'rating', index = 'userId', columns = 'movieId')
    # Select the two rows explicitly by id instead of by position, so a user
    # missing from df (or id_1 == id_2) yields a row instead of an IndexError.
    pivot = pivot.reindex([id_1, id_2]).fillna(0)

    a = pivot.iloc[0].to_numpy(dtype = float)
    b = pivot.iloc[1].to_numpy(dtype = float)

    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction; avoid the 0/0 division that gives NaN.
        return 0.0
    return round(dot(a, b) / denominator, 3)

# make a prediction


In [4]:
# Compute the similarity among users 2, 29, 32 and 47, then predict ratings for the movies each of them has not seen

# Using user 2 as the reference, compute its similarity with users 29, 32 and 47
# (cosine similarity)

import itertools

def create_df(df:pd.DataFrame, ids:List[int]) -> pd.DataFrame:
    """Collect the rating rows of the given users into one DataFrame.

    Rows are grouped in the order the ids are listed (an id listed twice
    contributes its rows twice); the original index labels are kept.

    Args:
        df: Ratings table with a 'userId' column.
        ids: userIds whose rows should be selected.

    Returns:
        The selected rows, or an empty frame with the columns of ``df`` when
        ``ids`` is empty.
    """
    frames = [df[df['userId'] == _id] for _id in ids]
    if not frames:
        # pd.concat raises ValueError when given no objects to concatenate.
        return df.iloc[0:0].copy()
    return pd.concat(frames)

def pivoting(df:pd.DataFrame) -> pd.DataFrame:
    """Reshape long-format ratings into a user x movie rating matrix.

    Rows are userIds, columns are movieIds and cells hold the rating
    (pivot_table's default aggregation applies if a pair occurs more than
    once). Cells without a rating are filled with 0 and both axis names are
    cleared.
    """
    user_movie = df.pivot_table(values = 'rating', index = 'userId', columns = 'movieId')
    unnamed = user_movie.rename_axis(None).rename_axis(None, axis = 1)
    return unnamed.fillna(0)

def cos_sim(df:pd.DataFrame) -> List:
    """Compute the cosine similarity for every pair of rows of ``df``.

    Args:
        df: Numeric matrix with one row per user (index = user label) and one
            column per item, e.g. the output of a user x movie pivot.

    Returns:
        A list of ``[(label_a, label_b), similarity]`` entries, one per
        unordered pair of rows in index order, with the similarity rounded to
        3 decimals. A pair involving an all-zero row gets 0.0, because the
        cosine is undefined there and would otherwise be NaN.
    """
    labels = list(df.index)
    # Convert once and address rows by position instead of re-filtering the
    # frame with a boolean mask for every pair.
    matrix = df.to_numpy(dtype = float)
    # Each row norm is reused by every pair the row takes part in.
    norms = [norm(row) for row in matrix]

    users_cos_sim = []
    for i, j in itertools.combinations(range(len(labels)), 2):
        denominator = norms[i] * norms[j]
        if denominator == 0:
            similarity = 0.0
        else:
            similarity = round(dot(matrix[i], matrix[j]) / denominator, 3)
        users_cos_sim.append([(labels[i], labels[j]), similarity])
    return users_cos_sim
        
# Compute the average used by the weighted-average approach for one user
def weighted_avg(df:pd.DataFrame, user_id:int):
    """Return the mean of every value in the row(s) of ``df`` labelled ``user_id``.

    All columns take part in the mean and the result is rounded to 3 decimals.
    NOTE(review): any 0 stored in the row is averaged in as well; confirm
    that this is intended when the matrix uses 0 for "not rated".
    """
    selected = df.loc[df.index == user_id]
    flat_values = selected.values.ravel().tolist()
    return round(np.mean(flat_values), 3)

# Use the similarities and the weighted average to predict ratings for the movies that user 2 has not rated yet