In [1]:
import argparse
import json
import numpy as np

- 데이터가 사람과 영화선호도 구성되어 있다면
- 두 사람을 서로 비교하는 방법을 알아야 한다 - 유사성 점수
- 유사성 점수 : 두 포인트가 얼마나 유사한지
- 유클리드 점수, 피어슨 점수
- 유클리드 점수가 0과 1 사이가 되더록, 두 개체간의 유클리드 거리가 크면 유클리드 점수는 낮다
- 피어슨 점수 : -1 ~ 1 사이의 값을 갖는다

In [2]:
def build_arg_parser():
  parser = argparse.ArgumentParser(description='Compute similarity score')
  parser.add_argument('--user1', dest='user1', required=True, help='First user')
  parser.add_argument('--user2', dest='user2', required=True, help='Second user')
  parser.add_argument('--score-type', dest='score_type', required=True, choices=['Euclidean', 'Pearson'], help='Similarity metric to be used')
  return parser

In [3]:
def euclidean_score(dataset, user1, user2):
  if user1 not in dataset:
    raise TypeError(f"Cannot find {user1} in the dataset")
  if user2 not in dataset:
    raise TypeError(f"Cannot find {user2} in the dataset")
  common_movies = {}
  for item in dataset[user1]:
    if item in dataset[user2]:
      common_movies[item] = 1
  if len(common_movies) == 0:
    return 0
  squared_diff = []
  for item in dataset[user1]:
    if item in dataset[user2]:
      squared_diff.append(np.square(dataset[user1][item] - dataset[user2][item]))
  return 1 / (1+ np.sqrt(np.sum(squared_diff)))

def pearson_score(dataset, user1, user2):
  if user1 not in dataset:
    raise TypeError(f"Cannot find {user1} in the dataset")
  if user2 not in dataset:
    raise TypeError(f"Cannot find {user2} in the dataset")
  common_movies = {}
  for item in dataset[user1]:
    if item in dataset[user2]:
      common_movies[item] = 1
  if len(common_movies) == 0:
    return 0

  user1_sum = np.sum(dataset[user1][item] for item in common_movies)
  user2_sum = np.sum(dataset[user2][item] for item in common_movies)

  user1_squared_sum = np.sum(np.square(dataset[user1][item]) for item in common_movies)
  user2_squared_sum = np.sum(np.square(dataset[user2][item]) for item in common_movies)

  sum_of_products = np.sum(np.sum(dataset[user1][item] * dataset[user2][item]) for item in common_movies)

  num_rations = len(common_movies)

  sxy = sum_of_products - (user1_sum * user2_sum / num_rations)
  sxx = user1_squared_sum - np.square(user1_sum)/ num_rations
  syy = user2_squared_sum - np.square(user2_sum) / num_rations

  if sxx*syy == 0:
    return 0

  return sxy / np.sqrt(sxx * syy)

In [5]:
import pandas as pd

with open('/content/drive/MyDrive/machinelearning/ratings.json', 'r') as f:
  data = json.loads(f.read())

In [8]:
score_type = 'Pearson'
if score_type == 'Euclidean':
  print('Euclidean score')
  print(euclidean_score(data, 'David Smith', 'Bill Duffy'))
else:
  print('Pearson score')
  print(pearson_score(data, 'David Smith', 'Bill Duffy'))

Pearson score
0.9909924304103233


  user1_sum = np.sum(dataset[user1][item] for item in common_movies)
  user2_sum = np.sum(dataset[user2][item] for item in common_movies)
  user1_squared_sum = np.sum(np.square(dataset[user1][item]) for item in common_movies)
  user2_squared_sum = np.sum(np.square(dataset[user2][item]) for item in common_movies)
  sum_of_products = np.sum(np.sum(dataset[user1][item] * dataset[user2][item]) for item in common_movies)


협업필터링을 사용해서 유사한 사용자 찾기
- 협업필터링 : 새로운 객체를 찾아내기 위해 데이터 세트 내 객체 간에 패턴을 찾는 프로세스
- 데이터 세트에서 유사한 사용자를 살펴봄으로써 추천을 제공

In [15]:
def find_similar_users(dataset, user, num_users):
  if user not in dataset:
    raise TypeError(f'Cannot find {user} in the dataset')
  scores = np.array([ [x, pearson_score(dataset, user, x)] for x in dataset if x != user])
  scores_sorted = np.argsort(scores[:,1])[::-1]
  top_users = scores_sorted[:num_users]
  return scores[top_users]

In [16]:
with open('/content/drive/MyDrive/machinelearning/ratings.json', 'r') as f:
  data = json.loads(f.read())

user = 'Bill Duffy'
similar_user = find_similar_users(data, 'Bill Duffy', 3)
similar_user

  user1_sum = np.sum(dataset[user1][item] for item in common_movies)
  user2_sum = np.sum(dataset[user2][item] for item in common_movies)
  user1_squared_sum = np.sum(np.square(dataset[user1][item]) for item in common_movies)
  user2_squared_sum = np.sum(np.square(dataset[user2][item]) for item in common_movies)
  sum_of_products = np.sum(np.sum(dataset[user1][item] * dataset[user2][item]) for item in common_movies)


array([['David Smith', '0.9909924304103233'],
       ['Samuel Miller', '0.8798679229074933'],
       ['Adam Cohen', '0.8575894485643643']], dtype='<U32')

영화 추천 시스템 구축

In [22]:
def get_recommendations(dataset, input_user):
  if input_user not in dataset:
    raise TypeError(f"Cannot find {input_user} in the dataset")

  overall_scores = {}
  similarity_scores = {}
  for user in [x for x in dataset if x != input_user]:
    similarity_score = pearson_score(dataset, input_user, user)
    if similarity_score <= 0:
      continue
    filtered_list = [x for x in dataset[user] if x not in dataset[input_user] or dataset[input_user][x] == 0]
    for item in filtered_list:
      overall_scores.update({item:dataset[user][item]*similarity_score})
      similarity_scores.update({item:similarity_score})
    if len(overall_scores) == 0:
      return "no recommendations possible"
    movie_scores = np.array([[score/similarity_scores[item], item] for item, score in overall_scores.items()])
    movie_scores = movie_scores[np.argsort(movie_scores[:,0])[::-1]]
    movie_recommendations = [movie for _, movie in movie_scores]
    return movie_recommendations

In [23]:
movie = get_recommendations(data, 'Chris Duncan')
movie

  user1_sum = np.sum(dataset[user1][item] for item in common_movies)
  user2_sum = np.sum(dataset[user2][item] for item in common_movies)
  user1_squared_sum = np.sum(np.square(dataset[user1][item]) for item in common_movies)
  user2_squared_sum = np.sum(np.square(dataset[user2][item]) for item in common_movies)
  sum_of_products = np.sum(np.sum(dataset[user1][item] * dataset[user2][item]) for item in common_movies)


['Goodfellas', 'Scarface', 'Vertigo']

In [24]:
for idx, data in enumerate(movie):
  print(f"{idx+1}. {data}")

1. Goodfellas
2. Scarface
3. Vertigo
