In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

## KNN 演算法
- 將畢氏定理推廣到更高維的空間：以歐氏距離 $d(a, b) = \sqrt{\sum_i (a_i - b_i)^2}$ 衡量兩筆資料的差異，距離越小代表越相似

In [3]:
# Reference movie: the per-category scores that every candidate movie is
# compared against. Built from (category, score) pairs so the category
# order is explicit.
standard_movie_score = dict([
    ("love", 5),
    ("internation", 7),
    ("sword", 8),
    ("car", 10),
    ("action", 2),
])
standard_movie_score

{'love': 5, 'internation': 7, 'sword': 8, 'car': 10, 'action': 2}

In [5]:
# Per-category scores of the candidate movies, one dict per movie.
# The scores are written as a compact table (one row per movie, one column
# per category) and expanded into dicts keyed by category name.
_score_categories = ("love", "internation", "sword", "car", "action")
_score_rows = (
    (2, 8, 8, 5, 6),
    (5, 6, 9, 2, 5),
    (8, 2, 0, 0, 10),
    (5, 8, 8, 8, 3),
)
movie_scores = [dict(zip(_score_categories, row)) for row in _score_rows]
movie_scores

[{'love': 2, 'internation': 8, 'sword': 8, 'car': 5, 'action': 6},
 {'love': 5, 'internation': 6, 'sword': 9, 'car': 2, 'action': 5},
 {'love': 8, 'internation': 2, 'sword': 0, 'car': 0, 'action': 10},
 {'love': 5, 'internation': 8, 'sword': 8, 'car': 8, 'action': 3}]

In [8]:
# Titles of the candidate movies, listed in the same order as their score dicts.
movie_names = [
    "復仇者聯盟",
    "決戰中途島",
    "冰雪奇緣",
    "雙子殺手",
]

In [15]:
# Maps each movie title to its Euclidean distance from the reference movie
# (a smaller distance means a more similar score profile).
result = {}
for title, ratings in zip(movie_names, movie_scores):
    print(title, ":", ratings)
    # Squared per-category difference between this movie and the reference.
    squared_gaps = [
        (rating - standard_movie_score.get(category)) ** 2
        for category, rating in ratings.items()
    ]
    result[title] = math.sqrt(sum(squared_gaps))

result

復仇者聯盟 : {'love': 2, 'internation': 8, 'sword': 8, 'car': 5, 'action': 6}
決戰中途島 : {'love': 5, 'internation': 6, 'sword': 9, 'car': 2, 'action': 5}
冰雪奇緣 : {'love': 8, 'internation': 2, 'sword': 0, 'car': 0, 'action': 10}
雙子殺手 : {'love': 5, 'internation': 8, 'sword': 8, 'car': 8, 'action': 3}


{'復仇者聯盟': 7.14142842854285,
 '決戰中途島': 8.660254037844387,
 '冰雪奇緣': 16.186414056238647,
 '雙子殺手': 2.449489742783178}

In [18]:
# The most similar movie is the one with the smallest distance; keep it as a
# (title, distance) pair. Ties resolve to the first title in insertion order.
_closest_title = min(result, key=result.get)
the_best_related_movie = (_closest_title, result[_closest_title])
the_best_related_movie

('雙子殺手', 2.449489742783178)

### 使用 pandas 與 numpy

In [43]:
# Build one table: a "name" column holding the titles, followed by one
# column per scored category.
names_part = pd.Series(movie_names, name="name")
scores_part = pd.DataFrame(movie_scores)
movies_df = pd.concat([names_part, scores_part], axis=1)  # merge column-wise
movies_df

Unnamed: 0,name,love,internation,sword,car,action
0,復仇者聯盟,2,8,8,5,6
1,決戰中途島,5,6,9,2,5
2,冰雪奇緣,8,2,0,0,10
3,雙子殺手,5,8,8,8,3


In [48]:
# Turn the reference scores into a Series indexed by category name, then show
# both the Series and its index (the category labels).
standard_movie_score_ser = pd.Series(data=standard_movie_score)
print(standard_movie_score_ser, standard_movie_score_ser.index, sep="\n")

love            5
internation     7
sword           8
car            10
action          2
dtype: int64
Index(['love', 'internation', 'sword', 'car', 'action'], dtype='object')


In [49]:
# Show the row labelled 0, restricted to the category columns named by the
# reference Series' index (i.e. without the "name" column).
category_labels = standard_movie_score_ser.index
movies_df.loc[0, category_labels]

love           2
internation    8
sword          8
car            5
action         6
Name: 0, dtype: object

In [57]:
# Euclidean distance from every movie to the reference movie, computed for all
# rows at once instead of looping over the DataFrame row by row.
#
# - Only the category columns (the reference Series' index) are selected, so
#   the non-numeric "name" column never takes part in the arithmetic.
# - `.sub(..., axis="columns")` matches the Series' index labels against the
#   DataFrame's column labels, so each category is compared with its own
#   reference score.
# - `.tolist()` keeps `distances` a plain list of Python floats, one per row,
#   in row order.
category_gaps = movies_df[standard_movie_score_ser.index].sub(
    standard_movie_score_ser, axis="columns"
)
distances = np.sqrt((category_gaps ** 2).sum(axis="columns")).tolist()
distances

[7.14142842854285, 8.660254037844387, 16.186414056238647, 2.449489742783178]

In [58]:
# Attach the computed distances as a "distance" column (one value per row,
# in row order) and display the resulting table.
movies_df = movies_df.assign(distance=distances)
movies_df

Unnamed: 0,name,love,internation,sword,car,action,distance
0,復仇者聯盟,2,8,8,5,6,7.141428
1,決戰中途島,5,6,9,2,5,8.660254
2,冰雪奇緣,8,2,0,0,10,16.186414
3,雙子殺手,5,8,8,8,3,2.44949


In [63]:
# Locate the row with the smallest distance (the most similar movie), then
# print that row followed by its Python type.
min_index = movies_df["distance"].idxmin()
nearest_row = movies_df.loc[min_index, :]
print(nearest_row, type(nearest_row), sep="\n")

name              雙子殺手
love                 5
internation          8
sword                8
car                  8
action               3
distance       2.44949
Name: 3, dtype: object
<class 'pandas.core.series.Series'>
