## K-Nearest Neighbors

$$Distance = \sqrt{(A_1 - B_1)^2 + (A_2 - B_2)^2 + ... + (A_n - B_n)^2}$$

In [1]:
star_wars = [125, 1977, 11000000]
raiders = [115, 1981, 18000000]
mean_girls = [97, 2004, 17000000]

def distance(movie1, movie2):
    distance = 0
    for i in range(len(movie1)):
        distance += (movie1[i] - movie2[i])**2
    distance = distance** 0.5
    return distance

print(distance(star_wars, raiders))
print(distance(star_wars, mean_girls))

7000000.000008286
6000000.000126083


In [2]:
import pandas as pd
df = pd.read_csv('movies.csv', encoding='utf8')
movie_dataset = {}
movie_labels = {}
dic_min_max = {}
dic_min_max['budget'] = [min(df['budget']), max(df['budget'])]
dic_min_max['runtime'] = [min(df['runtime']), max(df['runtime'])]
dic_min_max['release_year'] = [min(df['release_year']), max(df['release_year'])]
for i, title in enumerate(df['title']):
    movie_dataset[title] = [df.loc[i, 'budget'], df.loc[i, 'runtime'], df.loc[i, 'release_year']]
    movie_labels[title] = df.loc[i, 'label']
    
print(movie_dataset['Bruce Almighty'])
print(movie_labels['Bruce Almighty'])
print(dic_min_max)
df.head(2)

[80000000, 101, 2003]
0
{'budget': [10000, 380000000], 'runtime': [63, 338], 'release_year': [1916, 2016]}


Unnamed: 0,title,budget,release_year,runtime,vote_average,label
0,Avatar,237000000,2009,162,7.2,1
1,Pirates of the Caribbean: At World's End,300000000,2007,169,6.9,0


In [3]:
def distance(movie1, movie2):
    squared_difference = 0
    for i in range(len(movie1)):
        squared_difference += (movie1[i] - movie2[i]) ** 2
    final_distance = squared_difference ** 0.5
    return final_distance

def classify(unknown, dataset, labels, k):
    distances = []
    #Looping through all points in the dataset
    for title in dataset:
        movie = dataset[title]
        distance_to_point = distance(movie, unknown)
        #Adding the distance and point associated with that distance
        distances.append([distance_to_point, title])
    distances.sort()
    #Taking only the k closest points
    neighbors = distances[0:k]
    num_good = 0
    num_bad = 0
    for neighbor in neighbors:
        title = neighbor[1]
        if labels[title] == 0:
            num_bad += 1
        elif labels[title] == 1:
            num_good += 1
    if num_good > num_bad:
        return 1
    else:
        return 0

print(classify([.4, .2, .9], movie_dataset, movie_labels, 5))

0


## Normalize data

In [4]:
release_dates = [1897, 1998, 2000, 1948, 1962, 1950, 1975, 1960, 2017, 1937, 1968, 1996, 1944, 1891, 1995, 1948, 2011, 1965, 1891, 1978]

def min_max_normalize(lst):
    minimum = min(lst)
    maximum = max(lst)
    diff = maximum - minimum
    normalized = [(i-minimum)/diff for i in lst]
    return normalized

print(min_max_normalize(release_dates)[:3])

[0.047619047619047616, 0.8492063492063492, 0.8650793650793651]


In [5]:
def normalize_point(lst):
    lst_col = ['budget', 'runtime', 'release_year']
    for i in range(len(lst)):
        min_v = dic_min_max[lst_col[i]][0]
        max_v = dic_min_max[lst_col[i]][1]
        norm_v = (lst[i] - min_v) / (max_v - min_v)
        lst[i] = norm_v
    return lst
normalize_point([350000, 132, 2017])

[0.0008947603884312745, 0.2509090909090909, 1.01]

In [6]:
print("Call Me By Your Name" in movie_dataset)
my_movie = [350000, 132, 2017]
normalized_my_movie = normalize_point(my_movie)
print(normalized_my_movie)
print(classify(normalized_my_movie, movie_dataset, movie_labels, 5))

False
[0.0008947603884312745, 0.2509090909090909, 1.01]
0


In [7]:
lst_cols = ['budget', 'runtime', 'release_year']
df_norm = df.copy()
df_norm[lst_cols] = (df_norm[lst_cols]-df_norm[lst_cols].min())/(df_norm[lst_cols].max()-df_norm[lst_cols].min())
df_norm.head(2)

Unnamed: 0,title,budget,release_year,runtime,vote_average,label
0,Avatar,0.623674,0.93,0.36,7.2,1
1,Pirates of the Caribbean: At World's End,0.789468,0.91,0.385455,6.9,0


## Creat random training set, testing set

In [8]:
import random

print(len(df_norm))
random.seed(3)
lst_train_idx = random.sample(range(len(df_norm)), int(len(df_norm)*0.8))
print(lst_train_idx[:3])
df_train = df_norm[df_norm.index.isin(lst_train_idx)]
df_val = df_norm[~df_norm.index.isin(lst_train_idx)]
print(len(df_train), len(df_val))

training_set, validation_set = {}, {}
training_labels, validation_labels = {}, {}
for i, k in enumerate(movie_dataset):
    df_tmp = df_norm[df_norm['title']==k].reset_index(drop=True)
    if i in lst_train_idx:
        training_set[k] = [df_tmp.loc[0,'budget'], df_tmp.loc[0,'runtime'], df_tmp.loc[0,'release_year']]
        training_labels[k] = movie_labels[k]
    else:
        validation_set[k] = [df_tmp.loc[0,'budget'], df_tmp.loc[0,'runtime'], df_tmp.loc[0,'release_year']]
        validation_labels[k] = movie_labels[k]
df_train.head(2)
print(len(training_set), len(validation_set))

3690
[974, 2427, 2229]
2952 738
2951 737


In [9]:
# Training and Validation Sets
test = list(validation_set.keys())[0]
print(test)
print(validation_set[test])
print(validation_labels[test])
guess = classify(validation_set[test], training_set, training_labels, 5)
print(guess)
if guess == validation_labels[test]:
    print("Correct!")
else:
    print("Wrong!")

Spider-Man 3
[0.6789389194452485, 0.27636363636363637, 0.91]
0
0
Correct!


## Accuracy

In [10]:
def find_validation_accuracy(training_set, training_labels, validation_set, validation_labels, k):
    num_correct = 0.0
    for title in validation_set:
        guess = classify(validation_set[title], training_set, training_labels, k)
        if guess == validation_labels[title]:
            num_correct += 1
    return num_correct / len(validation_set)

print(find_validation_accuracy(training_set, training_labels, validation_set, validation_labels, 3))

0.7924016282225237


In [11]:
from sklearn.neighbors import KNeighborsClassifier

training_set_2 = [v for k, v in training_set.items()]
training_labels_2 = [v for k, v in training_labels.items()]

classifier = KNeighborsClassifier(5)

classifier.fit(training_set_2, training_labels_2)
lst = [[.45, .2, .5], [.25, .8, .9], [.1, .1, .9]]
print(classifier.predict(lst))

[0 0 0]
