In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from scipy import spatial
import operator

In [2]:
anime = pd.read_csv("my-data/anime.csv", error_bad_lines=False)

# We have missing data, so we need to clean.
# from analyzing the data, if the type is a movie and the number of episodes is unkown, then we can put 1.  
# For OVA(Original Video Animation), these are generally one/two episode long animes. I’ve decided to fill the unknown numbers of episodes with 1 again.

# For all the other animes with unknown number of episodes, I’ve filled the known values with the median

In [3]:
anime.loc[(anime["type"]=="OVA") & (anime["episodes"]=="Unknown"),"episodes"] = "1"

anime.loc[(anime["type"] == "Movie") & (anime["episodes"] == "Unknown")] = "1"
  
anime["episodes"] = anime["episodes"].map(lambda x:np.nan if x=="Unknown" else x)

anime["episodes"].fillna(anime["episodes"].median(),inplace = True)


anime["rating"] = anime["rating"].astype(float)
anime["rating"].fillna(anime["rating"].median(),inplace = True)

In [4]:
anime_features = pd.concat([anime["genre"].str.get_dummies(sep=","),
                            pd.get_dummies(anime[["type"]]),
                            anime[["rating"]],anime["episodes"]],axis=1)
# you can see the features by using anime_features.columns

In [5]:
#I used MinMaxScaler from scikit-learn as it scales the values from 0–1.
min_max_scaler = MinMaxScaler()
anime_features = min_max_scaler.fit_transform(anime_features)
np.round(anime_features,2)
# number 2 in round means two decimal points

array([[ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.93,  0.  ],
       [ 1.  ,  0.  ,  0.  , ...,  1.  ,  0.92,  0.03],
       [ 0.  ,  0.  ,  1.  , ...,  1.  ,  0.92,  0.03],
       ..., 
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.43,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.44,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.5 ,  0.  ]])

# The scaling function (MinMaxScaler) returns a numpy array containing the features. Then we fit the KNN model from scikit learn to the data and calculate the nearest neighbors for each distances. In this case I’ve used the unsupervised NearestNeighbors method for implementing neighbor searches.

In [6]:
nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(anime_features)

distances, indices = nbrs.kneighbors(anime_features)

In [7]:
#  Returns the index of the anime if (given the full name)
def get_index_from_name(name):
    return anime[anime["name"]==name].index.tolist()[0]
    
all_anime_names = list(anime.name.values)

In [8]:
# Prints the top K similar animes after querying
def print_similar_animes(query=None):

    if query:
        found_id = get_index_from_name(query)
        for id in indices[found_id][1:]:
            print(anime.ix[id]["name"])
            
print("Start of KNN Recommendation")
pred=print_similar_animes(query="Naruto")

Start of KNN Recommendation
Naruto: Shippuuden
Katekyo Hitman Reborn!
Dragon Ball Z
Dragon Ball Kai
Bleach
Dragon Ball Kai (2014)
Shijou Saikyou no Deshi Kenichi
Rekka no Honoo
Sakigake!! Otokojuku
Medaka Box Abnormal
Kenyuu Densetsu Yaiba
Ben-To
Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi
Kurokami The Animation
Boruto: Naruto the Movie
Naruto x UT
Naruto: Shippuuden Movie 4 - The Lost Tower
Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono
Virtua Fighter


# loading another dataset

In [9]:
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('my-data/u.data', sep='\t', names=r_cols, usecols=range(3))
ratings.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


# Now, we'll group everything by movie ID(item_id), and compute the total number of ratings (each movie's popularity) and the average rating for every movie.
# The raw number of ratings isn't very useful for computing distances between movies, so we'll create a new DataFrame that contains the normalized number of ratings. So, a value of 0 means nobody rated it, and a value of 1 will mean it's the most popular movie there is.

In [10]:
movieProperties = ratings.groupby('item_id').agg({'rating': [np.size, np.mean]})
print(movieProperties.head())

movieNumRatings = pd.DataFrame(movieProperties['rating']['size'])
movieNormalizedNumRatings = movieNumRatings.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
movieNormalizedNumRatings.head()

        rating          
          size      mean
item_id                 
1          452  3.878319
2          131  3.206107
3           90  3.033333
4          209  3.550239
5           86  3.302326


Unnamed: 0_level_0,size
item_id,Unnamed: 1_level_1
1,0.774914
2,0.223368
3,0.152921
4,0.357388
5,0.146048


# Now, let's get the genre information from the u.item file. The way this works is there are 19 fields, each corresponding to a specific genre - a value of '0' means it is not in that genre, and '1' means it is in that genre. A movie may have more than one genre associated with it.

# Then, we'll put together everything into one big Python dictionary called movieDict. Each entry will contain the movie name, list of genre values, the normalized popularity score, and the average rating for each movie.

In [11]:
movieDict = {}
with open('my-data/u.item') as f:
    temp = ''
    for line in f:
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        genres = fields[5:25]
        genres = map(int, genres)
        movieDict[movieID] = (name, genres, movieNormalizedNumRatings.loc[movieID].get('size'), movieProperties.loc[movieID].rating.get('mean'))
# For example, here's the record we end up with for movie ID 1, (Toy Story)
movieDict[1] 
# you can change the number of movieDict[num]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 0.77491408934707906,
 3.8783185840707963)

# Now, let's create a function that computes the (distance) between two movies based on how similar their genres are, and how similar their popularity is.

In [13]:
def ComputeDistance(a, b):
    genresA = a[1]
    genresB = b[1]
    genreDistance = spatial.distance.cosine(genresA, genresB)
    popularityA = a[2]
    popularityB = b[2]
    popularityDistance = abs(popularityA - popularityB)
    return genreDistance + popularityDistance
# For example,here we compute the distance between two movies (movie id 2 and movie id 4)
print(ComputeDistance(movieDict[1], movieDict[4]))
# you can compute any other movies by changing the movieDict[number]
print movieDict[1]
print movieDict[4]

1.08419243986
('Toy Story (1995)', [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0.77491408934707906, 3.8783185840707963)
('Get Shorty (1995)', [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0.35738831615120276, 3.5502392344497609)


# Now, let's compute the distance between some given test movie (Toy Story, in this example) and all of the movies in our data set. then sort those by distance, and print out the K nearest neighbors.

In [14]:
def getNeighbors(movieID, K):
    distances = []
    for movie in movieDict:
        if (movie != movieID):
            dist = ComputeDistance(movieDict[movieID], movieDict[movie])
            distances.append((movie, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(K):
        neighbors.append(distances[x][0])
    return neighbors

K = 10
avgRating=0
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    print (movieDict[neighbor][0])
    # we can print the average rating also by using the print bellow
    #print movieDict[neighbor][0] + " " + str(movieDict[neighbor][3])
avgRating /= float(K)


Liar Liar (1997)
Aladdin (1992)
Willy Wonka and the Chocolate Factory (1971)
Monty Python and the Holy Grail (1974)
Full Monty, The (1997)
George of the Jungle (1997)
Beavis and Butt-head Do America (1996)
Birdcage, The (1996)
Home Alone (1990)
Aladdin and the King of Thieves (1996)


## 