In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
from pylab import rcParams

import string
import math

In [2]:
# let's define a few functions we need
# first off, to get an avg vector for arbitrary count of vectors
def average_input(args):
    # get number of total inputs
    count = len(args)
    if(count == 0):
        return
    if(count == 1):
        return args[0][3:]
    avg = []
    # we know all input vectors will be of the same length
    # cut out vector values for index, animeID, and title_english
    for i in range(3, len(args[0])):
        isum = sum([vector[i] for vector in args])
        avg.append(isum/count)
    return avg

In [3]:
# now, let's define our distance calculation
def cosine_distance(vector1, vector2):
    # cosTHETA = (v1 dot v2)/(mag_v1 * mag_v2)
    # dot_prod = sum(tup[0] * tup[1] for tup in zip(vector1, vector2))
    dot_prod = np.dot(vector1, vector2)
    mag1 = np.sqrt(np.dot(vector1, vector1))
    mag2 = np.sqrt(np.dot(vector2, vector2))
    return (dot_prod / (mag1 * mag2))

In [4]:
# yeeeet time to implement K-Nearest Neighbors
# note: query is our average vector for the input anime data
def knn(query, data, k):
    neighbois = []
    # time to yeet thru dem dataframe rows
    for index, row in data.iterrows():
        # get list for the current row
        # row includes animeID, title_english cols
        entry = list(row)[2:]
        # get the distance between query and current data point
        distance = cosine_distance(query, entry)
        # append distance and index to list
        neighbois.append((distance, index))
    # time to sort list least to greatest distance or ascending (default)
    neighbois = sorted(neighbois)
    # nearest k-neighbors, by index label (animeID)
    k_nearest_neighbois = [boi[1] for boi in neighbois[:k]]
    # noice
    return k_nearest_neighbois

In [5]:
# time to clean out the input vectors from our data
# aka we can't recommend the animes user gave
def inputs_done_gone(big_boi_data, args):
    # get list of all indices of user input animes
    index_list = [anime[0] for anime in args]
    # remove from data
    clean_boi = big_boi_data.loc[~big_boi_data.index.isin(index_list)]
    return clean_boi

In [6]:
# method to get corresponding animes given anime title(s)??
def get_anime(data, titles):
    animes = []
    # iterate thru given titles
    for title in titles:
        # get all possible entries by title
        poss_anime = data[[title in item for item in data['title_english']]]
        if(len(poss_anime.index) == 0): # no possible animes
            print("Anime \'%s\' not found. Proceeding without \'%s\'."%(title,title))
        elif(len(poss_anime.index) == 1): # only one possible anime
            anime_data = [poss_anime.index] + list(poss_anime.loc[poss_anime.index[0]])
            full_title = anime_data[2] # index 2 is title
            print("Selected anime \'%s\'."%full_title)
            # add anime to list of anime
            animes.append(anime_data)
        else: # 1 or more possible title
            print(poss_anime[["animeID","title_english"]])
            index = input("Please enter indexes from above separated with commas ',' or -1 if absent: \n Ex. enter\'4,2,0\' for indexes 4, 2, and 0  ")
#           while(sum([index == i for i in poss_anime.index]) == 0 and index != -1):
#               index = input("Please enter indexes from above separated with commas ',' or -1 if absent: \n Ex. enter\'4,2,0\' for indexes 4, 2, and 0  ")
            if(index == "-1"):
                print("Anime \'%s\' not found. Proceeding without \'%s\'."%(title,title))
            else:
                indices = index.split(",") # array of index values, as str
                if(len(indices) == 1):
                    index = int(indices[0])
                    anime_data = [index] + list(poss_anime.loc[index])
                    full_title = anime_data[2] # index 2 is title_english
                    print("Selected anime \'%s\'."%full_title)
                    # add anime to list of anime
                    animes.append(anime_data)
                else: # more than one index found
                    for i in indices:
                        index = int(i)
                        anime_data = [index] + list(poss_anime.loc[index])
                        full_title = anime_data[2] # index 2 is title_english
                        print("Selected anime \'%s\'."%full_title)
                        # add anime to list of anime
                        animes.append(anime_data)
    return animes

In [7]:
# let's try it with our data lel
anime_df = pd.read_csv("data/normalized_princ_model_training_data.csv")
anime_df.head()

Unnamed: 0.1,Unnamed: 0,animeID,title_english,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,...,principal component 291,principal component 292,principal component 293,principal component 294,principal component 295,principal component 296,principal component 297,principal component 298,principal component 299,principal component 300
0,0,1,Cowboy Bebop,1.311131,-1.299874,-4.241516,2.813546,3.393908,0.74805,0.997276,...,0.09453,0.030463,0.210674,-0.457683,0.407363,0.830158,0.45825,-0.051177,0.110439,0.195252
1,1,5,Cowboy Bebop: The Movie,-0.877062,-0.658764,-6.071744,-2.503581,0.472272,-1.036241,-1.964439,...,0.474505,-0.156904,0.256763,0.006545,-0.032722,0.191925,-0.107632,0.470467,-0.096228,0.636446
2,2,6,Trigun,-2.9588,0.186702,-1.310995,-1.056926,3.076349,2.560385,-0.645996,...,-0.403138,-0.00012,0.092362,-0.004627,0.338243,0.153195,0.450634,-0.26875,0.343677,-0.431299
3,3,7,Witch Hunter Robin,0.579969,-0.164921,-3.543831,0.887435,-0.469471,1.760493,0.305682,...,0.448201,0.520933,0.287756,0.552286,-0.422581,-1.424618,0.048834,0.433366,-0.493677,-0.000751
4,4,8,Beet the Vandel Buster,-1.146832,1.695643,-5.688726,-0.873938,0.49984,-0.649929,0.549449,...,-0.000909,-0.194762,-0.081286,0.063505,-0.39427,0.371577,0.520885,-0.250255,-0.098058,0.640835


In [8]:
# remove Unnamed: 0 column
anime_df = anime_df.drop(columns=['Unnamed: 0'])
anime_df.head()

Unnamed: 0,animeID,title_english,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,...,principal component 291,principal component 292,principal component 293,principal component 294,principal component 295,principal component 296,principal component 297,principal component 298,principal component 299,principal component 300
0,1,Cowboy Bebop,1.311131,-1.299874,-4.241516,2.813546,3.393908,0.74805,0.997276,-2.66376,...,0.09453,0.030463,0.210674,-0.457683,0.407363,0.830158,0.45825,-0.051177,0.110439,0.195252
1,5,Cowboy Bebop: The Movie,-0.877062,-0.658764,-6.071744,-2.503581,0.472272,-1.036241,-1.964439,0.46638,...,0.474505,-0.156904,0.256763,0.006545,-0.032722,0.191925,-0.107632,0.470467,-0.096228,0.636446
2,6,Trigun,-2.9588,0.186702,-1.310995,-1.056926,3.076349,2.560385,-0.645996,-0.407952,...,-0.403138,-0.00012,0.092362,-0.004627,0.338243,0.153195,0.450634,-0.26875,0.343677,-0.431299
3,7,Witch Hunter Robin,0.579969,-0.164921,-3.543831,0.887435,-0.469471,1.760493,0.305682,-1.53436,...,0.448201,0.520933,0.287756,0.552286,-0.422581,-1.424618,0.048834,0.433366,-0.493677,-0.000751
4,8,Beet the Vandel Buster,-1.146832,1.695643,-5.688726,-0.873938,0.49984,-0.649929,0.549449,0.12864,...,-0.000909,-0.194762,-0.081286,0.063505,-0.39427,0.371577,0.520885,-0.250255,-0.098058,0.640835


In [9]:
# let's get our input animes :D
input_titles = []

while(True):
    title = input("Enter anime name: \n (\'-1\' when done)   ")
    if(title == "-1"):
        break
    input_titles.append(title)

# print input titles
print(input_titles)

Enter anime name: 
 ('-1' when done)   Attack on Titan
Enter anime name: 
 ('-1' when done)   Bungo Stray Dogs
Enter anime name: 
 ('-1' when done)   My Hero Academia
Enter anime name: 
 ('-1' when done)   Nanbaka
Enter anime name: 
 ('-1' when done)   One Punch Man
Enter anime name: 
 ('-1' when done)   -1
['Attack on Titan', 'Bungo Stray Dogs', 'My Hero Academia', 'Nanbaka', 'One Punch Man']


In [10]:
# time to get our anime data from overall training data
my_anime_list = get_anime(anime_df, input_titles)

# print all titles from my_anime_list
print([anime[2] for anime in my_anime_list])

      animeID                           title_english
1810    16498                         Attack on Titan
1916    19285         Attack on Titan: Since That Day
2072    23775  Attack on Titan: Crimson Bow and Arrow
2073    23777       Attack on Titan: Wings of Freedom
2119    25777                Attack on Titan Season 2
2304    31374            Attack on Titan: Junior High
2705    35760                Attack on Titan Season 3
Please enter indexes from above separated with commas ',' or -1 if absent: 
 Ex. enter'4,2,0' for indexes 4, 2, and 0  1810,2119
Selected anime 'Attack on Titan'.
Selected anime 'Attack on Titan Season 2'.
      animeID       title_english
2316    31478    Bungo Stray Dogs
2424    32867  Bungo Stray Dogs 2
Please enter indexes from above separated with commas ',' or -1 if absent: 
 Ex. enter'4,2,0' for indexes 4, 2, and 0  2316
Selected anime 'Bungo Stray Dogs'.
      animeID       title_english
2365    31964    My Hero Academia
2503    33486  My Hero Academia 2

In [11]:
# need to clean our inputs from overall database
clean_anime_df = inputs_done_gone(anime_df, my_anime_list)

clean_anime_df.head()

Unnamed: 0,animeID,title_english,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,...,principal component 291,principal component 292,principal component 293,principal component 294,principal component 295,principal component 296,principal component 297,principal component 298,principal component 299,principal component 300
0,1,Cowboy Bebop,1.311131,-1.299874,-4.241516,2.813546,3.393908,0.74805,0.997276,-2.66376,...,0.09453,0.030463,0.210674,-0.457683,0.407363,0.830158,0.45825,-0.051177,0.110439,0.195252
1,5,Cowboy Bebop: The Movie,-0.877062,-0.658764,-6.071744,-2.503581,0.472272,-1.036241,-1.964439,0.46638,...,0.474505,-0.156904,0.256763,0.006545,-0.032722,0.191925,-0.107632,0.470467,-0.096228,0.636446
2,6,Trigun,-2.9588,0.186702,-1.310995,-1.056926,3.076349,2.560385,-0.645996,-0.407952,...,-0.403138,-0.00012,0.092362,-0.004627,0.338243,0.153195,0.450634,-0.26875,0.343677,-0.431299
3,7,Witch Hunter Robin,0.579969,-0.164921,-3.543831,0.887435,-0.469471,1.760493,0.305682,-1.53436,...,0.448201,0.520933,0.287756,0.552286,-0.422581,-1.424618,0.048834,0.433366,-0.493677,-0.000751
4,8,Beet the Vandel Buster,-1.146832,1.695643,-5.688726,-0.873938,0.49984,-0.649929,0.549449,0.12864,...,-0.000909,-0.194762,-0.081286,0.063505,-0.39427,0.371577,0.520885,-0.250255,-0.098058,0.640835


In [12]:
# now, before we can run KNN, we need to get the average vector of our inputs
avg = average_input(my_anime_list)
print(avg)

[-0.2618716985594127, -0.6223437285404906, -2.7640645182996977, 0.5428536578691263, -0.5229596052780014, 1.0066451033912986, -2.3012066939333726, -1.4465223120429804, 0.218998906746096, 0.10028385543827881, -2.801273084710891, -1.4157446530293591, -0.09573139804967357, -0.4487584234639862, 0.6688288308052882, 0.5816551107348664, 0.5145586424785372, 1.5253865493605248, -1.3692340860569556, -0.7028422402527494, 1.3724199259437762, 0.7696928436867128, -0.1975515667438306, -0.7359342669934961, -0.1338652804134327, -0.13606701782456906, 0.48198974614052575, -0.5875563697743638, 1.0508677821300694, -1.8522454758873785, -0.8279363858652729, -0.16698363658952506, 0.19814182106244355, 1.0252788945514293, -0.7430767055284059, -0.7999083473766089, 0.01795837136957218, -0.5023618590399732, 0.3539644507833638, -0.5425692059823959, -0.4868910734895361, -0.32090080501923046, -0.24654802262629194, 0.8888785431827106, -0.36009024805149237, -1.454975270680281, 0.5090272312764287, -0.24462087389773057, -

In [13]:
# run KNN on our inputs: my_anime_list and k = 7 (for example)
rec_indices = knn(query=avg, data=clean_anime_df, k=7)

print(rec_indices)

[1853, 2062, 704, 1399, 178, 2394, 2627]


In [14]:
# let's see what we were recommended
output_df = pd.read_csv("data/relevant_output_data.csv")
output_df.head()

Unnamed: 0.1,Unnamed: 0,animeID,title_english,synopsis
0,0,1,Cowboy Bebop,"In the year 2071, humanity has colonized sever..."
1,6,5,Cowboy Bebop: The Movie,"Another day, another bounty—such is the life o..."
2,16,6,Trigun,"Vash the Stampede is the man with a $$60,000,0..."
3,22,7,Witch Hunter Robin,Witches are individuals with special powers li...
4,28,8,Beet the Vandel Buster,It is the dark century and the people are suff...


In [15]:
# drop Unnamed: 0 column
output_df = output_df.drop(columns=["Unnamed: 0"])
output_df.head()

Unnamed: 0,animeID,title_english,synopsis
0,1,Cowboy Bebop,"In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: The Movie,"Another day, another bounty—such is the life o..."
2,6,Trigun,"Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,Witches are individuals with special powers li...
4,8,Beet the Vandel Buster,It is the dark century and the people are suff...


In [16]:
# Now let's see the general info about the animes we are recommended
for i in rec_indices:
    details = list(output_df.loc[i])
    print("TITLE: %s"%details[1])
    print("SYNOPSIS: %s"%details[2])

TITLE: Recently, my sister is unusual.
SYNOPSIS: Saikin, Imouto no Yousu ga Chotto Okashiinda ga. follows a family just starting to rebuild. When they marry, Mr. and Mrs. Kanzaki bring a teenage son and daughter along for the ride. But high school freshman Mitsuki Kanzaki is less than thrilled. Stinging from a history of absent and abusive father figures, she is slow to accept her stepfather and stepbrother. But after an accident lands Mitsuki in the hospital, she finds herself possessed by the ghost of Hiyori Kotobuki, a girl her age who was deeply in love with Mitsuki's stepbrother Yuuya. Hiyori cannot pass on to her final reward because of her unrequited love for Yuuya, meaning she's got to consummate it... in Mitsuki's body?! Now, Mitsuki's life depends on getting Hiyori to Heaven. But will she get used to sharing herself with a pushy, amorous ghost? Can she overcome her distrust of her new family? Can she bring herself to fulfill Hiyori's feelings for Yuuya? And might she be hidin