In [85]:
import collections
from tqdm import tqdm
import statistics
import os
import copy
import math
import numpy as np
import csv

In [86]:
rows = []

# Lookup tables filled from the CSV below.
anime_id_to_name = {}
anime_name_to_id = {}
anime_id_to_genres = {}

with open('myanimelist/anime.csv', newline='') as f:
    file_rows = csv.reader(f)
    next(file_rows, None)  # discard the header line
    for row in file_rows:
        anime_id = int(row[0])
        anime_name = row[1]
        # Column 2 is a comma-separated genre list; drop empty entries and
        # the single leading space left after each comma.
        anime_genres = {
            genre[1:] if genre[0] == ' ' else genre
            for genre in row[2].split(',')
            if genre
        }

        anime_id_to_name[anime_id] = anime_name
        anime_name_to_id[anime_name] = anime_id
        anime_id_to_genres[anime_id] = anime_genres

# Live dict views: they track any later change to the lookup tables.
anime_names = anime_name_to_id.keys()
anime_ids = anime_id_to_name.keys()

In [87]:
# Peek at the first loaded entry: its title followed by the (id, genres) pair.
first_id = list(anime_id_to_genres)[0]
print(anime_id_to_name[first_id], (first_id, anime_id_to_genres[first_id]))

Kimi no Na wa. (32281, {'Supernatural', 'Romance', 'Drama', 'School'})


In [88]:
# all_genres_counter: how many titles carry each genre.
# all_genres: every distinct genre seen in the data set.
all_genres_counter = collections.Counter()
all_genres = set()
for genre_set in anime_id_to_genres.values():
    all_genres_counter.update(genre_set)
    all_genres.update(genre_set)

# Total number of titles.
anime_count = len(anime_ids)

In [89]:
# Show the three summary values, separated by one blank line each.
print(all_genres_counter, all_genres, anime_count, sep='\n\n')

Counter({'Comedy': 4645, 'Action': 2845, 'Adventure': 2348, 'Fantasy': 2309, 'Sci-Fi': 2070, 'Drama': 2016, 'Shounen': 1711, 'Kids': 1609, 'Romance': 1464, 'School': 1220, 'Slice of Life': 1220, 'Hentai': 1141, 'Supernatural': 1037, 'Mecha': 944, 'Music': 860, 'Historical': 806, 'Magic': 778, 'Ecchi': 637, 'Shoujo': 603, 'Seinen': 547, 'Sports': 543, 'Mystery': 495, 'Super Power': 465, 'Military': 426, 'Parody': 408, 'Space': 381, 'Horror': 369, 'Harem': 317, 'Demons': 294, 'Martial Arts': 265, 'Dementia': 240, 'Psychological': 229, 'Police': 197, 'Game': 181, 'Samurai': 148, 'Vampire': 102, 'Thriller': 87, 'Cars': 72, 'Shounen Ai': 65, 'Shoujo Ai': 55, 'Josei': 54, 'Yuri': 42, 'Yaoi': 39})

{'Kids', 'Action', 'Yuri', 'Military', 'Shounen', 'Vampire', 'Police', 'Historical', 'Fantasy', 'Dementia', 'Super Power', 'Music', 'Thriller', 'Hentai', 'Supernatural', 'Sci-Fi', 'Parody', 'Cars', 'Comedy', 'School', 'Drama', 'Slice of Life', 'Mecha', 'Space', 'Game', 'Sports', 'Martial Arts', 'Sa

In [90]:
def calculate_tfidf(anime):
    """Return normalized tf-idf weights for the genres of one anime.

    Each genre of the title gets ``(1 / number_of_genres) * log10(anime_count /
    titles_with_that_genre)``; the weights are then divided by their sum.

    Parameters
    ----------
    anime : key of ``anime_id_to_genres``
        Identifier of the title to describe.

    Returns
    -------
    collections.defaultdict
        Maps genre -> weight; genres the title does not have read as ``0.0``.
        The mapping is empty when the title has no genres.
    """
    genres = anime_id_to_genres[anime]
    genre_count = len(genres)

    weights = collections.defaultdict(lambda: 0.0)
    weights.update(
        (genre, 1.0/genre_count * np.log10(anime_count/all_genres_counter[genre]))
        for genre in genres
    )

    # Rescale by the sum of all weights.
    total = sum(weights.values())
    for genre in weights:
        weights[genre] = weights[genre]/total

    return weights

# Demo: display the normalized genre weights computed for anime id 32281.
calculate_tfidf(32281)

defaultdict(<function __main__.calculate_tfidf.<locals>.<lambda>()>,
            {'Supernatural': 0.2836089638865728,
             'Romance': 0.2440583866462441,
             'Drama': 0.20736336894041021,
             'School': 0.2649692805267729})

In [91]:
def calculate_cosine(anime1, anime2):
    """Cosine similarity between the tf-idf genre vectors of two anime.

    Parameters
    ----------
    anime1, anime2 : identifiers accepted by ``calculate_tfidf``.

    Returns
    -------
    number
        ``1`` when both identifiers are equal (so a title is similar to itself
        even if it has no genres), ``-1`` when the similarity is undefined
        (at least one vector has zero or NaN length, e.g. a title without
        genres), otherwise the cosine of the angle between the two vectors.
    """
    # A title is always identical to itself, even without any tags.
    if anime1 == anime2:
        return 1

    anime1tfidf = calculate_tfidf(anime1)
    anime2tfidf = calculate_tfidf(anime2)

    # Only genres present in both vectors contribute to the dot product.
    # Restricting the loop to them also avoids inserting zero entries into the
    # defaultdicts and keeps this working for plain mappings.
    shared_genres = anime1tfidf.keys() & anime2tfidf.keys()
    dot_product = sum(
        (anime1tfidf[genre] * anime2tfidf[genre] for genre in shared_genres),
        0.0,
    )

    m1 = sum(weight ** 2.0 for weight in anime1tfidf.values())
    m2 = sum(weight ** 2.0 for weight in anime2tfidf.values())
    norm_product = math.sqrt(m1) * math.sqrt(m2)

    # Undefined similarity: checked explicitly instead of dividing by zero and
    # hiding the failure behind a bare ``except``.
    if norm_product == 0 or math.isnan(norm_product):
        return -1

    res = dot_product / norm_product
    if math.isnan(res):
        return -1
    return res

In [92]:
# Score every title against anime 32281 and list the pairs, best match first.
atmp = [(calculate_cosine(anime, 32281), anime) for anime in anime_ids]

for e in sorted(atmp, reverse=True):
    print(e)

  res = licznik/(math.sqrt(m1) * math.sqrt(m2))


(1.0000000000000002, 547)
(1.0000000000000002, 546)
(1, 32281)
(0.9762646819216453, 14669)
(0.8920885169212823, 6572)
(0.8920885169212823, 2787)
(0.8920885169212823, 355)
(0.8852788284868801, 32262)
(0.8852788284868801, 26019)
(0.8745052988110293, 20903)
(0.8745052988110293, 10067)
(0.8682551245974519, 20517)
(0.8682551245974519, 18195)
(0.8682551245974519, 16001)
(0.8682551245974519, 11887)
(0.8682551245974519, 2167)
(0.8501304920850693, 2105)
(0.8501304920850693, 1607)
(0.8501304920850693, 1039)
(0.8501304920850693, 713)
(0.8340786765995449, 31716)
(0.8260378704157191, 28725)
(0.8260378704157191, 18053)
(0.8260378704157191, 18045)
(0.8260378704157191, 17585)
(0.8260378704157191, 12175)
(0.8260378704157191, 9988)
(0.8260378704157191, 8481)
(0.8260378704157191, 6351)
(0.8260378704157191, 2927)
(0.8260378704157191, 2926)
(0.8260378704157191, 2179)
(0.8260378704157191, 2129)
(0.8260378704157191, 1624)
(0.8260378704157191, 756)
(0.7983132125787217, 34106)
(0.7983132125787217, 31610)
(0.79

(0.20918780497481718, 31772)
(0.20918780497481718, 31704)
(0.20918780497481718, 30276)
(0.20911201268969934, 28957)
(0.20911201268969934, 24701)
(0.20911201268969934, 24687)
(0.20911201268969934, 21939)
(0.20911201268969934, 21329)
(0.20911201268969934, 457)
(0.20910883570812405, 3005)
(0.20910883570812405, 1099)
(0.20910883570812405, 985)
(0.20910883570812405, 679)
(0.20873027945998684, 30386)
(0.20873027945998684, 18629)
(0.20873027945998684, 9931)
(0.2087091156245896, 4896)
(0.2087091156245896, 1594)
(0.2087091156245896, 228)
(0.20845763432765432, 2262)
(0.20836267348701537, 34451)
(0.20796458391993283, 33964)
(0.20796458391993283, 32998)
(0.20782666614358952, 29758)
(0.20762565978110573, 10739)
(0.20727417987906085, 4531)
(0.20711021117396058, 12917)
(0.20711021117396058, 7252)
(0.20699336537189972, 33539)
(0.20699336537189972, 33538)
(0.20699336537189972, 24085)
(0.2068496115517101, 1328)
(0.20681480899232368, 17157)
(0.2067780639371962, 23317)
(0.20673874109101775, 635)
(0.206728

(0.0, 25991)
(0.0, 25987)
(0.0, 25985)
(0.0, 25983)
(0.0, 25981)
(0.0, 25979)
(0.0, 25977)
(0.0, 25975)
(0.0, 25973)
(0.0, 25971)
(0.0, 25969)
(0.0, 25967)
(0.0, 25965)
(0.0, 25963)
(0.0, 25943)
(0.0, 25941)
(0.0, 25939)
(0.0, 25923)
(0.0, 25921)
(0.0, 25915)
(0.0, 25907)
(0.0, 25897)
(0.0, 25891)
(0.0, 25889)
(0.0, 25877)
(0.0, 25875)
(0.0, 25873)
(0.0, 25871)
(0.0, 25867)
(0.0, 25857)
(0.0, 25809)
(0.0, 25805)
(0.0, 25749)
(0.0, 25731)
(0.0, 25719)
(0.0, 25717)
(0.0, 25689)
(0.0, 25669)
(0.0, 25667)
(0.0, 25661)
(0.0, 25647)
(0.0, 25641)
(0.0, 25639)
(0.0, 25635)
(0.0, 25633)
(0.0, 25627)
(0.0, 25619)
(0.0, 25617)
(0.0, 25615)
(0.0, 25613)
(0.0, 25611)
(0.0, 25609)
(0.0, 25607)
(0.0, 25599)
(0.0, 25597)
(0.0, 25549)
(0.0, 25547)
(0.0, 25545)
(0.0, 25543)
(0.0, 25541)
(0.0, 25533)
(0.0, 25503)
(0.0, 25495)
(0.0, 25491)
(0.0, 25473)
(0.0, 25461)
(0.0, 25457)
(0.0, 25441)
(0.0, 25439)
(0.0, 25431)
(0.0, 25393)
(0.0, 25389)
(0.0, 25383)
(0.0, 25377)
(0.0, 25365)
(0.0, 25363)
(0.0, 25345)

(0.0, 6714)
(0.0, 6713)
(0.0, 6705)
(0.0, 6704)
(0.0, 6702)
(0.0, 6701)
(0.0, 6695)
(0.0, 6694)
(0.0, 6693)
(0.0, 6692)
(0.0, 6691)
(0.0, 6690)
(0.0, 6689)
(0.0, 6688)
(0.0, 6687)
(0.0, 6686)
(0.0, 6685)
(0.0, 6684)
(0.0, 6675)
(0.0, 6674)
(0.0, 6672)
(0.0, 6671)
(0.0, 6670)
(0.0, 6667)
(0.0, 6666)
(0.0, 6658)
(0.0, 6657)
(0.0, 6654)
(0.0, 6641)
(0.0, 6636)
(0.0, 6635)
(0.0, 6634)
(0.0, 6633)
(0.0, 6630)
(0.0, 6629)
(0.0, 6628)
(0.0, 6625)
(0.0, 6622)
(0.0, 6610)
(0.0, 6607)
(0.0, 6604)
(0.0, 6593)
(0.0, 6590)
(0.0, 6587)
(0.0, 6583)
(0.0, 6582)
(0.0, 6581)
(0.0, 6577)
(0.0, 6574)
(0.0, 6573)
(0.0, 6566)
(0.0, 6560)
(0.0, 6558)
(0.0, 6557)
(0.0, 6555)
(0.0, 6554)
(0.0, 6548)
(0.0, 6546)
(0.0, 6533)
(0.0, 6531)
(0.0, 6528)
(0.0, 6527)
(0.0, 6525)
(0.0, 6524)
(0.0, 6523)
(0.0, 6522)
(0.0, 6520)
(0.0, 6519)
(0.0, 6518)
(0.0, 6517)
(0.0, 6513)
(0.0, 6511)
(0.0, 6491)
(0.0, 6484)
(0.0, 6482)
(0.0, 6481)
(0.0, 6472)
(0.0, 6468)
(0.0, 6467)
(0.0, 6465)
(0.0, 6463)
(0.0, 6452)
(0.0, 6448)
(0.0

In [93]:
# Not sure it makes sense to keep this function;
# calculate_cosine from here and from the danbooru notebook could perhaps be merged.
def calculate_series_similarity(series1, series2):
    """Return the similarity of two series; delegates to ``calculate_cosine``."""
    similarity = calculate_cosine(series1, series2)
    return similarity

In [94]:
# example: rank every series by similarity to 'Kimi no Na wa.'
series1 = anime_name_to_id['Kimi no Na wa.']
atmp = [
    (calculate_series_similarity(series1, series2), series2)
    for series2 in tqdm(anime_ids)
]

for w in sorted(atmp, reverse=True):
    print(w)

  res = licznik/(math.sqrt(m1) * math.sqrt(m2))
100%|██████████| 12294/12294 [00:00<00:00, 39124.66it/s]


(1.0000000000000002, 547)
(1.0000000000000002, 546)
(1, 32281)
(0.9762646819216453, 14669)
(0.8920885169212823, 6572)
(0.8920885169212823, 2787)
(0.8920885169212823, 355)
(0.8852788284868801, 32262)
(0.8852788284868801, 26019)
(0.8745052988110293, 20903)
(0.8745052988110293, 10067)
(0.8682551245974519, 20517)
(0.8682551245974519, 18195)
(0.8682551245974519, 16001)
(0.8682551245974519, 11887)
(0.8682551245974519, 2167)
(0.8501304920850693, 2105)
(0.8501304920850693, 1607)
(0.8501304920850693, 1039)
(0.8501304920850693, 713)
(0.8340786765995449, 31716)
(0.8260378704157191, 28725)
(0.8260378704157191, 18053)
(0.8260378704157191, 18045)
(0.8260378704157191, 17585)
(0.8260378704157191, 12175)
(0.8260378704157191, 9988)
(0.8260378704157191, 8481)
(0.8260378704157191, 6351)
(0.8260378704157191, 2927)
(0.8260378704157191, 2926)
(0.8260378704157191, 2179)
(0.8260378704157191, 2129)
(0.8260378704157191, 1624)
(0.8260378704157191, 756)
(0.7983132125787217, 34106)
(0.7983132125787217, 31610)
(0.79

(0.20043794024625256, 2130)
(0.2003074694403923, 3992)
(0.20028853030432536, 3630)
(0.19992196447784671, 10740)
(0.19992196447784671, 9917)
(0.19992196447784671, 2813)
(0.19992196447784671, 1121)
(0.19992196447784671, 1120)
(0.19992196447784671, 1119)
(0.19992196447784671, 1117)
(0.1998959273266102, 32016)
(0.19985756689904913, 32574)
(0.19985756689904913, 30915)
(0.19985756689904913, 24439)
(0.19984601820988726, 18039)
(0.19981686146233996, 18631)
(0.19981686146233996, 3077)
(0.19981686146233996, 1608)
(0.19967010284612188, 1702)
(0.19966127852734533, 10218)
(0.19961210828175058, 3813)
(0.1994889792084234, 11917)
(0.1994889792084234, 9890)
(0.1994889792084234, 7655)
(0.1994889792084234, 6076)
(0.19941594881102773, 1805)
(0.1993971666686472, 30485)
(0.1993971666686472, 4975)
(0.19929743362342778, 916)
(0.19928024691087673, 2000)
(0.19928024691087673, 766)
(0.19887852880828547, 34151)
(0.1988682262271723, 3178)
(0.19884909470208867, 1557)
(0.19862931889801272, 25437)
(0.1986293188980127

(0.0, 23733)
(0.0, 23731)
(0.0, 23729)
(0.0, 23727)
(0.0, 23725)
(0.0, 23723)
(0.0, 23721)
(0.0, 23719)
(0.0, 23713)
(0.0, 23709)
(0.0, 23707)
(0.0, 23703)
(0.0, 23699)
(0.0, 23697)
(0.0, 23679)
(0.0, 23677)
(0.0, 23675)
(0.0, 23665)
(0.0, 23661)
(0.0, 23651)
(0.0, 23647)
(0.0, 23645)
(0.0, 23643)
(0.0, 23641)
(0.0, 23637)
(0.0, 23635)
(0.0, 23633)
(0.0, 23627)
(0.0, 23619)
(0.0, 23617)
(0.0, 23613)
(0.0, 23611)
(0.0, 23609)
(0.0, 23607)
(0.0, 23605)
(0.0, 23597)
(0.0, 23595)
(0.0, 23575)
(0.0, 23569)
(0.0, 23555)
(0.0, 23551)
(0.0, 23539)
(0.0, 23537)
(0.0, 23523)
(0.0, 23519)
(0.0, 23517)
(0.0, 23515)
(0.0, 23511)
(0.0, 23487)
(0.0, 23483)
(0.0, 23479)
(0.0, 23477)
(0.0, 23475)
(0.0, 23459)
(0.0, 23439)
(0.0, 23433)
(0.0, 23427)
(0.0, 23425)
(0.0, 23421)
(0.0, 23409)
(0.0, 23399)
(0.0, 23393)
(0.0, 23387)
(0.0, 23383)
(0.0, 23375)
(0.0, 23369)
(0.0, 23365)
(0.0, 23361)
(0.0, 23359)
(0.0, 23349)
(0.0, 23347)
(0.0, 23345)
(0.0, 23343)
(0.0, 23341)
(0.0, 23333)
(0.0, 23327)
(0.0, 23325)

(0.0, 6841)
(0.0, 6840)
(0.0, 6839)
(0.0, 6838)
(0.0, 6837)
(0.0, 6836)
(0.0, 6835)
(0.0, 6834)
(0.0, 6833)
(0.0, 6830)
(0.0, 6829)
(0.0, 6828)
(0.0, 6827)
(0.0, 6823)
(0.0, 6822)
(0.0, 6809)
(0.0, 6802)
(0.0, 6800)
(0.0, 6798)
(0.0, 6797)
(0.0, 6796)
(0.0, 6795)
(0.0, 6794)
(0.0, 6793)
(0.0, 6792)
(0.0, 6779)
(0.0, 6777)
(0.0, 6772)
(0.0, 6771)
(0.0, 6769)
(0.0, 6768)
(0.0, 6762)
(0.0, 6761)
(0.0, 6760)
(0.0, 6759)
(0.0, 6758)
(0.0, 6749)
(0.0, 6748)
(0.0, 6743)
(0.0, 6741)
(0.0, 6735)
(0.0, 6734)
(0.0, 6731)
(0.0, 6730)
(0.0, 6727)
(0.0, 6721)
(0.0, 6718)
(0.0, 6714)
(0.0, 6713)
(0.0, 6705)
(0.0, 6704)
(0.0, 6702)
(0.0, 6701)
(0.0, 6695)
(0.0, 6694)
(0.0, 6693)
(0.0, 6692)
(0.0, 6691)
(0.0, 6690)
(0.0, 6689)
(0.0, 6688)
(0.0, 6687)
(0.0, 6686)
(0.0, 6685)
(0.0, 6684)
(0.0, 6675)
(0.0, 6674)
(0.0, 6672)
(0.0, 6671)
(0.0, 6670)
(0.0, 6667)
(0.0, 6666)
(0.0, 6658)
(0.0, 6657)
(0.0, 6654)
(0.0, 6641)
(0.0, 6636)
(0.0, 6635)
(0.0, 6634)
(0.0, 6633)
(0.0, 6630)
(0.0, 6629)
(0.0, 6628)
(0.0

In [96]:
# All-pairs similarity: calls calculate_series_similarity for every
# (series1, series2) combination. The result `a` is overwritten on each
# iteration and never stored.
# NOTE(review): the recorded run was interrupted after 8 of 12294 outer
# iterations (about 2.7 it/s, ETA over an hour), so this is not fast.
# calculate_cosine rebuilds both tf-idf vectors for every pair; precomputing
# them once per series would likely help - TODO confirm by profiling.
for series1 in tqdm(anime_ids):
    for series2 in anime_ids:
        a = calculate_series_similarity(series1, series2)

  res = licznik/(math.sqrt(m1) * math.sqrt(m2))
  0%|          | 8/12294 [00:02<1:14:58,  2.73it/s]


KeyboardInterrupt: 