In [1]:
import urllib.request
import pickle, csv, json, os, re
import numpy as np
from asyncore import read
import networkx as nx

from time import sleep
from tqdm import tqdm

We define the functions used to gather data from the Jikan API (https://api.jikan.moe).

In [None]:
def get_num_pages(link):
    """Return the last visible page number reported by a paginated API response.

    Parameters
    ----------
    link : str
        Full URL of the request to issue.

    Returns
    -------
    The value stored under ``['pagination']['last_visible_page']`` in the
    JSON body of the response.
    """
    # The context manager closes the HTTP response even if reading or
    # JSON parsing raises, so no connection is left dangling.
    with urllib.request.urlopen(link) as response:
        payload = json.loads(response.read().decode('utf-8'))
    return payload['pagination']['last_visible_page']


def get_all_pages(page):
    """Download every result page of a Jikan v4 listing endpoint.

    Parameters
    ----------
    page : str
        Endpoint name inserted into the URL path, e.g. ``'anime'``.

    Returns
    -------
    list
        The entries found under ``'data'`` in each page's JSON body,
        concatenated in page order.
    """
    url_template = 'https://api.jikan.moe/v4/{}?page={}&sfw=true'
    # Page numbers are integers starting at 1; ask the first page how many exist.
    last_page = get_num_pages(url_template.format(page, 1))
    pages = []
    # Iterate 1..last_page inclusive so the final page is not skipped.
    for page_number in tqdm(range(1, last_page + 1)):
        # Close each response as soon as its body has been read.
        with urllib.request.urlopen(url_template.format(page, page_number)) as response:
            js = json.loads(response.read().decode('utf-8'))
        pages.extend(js['data'])
        sleep(0.75) #The API only allows 3 requests per second, so we have to make sure we don't make more than three requests/sec
    return pages

Now we extract all anime pages and save each entry locally as a text file, so the data can be reused.

In [None]:
all_animes = get_all_pages('anime')

# Make sure the output directory exists so open() does not fail on a fresh checkout.
os.makedirs("data/animes", exist_ok=True)

# Write one JSON-encoded text file per anime, named after its 'mal_id'.
for anime in all_animes:
    file_name = "{}.txt".format(anime['mal_id'])
    # `with` guarantees the file is closed even if serialising or writing raises.
    with open("data/animes/"+file_name, "w") as write_file:
        write_file.write(json.dumps(anime))

In [None]:
# Keep only entries whose 'type' is 'TV', then successively drop entries with a
# falsy 'score' and a falsy 'synopsis', printing the remaining count after each stage.
all_anime_tv = [entry for entry in all_animes if entry['type'] == 'TV']
print(len(all_anime_tv))
for required_field in ('score', 'synopsis'):
    all_anime_tv = [entry for entry in all_anime_tv if entry[required_field]]
    print(len(all_anime_tv))

In [None]:
# Create an empty graph and add one node per filtered anime, identified by its 'mal_id'.
anime_graph = nx.Graph()
anime_graph.add_nodes_from(entry['mal_id'] for entry in all_anime_tv)
print(anime_graph)

In [None]:
import time
import requests
from tqdm import tqdm
# Query the recommendations endpoint for every node of the graph and collect
# (source mal_id, recommended mal_id, votes) triples.
anime_edges = []
error_codes = []  # status codes of responses that were not 200
exceptions = []   # exceptions raised by requests.get (e.g. timeouts)
for anime_id in tqdm(anime_graph.nodes()):
    try:
        r = requests.get('https://api.jikan.moe/v4/anime/{}/recommendations'.format(anime_id),timeout=10)
    except Exception as e:
        exceptions.append(e)
        # Skip this anime: falling through would either hit an undefined `r`
        # (first iteration) or re-process the previous anime's response and
        # record its recommendations under the wrong id.
        time.sleep(1)
        continue

    if r.status_code == 200:
        js = r.json()
        recs = js['data']
        for rec in recs:
            anime_edges.append((anime_id,rec['entry']['mal_id'],rec['votes']))
    else:
        error_codes.append(r.status_code)
    # Pause after every request, including rejected ones, so an error response
    # is not immediately followed by another request.
    time.sleep(1)
print("Timeouts:",len(exceptions))
print("Too many requests responses:",len(error_codes))

In [None]:
# Report how many edges were collected, keep only those whose two endpoints are
# both nodes of the graph, report the new count, and add them as weighted edges.
print(len(anime_edges))
anime_nodes = anime_graph.nodes()
anime_edges_filtered = []
for edge in anime_edges:
    source, target = edge[0], edge[1]
    if source in anime_nodes and target in anime_nodes:
        anime_edges_filtered.append(edge)
print(len(anime_edges_filtered))
anime_graph.add_weighted_edges_from(anime_edges_filtered)
print(anime_graph)

In [None]:
# Save the graph's weighted edges to a text file for later reuse.
edge_list_path = "data/edgeanimelist.txt"
nx.write_weighted_edgelist(anime_graph, edge_list_path)