# Main scraping file

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

In [2]:
BASE_URL = "https://myanimelist.net/"
TOP_X_URL = BASE_URL + "topanime.php?limit="
MAX_IDX = 13650
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
HEADERS = {
    'user-agent' : USER_AGENT
    }


In [3]:
# Get information for these fields
columns = ['Synonyms', 'Title', 'Japanese', 'English', 'Type', 'Episodes', 'Status', 'Aired', 'Premiered', 'Broadcast', 'Producers', 'Licensors', 'Studios', 'Source', 'Genres', 'Demographic', 'Duration', 'Rating', 'Score', 'Ranked', 'Popularity', 'Members', 'Favorites']
df = pd.DataFrame(columns=columns, index=None)


In [7]:
def get_page_urls(idx):
    url = TOP_X_URL + str(idx)
    print(url)

    url_list = []
    
    resp = requests.get(url, headers=HEADERS)
    print(resp.status_code)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text, "html.parser")
        urls_found = soup.find_all("a", {"class": re.compile("^hoverinfo_trigger$")}) # Search for link to anime page

        url_list = [urls_found[x].attrs['href'] for x in range(0,len(urls_found), 2)] # BeautifulSoup returns each link twice (soup returns even when regex doesn't match), so take every second
        # print(url_list)
        return url_list
    else:
        return None
    
def get_page_info(url):
    try:
        record = {x : None for x in columns} # Store each page information in dict
        resp = requests.get(url, headers=HEADERS)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")

            title_jp = soup.find("h1", {"class": "title-name h1_bold_none"}).text
            print(title_jp)

            record['Title'] = title_jp

            fields = []
            for x in soup.find_all("div", {"class": "spaceit_pad"}):
                
                if x.find("span") is None:
                    break
                
                field = x.find("span").text.replace(":","")
                fields.append(field)
                if field == "Score":
                    info = x.find("span", {"itemprop": "ratingValue"}).text
                elif field == "Ranked": # Ranked field is obtained from top bar, not side bar
                    continue
                else:
                    if x.find_all("a") != []:
                        info = [y.text.replace("\n", "").rstrip().lstrip() for y in x.find_all("a")]

                    if x.find_all("a") == []:
                        info = x.text[x.text.index(":")+2:].replace("\n", "").rstrip().lstrip()

                    if field == "Popularity":
                        info = info.replace("#", "")

                    if field == "Members" or field == "Favorites":
                        if "," in info:
                            info = info.replace(",", "")
                
                record[field] = info

            field = 'Ranked'
            info = soup.find("span", {"class": "numbers ranked"}).find_next().text.replace("#", "") # Get rank from top of page instead of side bar

            record[field] = info
            return record
        else:
            return None
    except:
        return None


## Run scraping

In [None]:
def main():
    df = pd.DataFrame(columns=columns, index=None) # Reset DF to reduce memory usage
    # MAX_IDX = 0
    for i in range(0, MAX_IDX+50, 50):
        urls = get_page_urls(str(i)) # Get urls of all info pages from the main page
        print(urls)
        if not urls is None:
            for url in urls:
                info = get_page_info(url)
                # for x in list(info.keys()):
                df.loc[len(df)] = info # Add page info to DF
        print(df)

        df.to_csv('data.csv', mode='a', header=False, index=False) # Store information after each page of urls
        df = pd.DataFrame(columns=columns, index=None) # Reset DF to reduce memory usage

    # print(df)

main()   

https://myanimelist.net/topanime.php?limit=13650
200
['https://myanimelist.net/anime/6953/Ningen_Doubutsuen', 'https://myanimelist.net/anime/32294/Ai_ONA', 'https://myanimelist.net/anime/32894/Love_Bites', 'https://myanimelist.net/anime/5877/Abunai_Sisters__Koko___Mika', 'https://myanimelist.net/anime/22179/Aki_no_Puzzle', 'https://myanimelist.net/anime/31634/Kokuhaku', 'https://myanimelist.net/anime/413/Hametsu_no_Mars', 'https://myanimelist.net/anime/13405/Utsu_Musume_Sayuri', 'https://myanimelist.net/anime/3287/Tenkuu_Danzai_Skelter_Heaven', 'https://myanimelist.net/anime/54574/Seizoki', 'https://myanimelist.net/anime/35020/Sekai_Jinken_Sengen', 'https://myanimelist.net/anime/52047/Sekai_Meisaku_Video_Ehon', 'https://myanimelist.net/anime/25607/Sekai_no_Fushigi_Tanken_Series', 'https://myanimelist.net/anime/22523/Sekai_no_Hikari__Shinran_Shounin', 'https://myanimelist.net/anime/27379/Sekai_no_Ouja__King_Kong_Taikai', 'https://myanimelist.net/anime/51236/Sekigahara_Gassen__Kiro_ni_Ta

In [None]:
# get_page_urls(0)
test_url = "https://myanimelist.net/anime/52991/Sousou_no_Frieren"
get_page_info(test_url)

Frieren: Beyond Journey's End
Synonyms
Japanese
English
Type
['TV']
Episodes
Status
Aired
Premiered
['Fall 2023']
Broadcast
Producers
['Aniplex', 'Dentsu', 'Shogakukan-Shueisha Productions', 'Nippon Television Network', 'TOHO animation', 'Shogakukan']
Licensors
['Crunchyroll']
Studios
['Madhouse']
Source
['Manga']
Genres
['Adventure', 'Drama', 'Fantasy']
Demographic
['Shounen']
Duration
Rating
Score
Ranked
Popularity
Members
Favorites
{'Synonyms': 'Frieren at the Funeral', 'Japanese': '葬送のフリーレン', 'English': "Frieren: Beyond Journey's End", 'Type': ['TV'], 'Episodes': '28', 'Status': 'Finished Airing', 'Aired': 'Sep 29, 2023 to Mar 22, 2024', 'Premiered': ['Fall 2023'], 'Broadcast': 'Fridays at 23:00 (JST)', 'Producers': ['Aniplex', 'Dentsu', 'Shogakukan-Shueisha Productions', 'Nippon Television Network', 'TOHO animation', 'Shogakukan'], 'Licensors': ['Crunchyroll'], 'Studios': ['Madhouse'], 'Source': ['Manga'], 'Genres': ['Adventure', 'Drama', 'Fantasy'], 'Demographic': ['Shounen'], 'D

{'Synonyms': 'Frieren at the Funeral',
 'Japanese': '葬送のフリーレン',
 'English': "Frieren: Beyond Journey's End",
 'Type': ['TV'],
 'Episodes': '28',
 'Status': 'Finished Airing',
 'Aired': 'Sep 29, 2023 to Mar 22, 2024',
 'Premiered': ['Fall 2023'],
 'Broadcast': 'Fridays at 23:00 (JST)',
 'Producers': ['Aniplex',
  'Dentsu',
  'Shogakukan-Shueisha Productions',
  'Nippon Television Network',
  'TOHO animation',
  'Shogakukan'],
 'Licensors': ['Crunchyroll'],
 'Studios': ['Madhouse'],
 'Source': ['Manga'],
 'Genres': ['Adventure', 'Drama', 'Fantasy'],
 'Demographic': ['Shounen'],
 'Duration': '24 min. per ep.',
 'Rating': 'PG-13 - Teens 13 or older',
 'Score': '9.32',
 'Ranked': '1',
 'Popularity': '185',
 'Members': '946246',
 'Favorites': '56511',
 'Title': 'Sousou no Frieren'}