# Main scraping file

In [15]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

In [None]:
BASE_URL = "https://myanimelist.net/"
TOP_X_URL = BASE_URL + "topanime.php?limit="
MAX_IDX = 13650


In [224]:
# Get information for these fields
columns = ['Synonyms', 'Title', 'Japanese', 'English', 'Type', 'Episodes', 'Status', 'Aired', 'Premiered', 'Broadcast', 'Producers', 'Licensors', 'Studios', 'Source', 'Genres', 'Demographic', 'Duration', 'Rating', 'Score', 'Ranked', 'Popularity', 'Members', 'Favorites']
df = pd.DataFrame(columns=columns, index=None)


In [None]:
def get_page_urls(idx):
    url = TOP_X_URL + str(idx)

    url_list = []

    resp = requests.get(url)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text, "html.parser")
        urls_found = soup.find_all("a", {"class": re.compile("^hoverinfo_trigger$")}) # Search for link to anime page

        url_list = [urls_found[x].attrs['href'] for x in range(0,len(urls_found), 2)] # BeautifulSoup returns each link twice (soup returns even when regex doesn't match), so take every second
        # print(url_list)
        return url_list
    else:
        return None
    
def get_page_info(url):
    record = {x : None for x in columns} # Store each page information in dict
    resp = requests.get(url)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text, "html.parser")

        title_jp = soup.find("h1", {"class": "title-name h1_bold_none"}).text
        print(title_jp)

        record['Title'] = title_jp

        fields = []
        for x in soup.find_all("div", {"class": "spaceit_pad"}):
            
            if x.find("span") is None:
                break
            
            field = x.find("span").text.replace(":","")
            fields.append(field)
            if field == "Score":
                info = x.find("span", {"itemprop": "ratingValue"}).text
            elif field == "Ranked": # Ranked field is obtained from top bar, not side bar
                continue
            else:
                if x.find_all("a") != []:
                    info = [y.text.replace("\n", "").rstrip().lstrip() for y in x.find_all("a")]

                if x.find_all("a") == []:
                    info = x.text[x.text.index(":")+2:].replace("\n", "").rstrip().lstrip()

                if field == "Popularity":
                    info = info.replace("#", "")

                if field == "Members" or field == "Favorites":
                    if "," in info:
                        info = info.replace(",", "")
            
            record[field] = info

        field = 'Ranked'
        info = soup.find("span", {"class": "numbers ranked"}).find_next().text.replace("#", "") # Get rank from top of page instead of side bar

        record[field] = info
        return record
    else:
        return None


## Run scraping

In [None]:
def main():
    df = pd.DataFrame(columns=columns, index=None) # Reset DF to reduce memory usage
    MAX_IDX = 0
    for i in range(0, MAX_IDX+50, 50):
        urls = get_page_urls(str(i)) # Get urls of all info pages from the main page
        if not urls is None:
            for url in urls:
                info = get_page_info(url)
                # for x in list(info.keys()):
                df.loc[len(df)] = info # Add page info to DF
                print(df)

        df.to_csv('data.csv', mode='a', header=False, index=False) # Store information after each page of urls
        df = pd.DataFrame(columns=columns, index=None) # Reset DF to reduce memory usage

    print(df)

main()   

Unnamed: 0,Synonyms,Title,Japanese,English,Type,Episodes,Status,Aired,Premiered,Broadcast,...,Source,Genres,Demographic,Duration,Rating,Score,Ranked,Popularity,Members,Favorites
0,Frieren at the Funeral,Sousou no Frieren,葬送のフリーレン,Frieren: Beyond Journey's End,['TV'],28,Finished Airing,"Sep 29, 2023 to Mar 22, 2024",['Fall 2023'],Fridays at 23:00 (JST),...,['Manga'],"['Adventure', 'Drama', 'Fantasy']",['Shounen'],24 min. per ep.,PG-13 - Teens 13 or older,9.32,1,185,946246,56510
1,,One Piece Fan Letter,ONE PIECE FAN LETTER,,['TV Special'],1,Finished Airing,"Oct 20, 2024",,,...,['Light novel'],"['Action', 'Adventure', 'Fantasy']",['Shounen'],24 min.,PG-13 - Teens 13 or older,9.14,2,2954,60405,1583
2,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,Fullmetal Alchemist: Brotherhood,['TV'],64,Finished Airing,"Apr 5, 2009 to Jul 4, 2010",['Spring 2009'],Sundays at 17:00 (JST),...,['Manga'],"['Action', 'Adventure', 'Drama', 'Fantasy']",['Shounen'],24 min. per ep.,R - 17+ (violence & profanity),9.09,3,3,3439112,229576
3,,Steins;Gate,STEINS;GATE,Steins;Gate,['TV'],24,Finished Airing,"Apr 6, 2011 to Sep 14, 2011",['Spring 2011'],Wednesdays at 02:05 (JST),...,Visual novel,"['Drama', 'Sci-Fi', 'Suspense']",,24 min. per ep.,PG-13 - Teens 13 or older,9.07,4,14,2634949,192331
4,,Shingeki no Kyojin Season 3 Part 2,進撃の巨人 Season3 Part.2,Attack on Titan Season 3 Part 2,['TV'],10,Finished Airing,"Apr 29, 2019 to Jul 1, 2019",['Spring 2019'],Mondays at 00:10 (JST),...,['Manga'],"['Action', 'Drama', 'Suspense']",['Shounen'],23 min. per ep.,R - 17+ (violence & profanity),9.05,5,21,2366848,59872
5,Gintama' (2015),Gintama°,銀魂°,Gintama Season 4,['TV'],51,Finished Airing,"Apr 8, 2015 to Mar 30, 2016",['Spring 2015'],Wednesdays at 18:00 (JST),...,['Manga'],"['Action', 'Comedy', 'Sci-Fi']",['Shounen'],24 min. per ep.,PG-13 - Teens 13 or older,9.05,6,340,649087,16889
6,,Gintama: The Final,銀魂 THE FINAL,Gintama: The Very Final,['Movie'],1,Finished Airing,"Jan 8, 2021",,,...,['Manga'],"['Action', 'Comedy', 'Drama', 'Sci-Fi']",['Shounen'],1 hr. 44 min.,PG-13 - Teens 13 or older,9.04,7,1541,159469,4383
7,Gintama (2011),Gintama',銀魂',Gintama Season 2,['TV'],51,Finished Airing,"Apr 4, 2011 to Mar 26, 2012",['Spring 2011'],Mondays at 18:00 (JST),...,['Manga'],"['Action', 'Comedy', 'Sci-Fi']",['Shounen'],24 min. per ep.,PG-13 - Teens 13 or older,9.03,8,399,572897,8254
8,HxH (2011),Hunter x Hunter (2011),HUNTER×HUNTER（ハンター×ハンター）,Hunter x Hunter,['TV'],148,Finished Airing,"Oct 2, 2011 to Sep 24, 2014",['Fall 2011'],Sundays at 10:55 (JST),...,['Manga'],"['Action', 'Adventure', 'Fantasy']",['Shounen'],23 min. per ep.,PG-13 - Teens 13 or older,9.03,9,8,2934383,215456
9,"Gintama' (2012), Gintama' Overdrive, Kintama",Gintama': Enchousen,銀魂' 延長戦,Gintama: Enchousen,['TV'],13,Finished Airing,"Oct 4, 2012 to Mar 28, 2013",['Fall 2012'],Thursdays at 18:00 (JST),...,['Manga'],"['Action', 'Comedy', 'Sci-Fi']",['Shounen'],24 min. per ep.,PG-13 - Teens 13 or older,9.02,10,750,334115,3075


In [212]:
# get_page_urls(0)
test_url = "https://myanimelist.net/anime/52991/Sousou_no_Frieren"
get_page_info(test_url)

Frieren: Beyond Journey's End
Synonyms
Japanese
English
Type
['TV']
Episodes
Status
Aired
Premiered
['Fall 2023']
Broadcast
Producers
['Aniplex', 'Dentsu', 'Shogakukan-Shueisha Productions', 'Nippon Television Network', 'TOHO animation', 'Shogakukan']
Licensors
['Crunchyroll']
Studios
['Madhouse']
Source
['Manga']
Genres
['Adventure', 'Drama', 'Fantasy']
Demographic
['Shounen']
Duration
Rating
Score
Ranked
Popularity
Members
Favorites
{'Synonyms': 'Frieren at the Funeral', 'Japanese': '葬送のフリーレン', 'English': "Frieren: Beyond Journey's End", 'Type': ['TV'], 'Episodes': '28', 'Status': 'Finished Airing', 'Aired': 'Sep 29, 2023 to Mar 22, 2024', 'Premiered': ['Fall 2023'], 'Broadcast': 'Fridays at 23:00 (JST)', 'Producers': ['Aniplex', 'Dentsu', 'Shogakukan-Shueisha Productions', 'Nippon Television Network', 'TOHO animation', 'Shogakukan'], 'Licensors': ['Crunchyroll'], 'Studios': ['Madhouse'], 'Source': ['Manga'], 'Genres': ['Adventure', 'Drama', 'Fantasy'], 'Demographic': ['Shounen'], 'D

{'Synonyms': 'Frieren at the Funeral',
 'Japanese': '葬送のフリーレン',
 'English': "Frieren: Beyond Journey's End",
 'Type': ['TV'],
 'Episodes': '28',
 'Status': 'Finished Airing',
 'Aired': 'Sep 29, 2023 to Mar 22, 2024',
 'Premiered': ['Fall 2023'],
 'Broadcast': 'Fridays at 23:00 (JST)',
 'Producers': ['Aniplex',
  'Dentsu',
  'Shogakukan-Shueisha Productions',
  'Nippon Television Network',
  'TOHO animation',
  'Shogakukan'],
 'Licensors': ['Crunchyroll'],
 'Studios': ['Madhouse'],
 'Source': ['Manga'],
 'Genres': ['Adventure', 'Drama', 'Fantasy'],
 'Demographic': ['Shounen'],
 'Duration': '24 min. per ep.',
 'Rating': 'PG-13 - Teens 13 or older',
 'Score': '9.32',
 'Ranked': '1',
 'Popularity': '185',
 'Members': '946246',
 'Favorites': '56511',
 'Title': 'Sousou no Frieren'}