# Research Questions

1. What are the top 10 highest-ranked anime series?
2. Which anime series have the largest member/viewer base (top 10)?
3. What are the top 10 anime series with the most episodes?
4. Is there a correlation between an anime's number of members/viewers and its score?

In [28]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Empty frame that accumulates one row per scraped anime; the column order
# must match the list returned by extract_anime_data.
data_set = pd.DataFrame(
    columns=[
        'rank',
        'score',
        'name',
        'showType',
        'numOfEp',
        'startDate',
        'endDate',
        'members',
    ],
)


In [29]:
def insert_data(data_set, row_data):
    """Append ``row_data`` to ``data_set`` in place as a new row.

    The new row is stored under the label equal to the frame's current
    length, so with a default 0..n-1 index it lands at the end.

    Args:
        data_set: DataFrame to grow; it is mutated, nothing is returned.
        row_data: Values for the new row, one per column of ``data_set``.
    """
    next_label = len(data_set)
    data_set.loc[next_label] = row_data

In [None]:
# Row layout: rank, score, name, showType, numOfEp, startDate, endDate, members

def extract_anime_data(anime_data: BeautifulSoup) -> list:
    """Parse one ranking-table row into a flat list of field values.

    Args:
        anime_data: Parsed HTML node for a single anime entry. It must
            contain a ``span.top-anime-rank-text``, a ``span.score-label``,
            an ``h3.anime_ranking_h3`` wrapping a link, and a
            ``div.information`` whose text holds three lines: type and
            episode count, airing date range, and member count.

    Returns:
        ``[rank, score, name, showType, numOfEp, startDate, endDate,
        members]``. ``score`` is ``None`` when the page shows ``N/A``;
        ``numOfEp`` is ``None`` when the episode count is not numeric.
    """
    # The keyword must be ``class_`` (single trailing underscore) for
    # BeautifulSoup to filter on the CSS class; any other spelling is treated
    # as a literal attribute name and matches nothing.
    rank_tag = anime_data.find('span', class_='top-anime-rank-text')
    rank = int(rank_tag.text.strip())

    score_text = anime_data.find('span', class_='score-label').text.strip()
    # A score shown as 'N/A' is stored as None.
    score = float(score_text) if score_text != 'N/A' else None

    name = anime_data.find('h3', class_='anime_ranking_h3').a.text.strip()
    info_lines = anime_data.find('div', class_='information').text.strip().split('\n')

    # First line looks like "<type> (<count> eps)".
    show_type, _, eps_part = info_lines[0].partition('(')
    show_type = show_type.strip()
    eps_tokens = eps_part.split()
    # Mirror the score handling: a missing or non-numeric episode count (for
    # instance a placeholder on a series with no known count) becomes None
    # instead of crashing int().
    if eps_tokens and eps_tokens[0].isdigit():
        num_of_ep = int(eps_tokens[0])
    else:
        num_of_ep = None

    # Second line looks like "<start> - <end>"; partition keeps this from
    # raising when the dash is absent (end date is then the empty string).
    start_part, _, end_part = info_lines[1].partition('-')
    start_date = start_part.strip()
    end_date = end_part.strip()

    # Third line looks like "1,234,567 members".
    members_text = info_lines[2].replace('members', '').strip()
    members = int(members_text.replace(',', ''))

    return [rank, score, name, show_type, num_of_ep, start_date, end_date, members]


In [31]:
def scrap_list_of_anime(page_number):
    """Fetch one page of the MyAnimeList top-anime ranking and store its rows.

    Each ``tr.ranking-list`` row on the page is parsed with
    ``extract_anime_data`` and appended to the module-level ``data_set``.

    Args:
        page_number: Zero-based page index; it is multiplied by 50 to form
            the ``limit`` query parameter of the request URL.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    url = f"https://myanimelist.net/topanime.php?limit={page_number * 50}"
    # A timeout keeps a stalled connection from hanging the run forever.
    res = requests.get(url, timeout=30)
    # Fail loudly on an error response instead of parsing an error page and
    # silently adding zero rows.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    for card in soup.find_all('tr', class_='ranking-list'):
        insert_data(data_set, extract_anime_data(card))



In [32]:
# Scrape ranking pages 0 through 19, then print a summary of the collected
# frame.
for page_index in range(20):
    scrap_list_of_anime(page_index)
data_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   rank       1000 non-null   int64 
 1   score      1000 non-null   int64 
 2   name       1000 non-null   object
 3   showType   1000 non-null   object
 4   numOfEp    1000 non-null   int64 
 5   startDate  1000 non-null   object
 6   endDate    1000 non-null   object
 7   members    1000 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 70.3+ KB


In [33]:
# Persist the scraped table; the row index is written as the first column.
data_set.to_csv(path_or_buf='anime_data.csv')