# Web scraping multiple tests

In [1]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor

In [2]:
#Function to send an HTTP request and parse the HTML content of the page
def fetch_html(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled server
        would block the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    BeautifulSoup or None
        The document parsed with the ``lxml`` parser when the response
        status code is 200, otherwise ``None``.

    Raises
    ------
    requests.RequestException
        Propagated unchanged when the request itself fails (connection
        error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    # Only a plain "200 OK" is treated as success; anything else yields None.
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, 'lxml')

In [3]:
# Target page: the ranking listing served at topanime.php.
url = "https://myanimelist.net/topanime.php"
# Parsed listing page; fetch_html returns None when the status code is not 200.
soup = fetch_html(url)

# Scraping Shingeki no Kyojin

## Anime Ranking 

In [4]:
# Rank badge of the top entry: the span whose class list ends in "rank1".
rank_span = soup.find('span', class_="lightLink top-anime-rank-text rank1")
rank_span.text

'1'

## Anime Score

In [5]:
# First score label on the page; note the class string is specific to the "score-9" band.
score_span = soup.find('span', class_="text on score-label score-9")
score_span.text

'9.13'

### Anime name

In [6]:
# The title is the text of the first link inside the "di-ib clearfix" container.
title_box = soup.find('div', class_="di-ib clearfix")
title_box.a.text

'Shingeki no Kyojin: The Final Season - Kanketsu-hen'

### Anime URL

In [7]:
# The same link that holds the title also points at the anime's detail page.
title_link = soup.find('div', class_="di-ib clearfix").a
title_link['href']

'https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen'

### Number of episodes

In [8]:
# The information block is newline separated; line 1 is the format and episode count.
info_lines = soup.find('div', class_="information di-ib mt4").text.split('\n')
info_lines[1].strip()

'Special (2 eps)'

### Emission date

In [9]:
# Line 2 of the information block is the airing period.
info_lines = soup.find('div', class_="information di-ib mt4").text.split('\n')
info_lines[2].strip()

'Mar 2023 - 2023'

### Number of members

In [10]:
# Line 3 of the information block is the member count.
info_lines = soup.find('div', class_="information di-ib mt4").text.split('\n')
info_lines[3].strip()

'370,982 members'

# For every anime

Each anime is stored in a `<tr class="ranking-list">` element, which contains all of the information above for that anime.

In [11]:
# First table row with class "ranking-list", i.e. the top-ranked entry.
first_row = soup.find('tr', class_="ranking-list")
first_row

<tr class="ranking-list">
<td class="rank ac" valign="top">
<span class="lightLink top-anime-rank-text rank1">1</span>
</td>
<td class="title al va-t word-break">
<a class="hoverinfo_trigger fl-l ml12 mr8" href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen" id="#area51535" rel="#info51535">
<img alt="Anime: Shingeki no Kyojin: The Final Season - Kanketsu-hen" border="0" class="lazyload" data-src="https://cdn.myanimelist.net/r/50x70/images/anime/1279/131078.jpg?s=d6d04a0dcc347ba55e0243b75b0ad5dd" data-srcset="https://cdn.myanimelist.net/r/50x70/images/anime/1279/131078.jpg?s=d6d04a0dcc347ba55e0243b75b0ad5dd 1x, https://cdn.myanimelist.net/r/100x140/images/anime/1279/131078.jpg?s=1b0db37795fa4240d5b66641643f76bb 2x" height="70" width="50"/>
</a>
<div class="detail"><div id="area51535">
<div class="hoverinfo" id="info51535" rel="a51535"></div>
</div>
<div class="di-ib clearfix"><h3 class="hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"><a href

In [12]:
# `.span` resolves to the first span inside the row, which is the rank badge.
first_row = soup.find('tr', class_="ranking-list")
first_row.span.text

'1'

In [13]:
# First 'tr' element with class 'ranking-list' (the top-ranked anime's row).
# Kept in a variable because the following cells inspect it further.
tr_element = soup.find('tr', class_="ranking-list")

# Every 'span' nested inside that row, in document order.
span_elements = tr_element.find_all('span')

In [14]:
# Display the spans collected from the first ranking row.
span_elements

[<span class="lightLink top-anime-rank-text rank1">1</span>,
 <span class="text on score-label score-9">9.13</span>,
 <span class="text score-label score-na">N/A</span>]

In [15]:
# The first span of the row is the rank badge.
span_elements[0].text

'1'

In [16]:
# The second span of the row is the score label.
span_elements[1].text

'9.13'

In [17]:
# The "detail" div of the row wraps the title link and the information block.
div_elements = tr_element.find('div', class_="detail")

In [18]:
# Display the "detail" div of the first ranking row.
div_elements

<div class="detail"><div id="area51535">
<div class="hoverinfo" id="info51535" rel="a51535"></div>
</div>
<div class="di-ib clearfix"><h3 class="hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"><a href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen" id="#area51535" rel="#info51535">Shingeki no Kyojin: The Final Season - Kanketsu-hen</a></h3><div class="icon-watch-pv2"><a class="mal-icon ml8 ga-click" href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen/video" title="Watch Promotional Video"><i class="malicon malicon-movie-pv"></i></a></div></div><br/><div class="information di-ib mt4">
        Special (2 eps)<br/>
        Mar 2023 - 2023<br/>
        370,982 members
      </div></div>

In [19]:
# Page-wide lookup of the rank badge, for comparison with the row-scoped lookups above.
rank_span = soup.find('span', class_="lightLink top-anime-rank-text rank1")
rank_span.text

'1'

# First 50 anime

In [20]:
# Column order of the resulting DataFrame.
columns = ['ranking', 'score', 'title', 'number_of_episodes',
           'emission_date', 'number_members', 'url']

# One dict per anime: each row's fields stay together instead of being
# spread over seven parallel lists.
records = []

# Loop through each ranking row and extract its fields
for anime_element in soup.find_all('tr', class_='ranking-list'):
    # The first two spans of a row are the rank and the score, in that order.
    span_elements = anime_element.find_all('span')

    # The title link carries both the display name and the detail-page URL.
    title_link = anime_element.find('div', class_="di-ib clearfix").a

    # Look up and split the information block once per row. Index 0 is the
    # empty text before the first line; 1-3 are episodes, dates and members.
    info_lines = anime_element.find('div', class_="information di-ib mt4").text.split('\n')

    records.append({
        'ranking': span_elements[0].text,
        'score': span_elements[1].text,
        'title': title_link.text,
        'number_of_episodes': info_lines[1].strip(),
        'emission_date': info_lines[2].strip(),
        'number_members': info_lines[3].strip(),
        'url': title_link['href'],
    })

# Passing `columns` keeps the schema stable even when no row was found.
df = pd.DataFrame(records, columns=columns)

## Cleaning the pandas DataFrame

In [21]:
# Derive tidy columns from the raw scraped strings.
# All replacements are literal (regex=False): a lone '(' is not a valid
# regular expression, so regex=True fails on pandas versions that no longer
# special-case single-character patterns.
top_anime = (
    df
        .assign(
            # 'Special (2 eps)' -> 'Special': leading non-digit run, minus the
            # opening parenthesis and the space that precedes it.
            emission_type = lambda df_: (
                df_['number_of_episodes']
                    .str.extract(r'(\D+)', expand=False)
                    .str.replace('(', '', regex=False)
                    .str.strip()
            ),
            # First run of digits, kept as a string (missing when there is none).
            number_episode = lambda df_: df_['number_of_episodes'].str.extract(r'(\d+)', expand=False),
            # '370,982 members' -> '370982' (still a string, no stray whitespace).
            members = lambda df_: (
                df_['number_members']
                    .str.replace('members', '', regex=False)
                    .str.replace(',', '', regex=False)
                    .str.strip()
            ),
        #end assign
        )
        # The raw source columns are no longer needed once parsed.
        .drop(columns= ['number_of_episodes','number_members'])
    #end preprocessing
    )

In [22]:
# Split 'emission_date' ("<start> - <end>") on the ' - ' delimiter.
# reindex guarantees that both columns 0 and 1 exist even when no row
# contains the delimiter (otherwise split_columns[1] raises KeyError).
split_columns = (
    top_anime['emission_date']
        .str.split(' - ', expand=True)
        .reindex(columns=[0, 1])
)

# Parse both halves as "<Mon> <YYYY>". Values that do not match that format
# (for example a bare year such as '2023') are coerced to NaT.
top_anime = (
    top_anime
        .assign(
            first_emission = pd.to_datetime(split_columns[0], format='%b %Y', errors='coerce'),
            last_emission  = pd.to_datetime(split_columns[1], format='%b %Y', errors='coerce'),
        ))




In [23]:
# Preview the cleaned table (first five rows).
top_anime.head()

Unnamed: 0,ranking,score,title,emission_date,url,emission_type,number_episode,members,first_emission,last_emission
0,1,9.13,Shingeki no Kyojin: The Final Season - Kankets...,Mar 2023 - 2023,https://myanimelist.net/anime/51535/Shingeki_n...,Special,2,370982,2023-03-01,NaT
1,2,9.11,Fullmetal Alchemist: Brotherhood,Apr 2009 - Jul 2010,https://myanimelist.net/anime/5114/Fullmetal_A...,TV,64,3120078,2009-04-01,2010-07-01
2,3,9.08,Bleach: Sennen Kessen-hen,Oct 2022 - Dec 2022,https://myanimelist.net/anime/41467/Bleach__Se...,TV,13,410998,2022-10-01,2022-12-01
3,4,9.08,Steins;Gate,Apr 2011 - Sep 2011,https://myanimelist.net/anime/9253/Steins_Gate,TV,24,2401432,2011-04-01,2011-09-01
4,5,9.07,Gintama°,Apr 2015 - Mar 2016,https://myanimelist.net/anime/28977/Gintama°,TV,51,584156,2015-04-01,2016-03-01


### Adding web scraping of detailed information

### Example: scraping detailed information for one anime

In [24]:
# Detail-page URL stored at index label 0 of the table.
url=top_anime['url'][0]
print(url)
# Rebinds `soup`: from here on it holds the detail page, not the ranking page.
soup = fetch_html(url) 

https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen


Some anime pages display more information than others, so the set of available fields varies from page to page.

In [25]:
# Collect every div with class "spaceit_pad" on the detail page.
div_elements=soup.find_all('div', class_="spaceit_pad")

## Anime Studio

In [26]:
# Look for every div element with class spaceit_pad in the soup object
div_elements = soup.find_all('div', class_="spaceit_pad")

for element in div_elements:
    label = element.span
    link = element.a
    # Some rows have no label span or no link; skip them explicitly instead
    # of hiding every possible error behind a bare `except`.
    if label is not None and link is not None and label.text == "Studios:":
        # Print the studio (text of the first link in the row)
        print(link.text)

MAPPA


## Anime Genre

In [27]:
# Look for every div element with class spaceit_pad in the soup object
div_elements = soup.find_all('div', class_="spaceit_pad")

for element in div_elements:
    label = element.span
    # Rows without a label span cannot be the genre row; skip them explicitly
    # rather than swallowing the AttributeError with a bare `except`.
    if label is not None and label.text in ("Genre:", "Genres:"):
        # Extract all genres by iterating through the anchor tags
        genres = [a.text for a in element.find_all('a')]
        # Combine the genres into a single string
        genres_str = ', '.join(genres)
        print(genres_str)

Action, Drama, Suspense


## Anime Theme

In [28]:
# Look for every div element with class spaceit_pad in the soup object
div_elements = soup.find_all('div', class_="spaceit_pad")

for element in div_elements:
    label = element.span
    # Rows without a label span cannot be the theme row; skip them explicitly
    # rather than swallowing the AttributeError with a bare `except`.
    if label is not None and label.text in ("Theme:", "Themes:"):
        # Extract all themes by iterating through the anchor tags
        themes = [a.text for a in element.find_all('a')]
        # Combine the themes into a single string
        themes_str = ', '.join(themes)
        print(themes_str)

Gore, Military, Survival


## Anime Demographic

In [29]:
# Look for every div element with class spaceit_pad in the soup object
div_elements = soup.find_all('div', class_="spaceit_pad")

for element in div_elements:
    label = element.span
    # Rows without a label span cannot be the demographic row; skip them
    # explicitly rather than swallowing the AttributeError with a bare `except`.
    if label is not None and label.text in ("Demographic:", "Demographics:"):
        # Extract all demographics by iterating through the anchor tags
        demos = [a.text for a in element.find_all('a')]
        # Combine the demographics into a single string
        demos_str = ', '.join(demos)
        print(demos_str)

Shounen


### For every anime on the page

#### Function to extract the studio

In [30]:
def extract_studio(soup):
    """Return the studio name found on a parsed anime detail page.

    Starting at the first ``div`` with class ``spaceit_pad``, the function
    walks that div's following siblings of the same class and returns the
    text of the first link in the first row whose label span reads
    ``"Studios:"``.

    Parameters
    ----------
    soup : BeautifulSoup or None
        Parsed detail page, e.g. the result of ``fetch_html``.

    Returns
    -------
    str or None
        The studio name, or ``None`` when ``soup`` is ``None`` or no
        matching row with a link exists.
    """
    if soup is None:
        return None

    element = soup.find('div', class_="spaceit_pad")
    while element is not None:
        label = element.span
        link = element.a
        # Rows lacking a label span or a link are skipped explicitly; a bare
        # `except` here would also hide unrelated errors.
        if label is not None and link is not None and label.text == "Studios:":
            return link.text
        element = element.find_next_sibling('div', class_="spaceit_pad")

    return None  # No studio information was found

In [31]:
# Try the helper on the detail page currently held in `soup`.
extract_studio(soup)

'MAPPA'

In [32]:
# Print the studio of every anime in the table (one HTTP request per row).
for url in top_anime['url']:
    soup = fetch_html(url)  # Fetch and parse the HTML content from the URL
    if soup is None:
        # fetch_html returns None for non-200 responses; skip that page
        # instead of failing on `None.find_all`.
        continue
    div_elements = soup.find_all('div', class_="spaceit_pad")
    for element in div_elements:
        label = element.span
        link = element.a
        # Explicit None checks replace the former bare `except: pass`.
        if label is not None and link is not None and label.text == "Studios:":
            # Print the studio
            print(link.text)

MAPPA
Bones
Pierrot
White Fox
Bandai Namco Pictures
A-1 Pictures
Wit Studio
Bandai Namco Pictures
Sunrise
Madhouse
K-Factory
Sunrise
TMS Entertainment
Bandai Namco Pictures
Shaft
Sunrise
Kyoto Animation
Kyoto Animation
Sunrise
Sunrise
Kyoto Animation
A-1 Pictures
CloverWorks
Bandai Namco Pictures
Shaft
Madhouse
CoMix Wave Films
Bandai Namco Pictures
ufotable
Pierrot
Bones
MAPPA
Shaft
Studio Ghibli
Production I.G
Shaft
MAPPA
Madhouse
Sunrise
Pierrot
Studio LAN
Bones
Artland
Studio Deen
Wit Studio
A-1 Pictures
Tokyo Movie Shinsha
Studio Deen
Artland
Studio Bind


In [33]:
def get_studio(url):
    """Fetch an anime detail page and return its studio name.

    Parameters
    ----------
    url : str
        Address of the detail page.

    Returns
    -------
    str or None
        Text of the first link in the first ``spaceit_pad`` row labelled
        ``"Studios:"``; ``None`` when the page could not be fetched
        (``fetch_html`` returned ``None``) or no such row exists.
    """
    soup = fetch_html(url)
    if soup is None:
        return None

    for element in soup.find_all('div', class_="spaceit_pad"):
        label = element.span
        link = element.a
        # Rows lacking a label span or a link are skipped explicitly rather
        # than through a bare `except`.
        if label is not None and link is not None and label.text == "Studios:":
            return link.text
    return None

In [34]:
def get_studio_themes_genres_demographics(url):
    """Fetch an anime detail page and extract four labelled fields.

    Every ``div`` with class ``spaceit_pad`` is inspected; the text of its
    first ``span`` decides which field the row feeds. When a label occurs
    more than once, the last occurrence wins.

    Parameters
    ----------
    url : str
        Address of the detail page.

    Returns
    -------
    tuple
        ``(studio, themes, genres, demographics)``. ``studio`` is the text
        of the first link in the "Studios:" row; the other three are the
        row's link texts joined with ``', '``. A field is ``None`` when its
        row is absent, and all four are ``None`` when the page could not be
        fetched (``fetch_html`` returned ``None``).
    """
    studio = None
    themes_str = None
    genres_str = None
    demos_str = None

    soup = fetch_html(url)
    if soup is None:
        return studio, themes_str, genres_str, demos_str

    for element in soup.find_all('div', class_="spaceit_pad"):
        label = element.span
        if label is None:
            # Row without a label span: nothing to classify.
            continue
        label_text = label.text

        if label_text == "Studios:":
            link = element.a
            # Leave `studio` untouched when the row carries no link.
            if link is not None:
                studio = link.text
        elif label_text in ("Theme:", "Themes:"):
            themes_str = ', '.join(a.text for a in element.find_all('a'))
        elif label_text in ("Genre:", "Genres:"):
            genres_str = ', '.join(a.text for a in element.find_all('a'))
        elif label_text in ("Demographic:", "Demographics:"):
            demos_str = ', '.join(a.text for a in element.find_all('a'))

    return studio, themes_str, genres_str, demos_str

In [35]:
# Scrape the detail page of every row, one request after another, and store
# the four returned fields as new columns.
detail_columns = ['studio', 'themes', 'genres', 'demographics']
top_anime[detail_columns] = top_anime['url'].apply(
    lambda url: pd.Series(get_studio_themes_genres_demographics(url))
)

In [36]:
# Display the table with the studio, themes, genres and demographics columns added.
top_anime

Unnamed: 0,ranking,score,title,emission_date,url,emission_type,number_episode,members,first_emission,last_emission,studio,themes,genres,demographics
0,1,9.13,Shingeki no Kyojin: The Final Season - Kankets...,Mar 2023 - 2023,https://myanimelist.net/anime/51535/Shingeki_n...,Special,2,370982,2023-03-01,NaT,MAPPA,"Gore, Military, Survival","Action, Drama, Suspense",Shounen
1,2,9.11,Fullmetal Alchemist: Brotherhood,Apr 2009 - Jul 2010,https://myanimelist.net/anime/5114/Fullmetal_A...,TV,64,3120078,2009-04-01,2010-07-01,Bones,Military,"Action, Adventure, Drama, Fantasy",Shounen
2,3,9.08,Bleach: Sennen Kessen-hen,Oct 2022 - Dec 2022,https://myanimelist.net/anime/41467/Bleach__Se...,TV,13,410998,2022-10-01,2022-12-01,Pierrot,,"Action, Adventure, Fantasy",Shounen
3,4,9.08,Steins;Gate,Apr 2011 - Sep 2011,https://myanimelist.net/anime/9253/Steins_Gate,TV,24,2401432,2011-04-01,2011-09-01,White Fox,"Psychological, Time Travel","Drama, Sci-Fi, Suspense",
4,5,9.07,Gintama°,Apr 2015 - Mar 2016,https://myanimelist.net/anime/28977/Gintama°,TV,51,584156,2015-04-01,2016-03-01,Bandai Namco Pictures,"Gag Humor, Historical, Parody, Samurai","Action, Comedy, Sci-Fi",Shounen
5,6,9.06,Kaguya-sama wa Kokurasetai: Ultra Romantic,Apr 2022 - Jun 2022,https://myanimelist.net/anime/43608/Kaguya-sam...,TV,13,780691,2022-04-01,2022-06-01,A-1 Pictures,School,"Comedy, Romance",Seinen
6,7,9.06,Shingeki no Kyojin Season 3 Part 2,Apr 2019 - Jul 2019,https://myanimelist.net/anime/38524/Shingeki_n...,TV,10,2055884,2019-04-01,2019-07-01,Wit Studio,"Gore, Military, Survival","Action, Drama",Shounen
7,8,9.05,Gintama: The Final,Jan 2021 - Jan 2021,https://myanimelist.net/anime/39486/Gintama__T...,Movie,1,127620,2021-01-01,2021-01-01,Bandai Namco Pictures,"Gag Humor, Historical, Parody, Samurai","Action, Comedy, Drama, Sci-Fi",Shounen
8,9,9.04,Gintama',Apr 2011 - Mar 2012,https://myanimelist.net/anime/9969/Gintama,TV,51,515498,2011-04-01,2012-03-01,Sunrise,"Gag Humor, Historical, Parody, Samurai","Action, Comedy, Sci-Fi",Shounen
9,10,9.04,Hunter x Hunter (2011),Oct 2011 - Sep 2014,https://myanimelist.net/anime/11061/Hunter_x_H...,TV,148,2607692,2011-10-01,2014-09-01,Madhouse,,"Action, Adventure, Fantasy",Shounen


In [37]:
def get_studio_themes_genres_demographics(url):
    """Fetch an anime detail page and extract four labelled fields.

    Every ``div`` with class ``spaceit_pad`` is inspected; the text of its
    first ``span`` decides which field the row feeds. When a label occurs
    more than once, the last occurrence wins.

    Parameters
    ----------
    url : str
        Address of the detail page.

    Returns
    -------
    tuple
        ``(studio, themes, genres, demographics)``. ``studio`` is the text
        of the first link in the "Studios:" row; the other three are the
        row's link texts joined with ``', '``. A field is ``None`` when its
        row is absent, and all four are ``None`` when the page could not be
        fetched (``fetch_html`` returned ``None``).
    """
    studio = None
    themes_str = None
    genres_str = None
    demos_str = None

    soup = fetch_html(url)
    if soup is None:
        return studio, themes_str, genres_str, demos_str

    for element in soup.find_all('div', class_="spaceit_pad"):
        label = element.span
        if label is None:
            # Row without a label span: nothing to classify.
            continue
        label_text = label.text

        if label_text == "Studios:":
            link = element.a
            # Leave `studio` untouched when the row carries no link.
            if link is not None:
                studio = link.text
        elif label_text in ("Theme:", "Themes:"):
            themes_str = ', '.join(a.text for a in element.find_all('a'))
        elif label_text in ("Genre:", "Genres:"):
            genres_str = ', '.join(a.text for a in element.find_all('a'))
        elif label_text in ("Demographic:", "Demographics:"):
            demos_str = ', '.join(a.text for a in element.find_all('a'))

    return studio, themes_str, genres_str, demos_str

In [38]:
def process_url(url):
    """Scrape one detail page and return its four fields as a pandas Series.

    The Series holds, in order, the studio, themes, genres and demographics
    returned by ``get_studio_themes_genres_demographics`` for ``url``.
    """
    details = get_studio_themes_genres_demographics(url)
    return pd.Series(details)

In [39]:
# Plain Python list of the detail-page URLs, in the row order of `top_anime`.
urls = top_anime['url'].tolist()

In [40]:
# Scrape all detail pages on a thread pool (default worker count) instead of
# one after another. Executor.map yields results in the order of `urls`, so
# results[i] belongs to urls[i].
with ThreadPoolExecutor() as executor:
    results = list(executor.map(process_url, urls))

In [41]:
# Stack the per-URL Series into a frame (one row per URL) and store its four
# columns under the given names.
# NOTE(review): the assignment aligns rows by index label, so this presumably
# relies on `top_anime` still having its default 0..n-1 index - confirm if
# rows are ever filtered or re-indexed before this cell.
top_anime[['studio', 'themes', 'genres', 'demographics']] = pd.DataFrame(results)

In [42]:
# Display the table after the threaded scrape filled the detail columns.
top_anime

Unnamed: 0,ranking,score,title,emission_date,url,emission_type,number_episode,members,first_emission,last_emission,studio,themes,genres,demographics
0,1,9.13,Shingeki no Kyojin: The Final Season - Kankets...,Mar 2023 - 2023,https://myanimelist.net/anime/51535/Shingeki_n...,Special,2,370982,2023-03-01,NaT,MAPPA,"Gore, Military, Survival","Action, Drama, Suspense",Shounen
1,2,9.11,Fullmetal Alchemist: Brotherhood,Apr 2009 - Jul 2010,https://myanimelist.net/anime/5114/Fullmetal_A...,TV,64,3120078,2009-04-01,2010-07-01,Bones,Military,"Action, Adventure, Drama, Fantasy",Shounen
2,3,9.08,Bleach: Sennen Kessen-hen,Oct 2022 - Dec 2022,https://myanimelist.net/anime/41467/Bleach__Se...,TV,13,410998,2022-10-01,2022-12-01,Pierrot,,"Action, Adventure, Fantasy",Shounen
3,4,9.08,Steins;Gate,Apr 2011 - Sep 2011,https://myanimelist.net/anime/9253/Steins_Gate,TV,24,2401432,2011-04-01,2011-09-01,White Fox,"Psychological, Time Travel","Drama, Sci-Fi, Suspense",
4,5,9.07,Gintama°,Apr 2015 - Mar 2016,https://myanimelist.net/anime/28977/Gintama°,TV,51,584156,2015-04-01,2016-03-01,Bandai Namco Pictures,"Gag Humor, Historical, Parody, Samurai","Action, Comedy, Sci-Fi",Shounen
5,6,9.06,Kaguya-sama wa Kokurasetai: Ultra Romantic,Apr 2022 - Jun 2022,https://myanimelist.net/anime/43608/Kaguya-sam...,TV,13,780691,2022-04-01,2022-06-01,A-1 Pictures,School,"Comedy, Romance",Seinen
6,7,9.06,Shingeki no Kyojin Season 3 Part 2,Apr 2019 - Jul 2019,https://myanimelist.net/anime/38524/Shingeki_n...,TV,10,2055884,2019-04-01,2019-07-01,Wit Studio,"Gore, Military, Survival","Action, Drama",Shounen
7,8,9.05,Gintama: The Final,Jan 2021 - Jan 2021,https://myanimelist.net/anime/39486/Gintama__T...,Movie,1,127620,2021-01-01,2021-01-01,Bandai Namco Pictures,"Gag Humor, Historical, Parody, Samurai","Action, Comedy, Drama, Sci-Fi",Shounen
8,9,9.04,Gintama',Apr 2011 - Mar 2012,https://myanimelist.net/anime/9969/Gintama,TV,51,515498,2011-04-01,2012-03-01,Sunrise,"Gag Humor, Historical, Parody, Samurai","Action, Comedy, Sci-Fi",Shounen
9,10,9.04,Hunter x Hunter (2011),Oct 2011 - Sep 2014,https://myanimelist.net/anime/11061/Hunter_x_H...,TV,148,2607692,2011-10-01,2014-09-01,Madhouse,,"Action, Adventure, Fantasy",Shounen
