# Web scraping multiple tests

In [1]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
#Function to send an HTTP request and parse the HTML content of the page
def fetch_html(url, timeout=10):
    """Download a page and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    bs4.BeautifulSoup or None
        The HTML parsed with the ``lxml`` parser when the server answers
        with status code 200, otherwise ``None``.

    Raises
    ------
    requests.exceptions.RequestException
        Network errors and timeouts raised by ``requests.get`` propagate
        to the caller.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than a plain success is reported as "no page".
    if response.status_code != 200:
        return None

    return BeautifulSoup(response.content, 'lxml')

In [3]:
url = "https://myanimelist.net/topanime.php"  # Define the target URL
# fetch_html returns None for any non-200 response, so `soup` may be None here;
# the cells below call soup.find(...) and assume the request succeeded.
soup = fetch_html(url)  # Fetch and parse the HTML content from the URL

# Scraping Shingeki no Kyojin

## Anime Ranking 

In [4]:
soup.find('span',class_="lightLink top-anime-rank-text rank1").text

'1'

## Anime Score

In [5]:
soup.find('span',class_="text on score-label score-9").text

'9.13'

### Anime name

In [6]:
soup.find('div',class_="di-ib clearfix").a.text

'Shingeki no Kyojin: The Final Season - Kanketsu-hen'

### Anime URL

In [7]:
soup.find('div', class_="di-ib clearfix").a['href']

'https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen'

### Number of episodes

In [8]:
# The info block's text starts with a newline, so index 0 is empty and
# index 1 is its first line (e.g. 'Special (2 eps)').
soup.find('div',class_="information di-ib mt4").text.split('\n')[1].strip()

'Special (2 eps)'

### Emission date

In [9]:
# Index 2 is the second line of the info block: the emission date range.
soup.find('div',class_="information di-ib mt4").text.split('\n')[2].strip()

'Mar 2023 - 2023'

### Number of members

In [10]:
# Index 3 is the third line of the info block: the member count, still
# formatted as text (e.g. '370,323 members').
soup.find('div',class_="information di-ib mt4").text.split('\n')[3].strip()

'370,323 members'

# For every anime

Each anime is stored in a `<tr class="ranking-list">` element, which contains all of the above information for that anime.

In [11]:
soup.find('tr',class_="ranking-list")

<tr class="ranking-list">
<td class="rank ac" valign="top">
<span class="lightLink top-anime-rank-text rank1">1</span>
</td>
<td class="title al va-t word-break">
<a class="hoverinfo_trigger fl-l ml12 mr8" href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen" id="#area51535" rel="#info51535">
<img alt="Anime: Shingeki no Kyojin: The Final Season - Kanketsu-hen" border="0" class="lazyload" data-src="https://cdn.myanimelist.net/r/50x70/images/anime/1279/131078.jpg?s=d6d04a0dcc347ba55e0243b75b0ad5dd" data-srcset="https://cdn.myanimelist.net/r/50x70/images/anime/1279/131078.jpg?s=d6d04a0dcc347ba55e0243b75b0ad5dd 1x, https://cdn.myanimelist.net/r/100x140/images/anime/1279/131078.jpg?s=1b0db37795fa4240d5b66641643f76bb 2x" height="70" width="50"/>
</a>
<div class="detail"><div id="area51535">
<div class="hoverinfo" id="info51535" rel="a51535"></div>
</div>
<div class="di-ib clearfix"><h3 class="hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"><a href

In [12]:
soup.find('tr',class_="ranking-list").span.text

'1'

In [13]:
# Find the first 'tr' element with class 'ranking-list'
tr_element = soup.find('tr', class_="ranking-list")

# Find all 'span' elements within the 'tr' element
span_elements = tr_element.find_all('span')

In [14]:
span_elements

[<span class="lightLink top-anime-rank-text rank1">1</span>,
 <span class="text on score-label score-9">9.13</span>,
 <span class="text score-label score-na">N/A</span>]

In [15]:
span_elements[0].text

'1'

In [16]:
span_elements[1].text

'9.13'

In [17]:
div_elements = tr_element.find('div', class_="detail")

In [18]:
div_elements

<div class="detail"><div id="area51535">
<div class="hoverinfo" id="info51535" rel="a51535"></div>
</div>
<div class="di-ib clearfix"><h3 class="hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"><a href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen" id="#area51535" rel="#info51535">Shingeki no Kyojin: The Final Season - Kanketsu-hen</a></h3><div class="icon-watch-pv2"><a class="mal-icon ml8 ga-click" href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen/video" title="Watch Promotional Video"><i class="malicon malicon-movie-pv"></i></a></div></div><br/><div class="information di-ib mt4">
        Special (2 eps)<br/>
        Mar 2023 - 2023<br/>
        370,323 members
      </div></div>

In [19]:
soup.find('span', class_="lightLink top-anime-rank-text rank1").text

'1'

# First 50 anime

In [20]:
def parse_ranking_row(anime_element):
    """Extract the raw text fields of one ranking-table row.

    Parameters
    ----------
    anime_element : bs4.element.Tag
        A ``<tr class="ranking-list">`` element from the top-anime page.

    Returns
    -------
    dict
        The scraped strings keyed by the DataFrame column names.
    """
    # The first two spans of a row hold the rank and the score.
    row_spans = anime_element.find_all('span')

    # The title link carries both the display name and the detail-page URL.
    title_link = anime_element.find('div', class_="di-ib clearfix").a

    # The info block's text starts with a newline, so index 0 is empty and
    # indices 1-3 are the episode, emission-date and member lines.
    # Look the block up once instead of once per field.
    info_lines = anime_element.find('div', class_="information di-ib mt4").text.split('\n')

    return {
        'ranking': row_spans[0].text,
        'score': row_spans[1].text,
        'title': title_link.text,
        'number_of_episodes': info_lines[1].strip(),
        'emission_date': info_lines[2].strip(),
        'number_members': info_lines[3].strip(),
        'url': title_link['href'],
    }


# Build one record per ranking row. A comprehension keeps the per-row
# temporaries local, so notebook-level names such as `url` and
# `span_elements` are not overwritten by the scraping loop.
records = [
    parse_ranking_row(row)
    for row in soup.find_all('tr', class_='ranking-list')
]

# Passing the columns explicitly keeps the frame well-formed (and the column
# order stable) even when the page yields no rows.
df = pd.DataFrame(
    records,
    columns=[
        'ranking',
        'score',
        'title',
        'number_of_episodes',
        'emission_date',
        'number_members',
        'url',
    ],
)

## Cleaning Pandas Dataframe

In [21]:
# Derive tidy columns from the raw scraped strings, then drop the raw ones.
top_anime = (
    df
        .assign(
            # 'TV (64 eps)' -> 'TV': take the leading run of non-digits, then
            # remove the opening parenthesis and the surrounding whitespace.
            # The '(' is replaced literally (regex=False); compiled as a
            # regular expression it is an unterminated group and raises.
            emission_type = lambda df_: (
                df_['number_of_episodes']
                    .str.extract(r'(\D+)', expand=False)
                    .str.replace('(', '', regex=False)
                    .str.strip()
            ),
            # 'TV (64 eps)' -> '64': first run of digits (NaN when there is none).
            number_episode = lambda df_: (
                df_['number_of_episodes'].str.extract(r'(\d+)', expand=False)
            ),
            # '3,119,535 members' -> '3119535' (kept as text).
            members = lambda df_: (
                df_['number_members']
                    .str.replace('members', '', regex=False)
                    .str.replace(',', '', regex=False)
                    .str.strip()
            ),
        #end assign
        )
        .drop(columns=['number_of_episodes', 'number_members'])
    #end preprocessing
    )

In [22]:
# Split the 'emission_date' column ("Apr 2009 - Jul 2010") on the ' - '
# delimiter. reindex guarantees that both column 0 and column 1 exist, so the
# cell does not fail with a KeyError when no row contains the delimiter.
split_columns = (
    top_anime['emission_date']
        .str.split(' - ', expand=True)
        .reindex(columns=[0, 1])
)

# Parse both parts as "<abbreviated month> <year>". Values that do not match
# that format (for example a bare year such as '2023', or a missing end date)
# are coerced to NaT instead of raising.
top_anime = (
    top_anime
        .assign(
            first_emission = pd.to_datetime(split_columns[0], format='%b %Y', errors='coerce'),
            last_emission  = pd.to_datetime(split_columns[1], format='%b %Y', errors='coerce'),
        ))




In [23]:
top_anime.head()

Unnamed: 0,ranking,score,title,emission_date,url,emission_type,number_episode,members,first_emission,last_emission
0,1,9.13,Shingeki no Kyojin: The Final Season - Kankets...,Mar 2023 - 2023,https://myanimelist.net/anime/51535/Shingeki_n...,Special,2,370323,2023-03-01,NaT
1,2,9.11,Fullmetal Alchemist: Brotherhood,Apr 2009 - Jul 2010,https://myanimelist.net/anime/5114/Fullmetal_A...,TV,64,3119535,2009-04-01,2010-07-01
2,3,9.08,Bleach: Sennen Kessen-hen,Oct 2022 - Dec 2022,https://myanimelist.net/anime/41467/Bleach__Se...,TV,13,410998,2022-10-01,2022-12-01
3,4,9.08,Steins;Gate,Apr 2011 - Sep 2011,https://myanimelist.net/anime/9253/Steins_Gate,TV,24,2400946,2011-04-01,2011-09-01
4,5,9.07,Gintama°,Apr 2015 - Mar 2016,https://myanimelist.net/anime/28977/Gintama°,TV,51,584045,2015-04-01,2016-03-01


### Adding web scraping of detailed information

### Example: scraping detailed information for one anime

In [28]:
# NOTE: this rebinds the notebook-level `url` and `soup`. From here on `soup`
# holds the parsed detail page of the first anime, not the ranking page.
url=top_anime['url'][0]
print(url)
soup = fetch_html(url) 

https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen


In [56]:
soup.find_all('div', class_="spaceit_pad")[9].a.text

'MAPPA'

Some anime pages display more information fields than others, and some display fewer.

In [87]:
div_elements=soup.find_all('div', class_="spaceit_pad")
div_elements[9].span.text

'Studios:'

In [57]:
dark_elements_list=soup.find_all('span', class_="dark_text")
dark_elements_list

[<span class="dark_text">Synonyms:</span>,
 <span class="dark_text">Japanese:</span>,
 <span class="dark_text">English:</span>,
 <span class="dark_text">Type:</span>,
 <span class="dark_text">Episodes:</span>,
 <span class="dark_text">Status:</span>,
 <span class="dark_text">Aired:</span>,
 <span class="dark_text">Producers:</span>,
 <span class="dark_text">Licensors:</span>,
 <span class="dark_text">Studios:</span>,
 <span class="dark_text">Source:</span>,
 <span class="dark_text">Genres:</span>,
 <span class="dark_text">Themes:</span>,
 <span class="dark_text">Demographic:</span>,
 <span class="dark_text">Duration:</span>,
 <span class="dark_text">Rating:</span>,
 <span class="dark_text">Score:</span>,
 <span class="dark_text">Ranked:</span>,
 <span class="dark_text">Popularity:</span>,
 <span class="dark_text">Members:</span>,
 <span class="dark_text">Favorites:</span>]

In [67]:
dark_elements_list[9].text

'Studios:'

In [97]:
# Find the "Studios:" entry by its label rather than by position.
# Elements without a <span> child are reported explicitly; a bare `except:`
# would also hide unrelated errors (typos, interrupts) behind that message.
for element in div_elements:
    label = element.span
    if label is None:
        print("this elements has no span attribute")
    elif label.text == "Studios:":
        studio_link = element.a
        # Only print when the entry actually contains a link.
        if studio_link is not None:
            print(studio_link.text)

MAPPA
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute
this elements has no span attribute


for url in top_anime['url']:
    soup = fetch_html(url)  # Fetch and parse the HTML content from the URL
    print(soup)

In [52]:
soup.select('studios')

[]