# Web scraping multiple tests

In [32]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
def fetch_html(url, timeout=10):
    """Send an HTTP GET request and return the page parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup or None
        The response content parsed with the ``lxml`` parser when the status
        code is 200, otherwise ``None``.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Any non-successful status is reported to the caller as None.
        return None
    return BeautifulSoup(response.content, 'lxml')

In [3]:
url = "https://myanimelist.net/topanime.php"  # Define the target URL
soup = fetch_html(url)  # Fetch and parse the HTML content from the URL

# fetch_html returns None for any non-200 response; stop here with a clear
# message instead of hitting an AttributeError on `soup.find` in a later cell.
if soup is None:
    raise RuntimeError(f"Could not fetch {url}: the server did not answer with status 200")

# Scraping Shingeki no Kyojin

## Anime Ranking 

In [4]:
# Text of the first span carrying the rank-1 badge classes
soup.find('span', class_="lightLink top-anime-rank-text rank1").get_text()

'1'

## Anime Score

In [5]:
# Text of the first score label on the page
soup.find('span', class_="text on score-label score-9").get_text()

'9.13'

### Anime name

In [6]:
# The title is the text of the first link inside the title container
soup.find('div', class_="di-ib clearfix").find('a').get_text()

'Shingeki no Kyojin: The Final Season - Kanketsu-hen'

### Anime URL

In [7]:
# The same link also carries the address of the anime page in its href
soup.find('div', class_="di-ib clearfix").find('a')['href']

'https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen'

### Number of episodes

In [8]:
# The information block holds one field per line; line 1 is the format and episode count
info_block = soup.find('div', class_="information di-ib mt4")
info_block.get_text().split('\n')[1].strip()

'Special (2 eps)'

### Emission date

In [9]:
# Line 2 of the information block is the airing period
info_block = soup.find('div', class_="information di-ib mt4")
info_block.get_text().split('\n')[2].strip()

'Mar 2023 - 2023'

### Number of members

In [10]:
# Line 3 of the information block is the member count
info_block = soup.find('div', class_="information di-ib mt4")
info_block.get_text().split('\n')[3].strip()

'370,241 members'

# For every anime

Each anime is stored in a `<tr class="ranking-list">` element, which contains all of the information above for that anime.

In [11]:
# First table row of the ranking list
soup.find(name='tr', class_="ranking-list")

<tr class="ranking-list">
<td class="rank ac" valign="top">
<span class="lightLink top-anime-rank-text rank1">1</span>
</td>
<td class="title al va-t word-break">
<a class="hoverinfo_trigger fl-l ml12 mr8" href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen" id="#area51535" rel="#info51535">
<img alt="Anime: Shingeki no Kyojin: The Final Season - Kanketsu-hen" border="0" class="lazyload" data-src="https://cdn.myanimelist.net/r/50x70/images/anime/1977/134922.jpg?s=b3e305320d13cee64ee64de190a7264d" data-srcset="https://cdn.myanimelist.net/r/50x70/images/anime/1977/134922.jpg?s=b3e305320d13cee64ee64de190a7264d 1x, https://cdn.myanimelist.net/r/100x140/images/anime/1977/134922.jpg?s=fbc7c2075557045d8afb2de577e59d62 2x" height="70" width="50"/>
</a>
<div class="detail"><div id="area51535">
<div class="hoverinfo" id="info51535" rel="a51535"></div>
</div>
<div class="di-ib clearfix"><h3 class="hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"><a href

In [12]:
# The first span inside the first ranking row holds the rank
soup.find('tr', class_="ranking-list").find('span').get_text()

'1'

In [13]:
# Grab the first ranking row, then collect every <span> nested inside it
tr_element = soup.find(name='tr', class_="ranking-list")
span_elements = tr_element.find_all(name='span')

In [14]:
# Display the spans collected from the first ranking row
span_elements

[<span class="lightLink top-anime-rank-text rank1">1</span>,
 <span class="text on score-label score-9">9.13</span>,
 <span class="text score-label score-na">N/A</span>]

In [15]:
# Position 0 is the rank span
span_elements[0].get_text()

'1'

In [16]:
# Position 1 is the score span
span_elements[1].get_text()

'9.13'

In [17]:
# First 'detail' div of the row (a single element, despite the plural name)
div_elements = tr_element.find(name='div', class_="detail")

In [18]:
# Display the 'detail' div found in the first ranking row
div_elements

<div class="detail"><div id="area51535">
<div class="hoverinfo" id="info51535" rel="a51535"></div>
</div>
<div class="di-ib clearfix"><h3 class="hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"><a href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen" id="#area51535" rel="#info51535">Shingeki no Kyojin: The Final Season - Kanketsu-hen</a></h3><div class="icon-watch-pv2"><a class="mal-icon ml8 ga-click" href="https://myanimelist.net/anime/51535/Shingeki_no_Kyojin__The_Final_Season_-_Kanketsu-hen/video" title="Watch Promotional Video"><i class="malicon malicon-movie-pv"></i></a></div></div><br/><div class="information di-ib mt4">
        Special (2 eps)<br/>
        Mar 2023 - 2023<br/>
        370,241 members
      </div></div>

In [23]:
# Text of the first span carrying the rank-1 badge classes
soup.find('span', class_="lightLink top-anime-rank-text rank1").get_text()

'1'

# First 50 anime

In [89]:
# Create empty lists to store the extracted information (one entry per ranking row)
rankings = []
scores = []
titles = []
number_episodes_list = []
emission_dates = []
members_list = []
urls = []

# Loop through each anime row and extract the information.
# The loop temporaries use names that do not clash with `url` and
# `span_elements` from earlier cells, so running this cell no longer
# overwrites them.
for anime_element in soup.find_all('tr', class_='ranking-list'):
    # Spans of the row: position 0 is the rank, position 1 is the score
    row_spans = anime_element.find_all('span')
    # The title link provides both the name and the address of the anime page
    title_link = anime_element.find('div', class_="di-ib clearfix").a
    # Locate and split the information block once; lines 1-3 hold the
    # episode info, the airing period and the member count
    info_lines = anime_element.find('div', class_="information di-ib mt4").text.split('\n')

    rankings.append(row_spans[0].text)
    scores.append(row_spans[1].text)
    titles.append(title_link.text)
    number_episodes_list.append(info_lines[1].strip())
    emission_dates.append(info_lines[2].strip())
    members_list.append(info_lines[3].strip())
    urls.append(title_link['href'])

# Create a dictionary with keys as column names and values as the lists of extracted data
data = {
    'ranking': rankings,
    'score': scores,
    'title': titles,
    'number_of_episodes': number_episodes_list,
    'emission_date': emission_dates,
    'number_members': members_list,
    'url': urls
}

# Convert the dictionary to a pandas DataFrame
df = pd.DataFrame(data)

## Cleaning Pandas Dataframe

In [99]:
# Turn the raw "Type (N eps)" and "N members" strings into clean columns.
top_anime = (
    df
        .assign(
            # Format (TV, Movie, Special, ...): everything before the opening
            # parenthesis. Anchoring on '(' instead of "non-digits" also handles
            # values such as "TV (? eps)", and no str.replace('(') call is needed
            # (a lone '(' is not a valid regular expression).
            emission_type = lambda df_: (
                df_['number_of_episodes']
                    .str.extract(r'^([^(]+)', expand=False)
                    .str.strip()
            ),
            # First run of digits; missing (NaN) when the count is unknown
            number_episode = lambda df_: df_['number_of_episodes'].str.extract(r'(\d+)', expand=False),
            # Drop the "members" suffix and the thousands separators as literal
            # text, then trim the whitespace left behind
            members = lambda df_: (
                df_['number_members']
                    .str.replace('members', '', regex=False)
                    .str.replace(',', '', regex=False)
                    .str.strip()
            ),
        #end assign
        )
        .drop(columns=['number_of_episodes', 'number_members'])
    #end preprocessing
    )

In [113]:
# Split the 'emission_date' column ("Apr 2009 - Jul 2010") using the ' - ' delimiter.
# reindex guarantees that columns 0 and 1 both exist even when no row contains
# the delimiter; otherwise split_columns[1] would raise a KeyError.
split_columns = (
    top_anime['emission_date']
        .str.split(' - ', expand=True)
        .reindex(columns=[0, 1])
)

top_anime = (
    top_anime
        .assign(
            # Parse both parts as "Mon YYYY"; anything that does not match that
            # format (for example a bare year such as "2023") becomes NaT.
            first_emission = pd.to_datetime(split_columns[0], format='%b %Y', errors='coerce'),
            last_emission  = pd.to_datetime(split_columns[1], format='%b %Y', errors='coerce'),
        ))




In [114]:
# Preview the first rows instead of dumping the whole frame.
# NOTE(review): the saved output of this cell lists 'Unnamed: 0' and
# 'first_emission_y' columns that no cell in this notebook creates; re-run
# from a fresh kernel to refresh it.
top_anime.head(10)

Unnamed: 0,ranking,score,title,emission_date,url,emission_type,number_episode,members,first_emission_y,last_emission,first_emission
0,1,9.13,Shingeki no Kyojin: The Final Season - Kankets...,Mar 2023 - 2023,https://myanimelist.net/anime/51535/Shingeki_n...,Special,2,370241,Mar 2023,NaT,2023-03-01
1,2,9.11,Fullmetal Alchemist: Brotherhood,Apr 2009 - Jul 2010,https://myanimelist.net/anime/5114/Fullmetal_A...,TV,64,3119535,Apr 2009,2010-07-01,2009-04-01
2,3,9.08,Bleach: Sennen Kessen-hen,Oct 2022 - Dec 2022,https://myanimelist.net/anime/41467/Bleach__Se...,TV,13,410998,Oct 2022,2022-12-01,2022-10-01
3,4,9.08,Steins;Gate,Apr 2011 - Sep 2011,https://myanimelist.net/anime/9253/Steins_Gate,TV,24,2400946,Apr 2011,2011-09-01,2011-04-01
4,5,9.07,Gintama°,Apr 2015 - Mar 2016,https://myanimelist.net/anime/28977/Gintama°,TV,51,584045,Apr 2015,2016-03-01,2015-04-01
5,6,9.06,Kaguya-sama wa Kokurasetai: Ultra Romantic,Apr 2022 - Jun 2022,https://myanimelist.net/anime/43608/Kaguya-sam...,TV,13,780136,Apr 2022,2022-06-01,2022-04-01
6,7,9.06,Shingeki no Kyojin Season 3 Part 2,Apr 2019 - Jul 2019,https://myanimelist.net/anime/38524/Shingeki_n...,TV,10,2055884,Apr 2019,2019-07-01,2019-04-01
7,8,9.05,Gintama: The Final,Jan 2021 - Jan 2021,https://myanimelist.net/anime/39486/Gintama__T...,Movie,1,127620,Jan 2021,2021-01-01,2021-01-01
8,9,9.04,Gintama',Apr 2011 - Mar 2012,https://myanimelist.net/anime/9969/Gintama,TV,51,515388,Apr 2011,2012-03-01,2011-04-01
9,10,9.04,Hunter x Hunter (2011),Oct 2011 - Sep 2014,https://myanimelist.net/anime/11061/Hunter_x_H...,TV,148,2607039,Oct 2011,2014-09-01,2011-10-01
