In [40]:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import json

In [41]:
def soup(driver):
    sleep(2)
    print("soup")
    if driver.current_url != 'https://anilist.co/404':
        try:
            popup_html = driver.page_source

            # Parse the HTML content with BeautifulSoup
            popup_soup = BeautifulSoup(popup_html, 'html.parser')

            ## Scraping relevant information from the popup using BeautifulSoup
            # Find all <a> tags with the desired class
            genre_tags = popup_soup.find_all('a', class_='el-tooltip name')

            # Find all <div> tags with the class "rank"
            rank_tags = popup_soup.find_all('div', class_='rank')

            # Extract the title of the anime
            title = popup_soup.find("h1").get_text(strip=True)

            # Extract the description/synopsis
            des = popup_soup.find("p", class_="description content-wrap").get_text(strip=True)

            if title != None and des != None and  genre_tags != None and  rank_tags != None:
                # make list to save the info
                Genre = []
                rank = []

                # Extract and print the text content of each <a> tag
                for tag in genre_tags:
                    genre = tag.get_text(strip=True)
                    Genre.append(genre)

                # Extract and print the text content of rank tags
                for rank_tag in rank_tags:
                    rank_text = rank_tag.get_text(strip=True)
                    rank.append(rank_text)
                genres = dict(zip(Genre,rank))

                return title, des, genres
        
        except:
            sleep(3)
            soup(driver)
    else:
        return "Error404",None,None

In [42]:
def scrape(website_url, driver):
    
    driver.get(website_url)
    sleep(3)
    print("scrape")
    # Extracting links from site
    links = driver.find_elements(By.CLASS_NAME, 'cover-link')
    urls = [link.get_attribute('href') for link in links]
    print(urls)
    urls = [link for link in links if "anime/" in link]
    print(urls)

    try:
        # Click the button
        button = driver.find_element(By.CLASS_NAME, "spoiler-toggle")
        button.click()
        sleep(2)
    except:
        pass
    
    try:
        title, des, genre = soup(driver)
    except:
        title, des, genre = None, None, None
    
    print(title, des, genre, urls)
    return title, des, genre, urls

In [43]:
def write_json(new_data, filename="Anime_data.json"):
    
    # Read the existing JSON data from the file
    existing_data = []
    with open(filename,'r',encoding='utf-8') as file:
        try:
            # First we load existing data into a dict.
            existing_data = json.load(file)
        except:
            existing_data = []
    file.close()
    # Join new_data with existing_data
    existing_data.append(new_data)

    with open(filename,"w",encoding='utf-8') as file:
        
        # convert back to json.
        json.dump(existing_data, file,ensure_ascii=False, indent = 4)
    
    file.close()

In [44]:
def crawler(driver, unvisited_sites, visited_sites):

    # Making a List of website to visit using BSF
    to_visit = list(unvisited_sites - visited_sites)
    print(to_visit)
    # Loop to Scrape one set of to_visit links
    for url in to_visit:
        title, des, genres, urls = scrape(url,driver=driver)
        sleep(2)
        if title != "Error404":
            # Saving them in an list to further save it in Json
            Anime_data = {
                "Anime" : {
                "Anime_Title" : title,
                "Description" : des,
                "Tags" : genres
                },
                "unvisited" : urls,
                "visited" : url
            }

            # Injecting scraped data into json
            write_json(Anime_data,"Anime_data.json")

            # Updating the unvisited and visited sets
            if unvisited_sites != None or unvisited_sites != 'null':
                unvisited_sites.update(urls)
            visited_sites.add(url)
        else:
            visited_sites.add(url)


In [45]:
def load_files(filename='sites_data.json'):
    # Load data from the JSON file
    with open(filename, 'r', encoding='utf-8') as file:
        site_data = json.load(file)

    # Extract the lists from the loaded data
    try:
        visited_sites_list = site_data.get("visited_sites", [])
    except:
        visited_sites_list = []
    try:
        unvisited_sites_list = site_data.get("unvisited_sites", [])
    except:
        visited_sites_list = []
    
    return visited_sites_list, unvisited_sites_list


In [46]:
# Making a set to keep visited & unvisited sites
visited_sites, unvisited_sites = load_files()
visited_sites, unvisited_sites = set(visited_sites), set(unvisited_sites)

In [48]:
# Initializing Driver
driver = webdriver.Chrome()

# Your crawling logic here
driver.switch_to.window(driver.window_handles[0])

try:
    # visting and scraping sites
    while True:
        crawler(driver,unvisited_sites, visited_sites)
except:
    # Create a dictionary to store both sets
    site_data = {
    "visited_sites": list(visited_sites),
    "unvisited_sites": list(unvisited_sites)
    }

    # Save the dictionary to a JSON file
    with open('sites_data.json', 'w', encoding='utf-8') as file:
        json.dump(site_data, file)

# Close the driver properly
# driver.close()
# driver.quit()

['https://anilist.co/anime/21/ONE-PIECE/']
scrape
['https://anilist.co/anime/11061/HUNTERHUNTER-2011/', 'https://anilist.co/anime/20/NARUTO/', 'https://anilist.co/anime/1735/NARUTO-Shippuuden/', 'https://anilist.co/anime/97940/Black-Clover/']
