In [4]:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import json
import traceback
import selenium.common.exceptions
from selenium.webdriver.chrome.options import Options


class AnimeScraper:
    def __init__(self):
        self.driver = None
        # self.chrome_options = Options()

    def load_sites_data(self, filename='Anime_data.json'):
        try:
            # Read Anime_data.json
            with open(filename, 'r', encoding='utf-8') as anime_file:
                anime_data = json.load(anime_file)
            
            visited_sites = set()
            unvisited_sites = set()
            for entry in anime_data:
                
                visited = entry.get("visited")
                unvisited = entry.get("unvisited")
                
                if visited:
                    visited_sites.update(visited)
                if unvisited:
                    unvisited_sites.update(unvisited)

            return visited_sites, unvisited_sites
        except Exception as e:
            print(str(e))
            return set(), set()

    def init_driver(self):
        # self.chrome_options.add_argument("--headless")
        # self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver = webdriver.Chrome()

    def close_driver(self):
        if self.driver:
            self.driver.quit()

    def visit_url(self, url):
        if self.driver:
            self.driver.get(url)
            sleep(2)

    def is_404_page(self):
        if self.driver:
            return self.driver.current_url == "https://anilist.co/404"
        return False

    def find_links(self, max_tries):
        if self.driver:
            try:
                for _ in range(max_tries):
                    # Find all links inside the 'cover-link' class divs
                    link_elements = self.driver.find_elements(By.CLASS_NAME, 'cover-link')
                    urls = [link.get_attribute('href') for link in link_elements]
                    valid_urls = [url for url in urls if "/anime/" in url]
                    if len(valid_urls) != 0:
                        return valid_urls
                    else:
                        sleep(0.1)
            except Exception as e:
                print(f"Error while finding links: {e}")
        return []

    def isdriveralive(self):
        try:
            self.driver.current_url
            # or driver.title
            return True
        except:
            return False

    def click_spoiler_button(self):
        if self.driver:
            try:
                button = self.driver.find_element(By.CLASS_NAME, "spoiler-toggle")
                button.click()
                sleep(1)
            except:
                pass

    def parse_popup(self):
        try:
            if self.driver:
                popup_html = self.driver.page_source
                popup_soup = BeautifulSoup(popup_html, 'html.parser')

                title = popup_soup.find("h1").get_text(strip=True)
                des = popup_soup.find("p", class_="description content-wrap").get_text(strip=True)
                average_score = popup_soup.find("div", class_='el-tooltip data-set').find('div', class_='value').text
                genre_tags = popup_soup.find_all('a', class_='el-tooltip name')
                rank_tags = popup_soup.find_all('div', class_='rank')
                status_divs = popup_soup.find_all('div', class_='status')
                status_data = {}

                # Iterate through the status divs and extract the name and amount
                for status_div in status_divs:
                    name = status_div.find('div', class_='name').get_text(strip=True)
                    amount = status_div.find('div', class_='amount').get_text(strip=True).split()[0]  # Extract the number

                    # Store the data in the dictionary
                    status_data[name] = int(amount[:-5])

                if title and des and genre_tags and rank_tags:
                    genres = dict(zip([tag.get_text(strip=True) for tag in genre_tags], [rank.get_text(strip=True) for rank in rank_tags]))
                    return title, des, genres, average_score, status_data
                
        except AttributeError:
            return None
        

    def scrape_url(self, url):
        try:
            self.visit_url(url)
            if self.is_404_page():
                return "Error404", None, None, None, None
            # Fetch valid links after page is fully loaded
            valid_links = self.find_links(200)
            self.click_spoiler_button()
            title, des, genre, average_score, stats = self.parse_popup()
            return title, des, genre, average_score, valid_links, stats
        except TypeError:
            return None        
 
    def write_json(self, new_data, filename="Anime_data.json"):
        existing_data = []
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                existing_data = json.load(file)
        except:
            existing_data = []
        existing_data.append(new_data)

        with open(filename, "w", encoding='utf-8') as file:
            json.dump(existing_data, file, ensure_ascii=False, indent=4)

    def crawl(self):
        try:
            self.init_driver()
            # load site data
            self.visited_sites, self.unvisited_sites = self.load_sites_data()
            to_visit = list(self.unvisited_sites - self.visited_sites)
            while self.unvisited_sites and len(to_visit) != 0 and self.driver.current_url:
                try:
                    # Update the to_visit list after loading the saved data
                    to_visit = list(self.unvisited_sites - self.visited_sites)
                    if to_visit:
                        for url in to_visit:
                            try:
                                result = self.scrape_url(url)
                                if result:
                                    title, des, genres, average_score, urls, stats = result
                                    Anime_data = {
                                        "Anime": {
                                            "Anime_Title": title,
                                            "Description": des,
                                            "Tags": genres,
                                            "Average_scores": average_score,
                                            "stats": stats
                                        },
                                        "unvisited": urls,
                                        "visited": url
                                    }
                                    self.write_json(Anime_data)
                                    if urls:
                                        self.unvisited_sites.update(urls)
                                    self.visited_sites.add(url)
                                else:
                                    print(f"Error scraping URL: {url}")
                                    self.visited_sites.add(url)  # Add the problematic URL to visited sites
                                    if not(self.isdriveralive()):
                                        break
                            
                            except selenium.common.exceptions.WebDriverException as e:
                                if "unknown error: net::ERR_INTERNET_DISCONNECTED" in str(e):
                                    raise ConnectionError
                                
                                elif "Message: no such window: target window already closed" in str(e):
                                    raise RuntimeWarning
                            
                            except Exception:
                                traceback.print_exc()
                                self.visited_sites.add(url)  # Add the problematic URL to visited sites
                                break

                except Exception:
                    traceback.print_exc()
                    break

        except Exception:
            traceback.print_exc()
            
        finally:
            self.close_driver()

def main():
    try:
        scraper = AnimeScraper()
        scraper.crawl()
    except Exception:
        traceback.print_exc()
        scraper.close_driver()

if __name__ == "__main__":
    main()

Traceback (most recent call last):
  File "C:\Users\amit singh\AppData\Local\Temp\ipykernel_13808\586748550.py", line 159, in crawl
    result = self.scrape_url(url)
             ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\amit singh\AppData\Local\Temp\ipykernel_13808\586748550.py", line 124, in scrape_url
    if self.is_404_page():
       ^^^^^^^^^^^^^^^^^^
  File "C:\Users\amit singh\AppData\Local\Temp\ipykernel_13808\586748550.py", line 55, in is_404_page
    return self.driver.current_url == "https://anilist.co/404"
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\amit singh\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 435, in current_url
    return self.execute(Command.GET_CURRENT_URL)["value"]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\amit singh\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 345, in execute
    self.error_handler.check_response(