In [2]:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import json

In [3]:
def soup(driver, call=0):
    """Parse the anime page currently loaded in ``driver``.

    The expected elements may not be present in the page source yet, so
    parsing is retried, sleeping ``call / 5`` seconds after each failed
    attempt, until the attempt counter reaches 15.

    Args:
        driver: Selenium WebDriver whose ``page_source`` is parsed.
        call: Attempt counter to start from (0 allows the full 15 attempts).

    Returns:
        ``(title, description, genres, average_score)`` where ``genres`` maps
        each tag name to the rank text paired with it, or
        ``("Error404", None, None, None)`` when every attempt failed.
    """
    # A loop (instead of recursion) guarantees the final result is always
    # handed back to the caller, whichever attempt produced it.
    while call < 15:
        try:
            page = BeautifulSoup(driver.page_source, 'html.parser')

            # find() yields None for a missing element; the chained attribute
            # access then raises AttributeError, which triggers a retry.
            title = page.find("h1").get_text(strip=True)
            des = page.find("p", class_="description content-wrap").get_text(strip=True)
            score_box = page.find("div", class_='el-tooltip data-set')
            value_text = score_box.find('div', class_='value').text

            tag_names = [tag.get_text(strip=True)
                         for tag in page.find_all('a', class_='el-tooltip name')]
            tag_ranks = [tag.get_text(strip=True)
                         for tag in page.find_all('div', class_='rank')]
            genres = dict(zip(tag_names, tag_ranks))

            return title, des, genres, value_text
        except Exception:
            # Exception (not a bare except) so Ctrl+C still interrupts the retries.
            sleep(call / 5)
            call += 1

    print("failsoup : ", driver.current_url)
    return "Error404", None, None, None


In [4]:
def scrape(website_url, driver):
    """Load an anime page and collect its data plus the anime links on it.

    Args:
        website_url: URL of the page to visit.
        driver: Selenium WebDriver used for navigation.

    Returns:
        ``(title, description, genres, average_score, urls)``. When the
        browser ends up on ``https://anilist.co/404`` the result is
        ``("Error404", None, None, None, None)``; when the page could not be
        parsed the title is ``"Error404"`` and ``urls`` is still returned.
    """
    driver.get(website_url)
    sleep(2)

    if driver.current_url == "https://anilist.co/404":
        print("proc")
        return "Error404", None, None, None, None

    # Reload a few times, pausing a little longer each time, while the page
    # shows no cover links.
    links = driver.find_elements(By.CLASS_NAME, 'cover-link')
    attempt = 0
    while not links and attempt < 4:
        driver.get(website_url)
        sleep(attempt)
        links = driver.find_elements(By.CLASS_NAME, 'cover-link')
        print(links)
        attempt += 1

    # get_attribute() gives None for an element without href; skip those
    # instead of crashing on the substring test.
    hrefs = (link.get_attribute('href') for link in links)
    urls = [href for href in hrefs if href and "/anime/" in href]

    # Click the spoiler toggle when there is one; its absence is not an error.
    try:
        driver.find_element(By.CLASS_NAME, "spoiler-toggle").click()
        sleep(1)
    except Exception:
        pass

    # soup() may hand back None when parsing never succeeded; map that to the
    # same failure marker used for 404 pages so callers can always unpack.
    parsed = soup(driver)
    if parsed is None:
        parsed = ("Error404", None, None, None)
    title, des, genre, average_score = parsed

    return title, des, genre, average_score, urls

In [5]:
def write_json(new_data, filename="Anime_data.json"):
    """Append ``new_data`` to the JSON array stored in ``filename``.

    A missing, empty or unparsable file is treated as an empty array, so the
    very first call creates the file.

    Args:
        new_data: JSON-serialisable record to append.
        filename: Path of the JSON file holding the array.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            existing_data = json.load(file)
    except (FileNotFoundError, ValueError):
        # ValueError covers json.JSONDecodeError (empty or corrupt file).
        existing_data = []

    existing_data.append(new_data)

    # The with-block closes the file; no explicit close() is needed.
    with open(filename, "w", encoding='utf-8') as file:
        json.dump(existing_data, file, ensure_ascii=False, indent=4)

In [6]:
def crawler(driver, unvisited_sites, visited_sites):
    """Scrape every URL that is in ``unvisited_sites`` but not yet visited.

    Each successfully scraped page is appended to ``Anime_data.json``; the
    links found on it are added to ``unvisited_sites`` and the page itself to
    ``visited_sites``. Both sets are mutated in place. The pass stops at the
    first page reported as ``"Error404"``.

    Args:
        driver: Selenium WebDriver passed through to ``scrape``.
        unvisited_sites: Set of discovered URLs.
        visited_sites: Set of URLs already scraped.
    """
    # Snapshot of the current frontier (one breadth-first level).
    to_visit = list(unvisited_sites - visited_sites)
    print(to_visit)

    for url in to_visit:
        title, des, genres, average_score, urls = scrape(url, driver=driver)
        if title == "Error404":
            break

        Anime_data = {
            "Anime": {
                "Anime_Title": title,
                "Description": des,
                "Tags": genres,
                "Average_scores": average_score
            },
            "unvisited": urls,
            "visited": url
        }
        write_json(Anime_data, "Anime_data.json")

        # Truthiness covers both None and an empty list; the former chain of
        # `or` comparisons was always true and would have called update(None).
        if urls:
            unvisited_sites.update(urls)
        visited_sites.add(url)


In [7]:
def load_files(filename='sites_data.json'):
    """Read the persisted crawl state from ``filename``.

    Args:
        filename: Path of a JSON file expected to hold an object with the
            keys ``"visited_sites"`` and ``"unvisited_sites"``.

    Returns:
        ``(visited_sites_list, unvisited_sites_list)``. A missing key, or a
        file whose top-level value is not an object, yields an empty list.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        site_data = json.load(file)

    # Anything other than a JSON object has no .get(); fall back to empty
    # lists for both values.
    if not isinstance(site_data, dict):
        return [], []

    visited_sites_list = site_data.get("visited_sites", [])
    unvisited_sites_list = site_data.get("unvisited_sites", [])
    return visited_sites_list, unvisited_sites_list


In [8]:
# Making a set to keep visited & unvisited sites
def load():
    """Return the persisted crawl state as ``(visited, unvisited)`` sets."""
    visited_list, unvisited_list = load_files()
    return set(visited_list), set(unvisited_list)

In [9]:
def scraper():
    """Run the crawl in a Chrome session and persist progress on exit.

    Crawl passes repeat while there are discovered URLs that have not been
    visited. Whatever ends the run (completion, an error or Ctrl+C), the
    visited/unvisited sets are written to ``sites_data.json`` and the browser
    is shut down.
    """
    # Load state first so a failure here does not leave a browser running.
    visited_sites, unvisited_sites = load()

    driver = webdriver.Chrome()
    try:
        driver.switch_to.window(driver.window_handles[0])

        while unvisited_sites - visited_sites:
            crawler(driver, unvisited_sites, visited_sites)
    except (Exception, KeyboardInterrupt) as exc:
        # As before, the error ends the crawl without propagating; report it
        # instead of swallowing it silently.
        print(f"scraper stopped: {exc!r}")
    finally:
        try:
            site_data = {
                "visited_sites": list(visited_sites),
                "unvisited_sites": list(unvisited_sites)
            }
            with open('sites_data.json', 'w', encoding='utf-8') as file:
                json.dump(site_data, file)
        finally:
            # quit() closes every window and ends the session; a separate
            # close() could raise first and leave the driver process alive.
            driver.quit()

In [None]:
if __name__ == "__main__":
    # Start the crawl; if it raises, report the error and start it once more.
    try:
        scraper()
    except Exception as error:
        print(f'Exception : {error}')
        scraper()


In [None]:
import logging
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import json

# Send log records to debug.log. The threshold is ERROR, so the
# logging.info(...) calls made by the functions in this cell are filtered out
# and never reach the file.
logging.basicConfig(filename='debug.log', level=logging.ERROR)

def soup(driver, call=0):
    """Parse the anime page currently loaded in ``driver``.

    Parsing is retried, logging the error and sleeping ``call / 5`` seconds
    after each failed attempt, until the attempt counter reaches 15.

    Args:
        driver: Selenium WebDriver whose ``page_source`` is parsed.
        call: Attempt counter to start from (0 allows the full 15 attempts).

    Returns:
        ``(title, description, genres, average_score)`` where ``genres`` maps
        each tag name to the rank text paired with it, or
        ``("Error404", None, None, None)`` when every attempt failed.
    """
    # Iterating (rather than recursing) ensures the result of a later attempt
    # is actually returned to the caller.
    while call < 15:
        try:
            page = BeautifulSoup(driver.page_source, 'html.parser')

            # A missing element makes find() return None, so the chained
            # access raises AttributeError and the attempt is retried.
            title = page.find("h1").get_text(strip=True)
            des = page.find("p", class_="description content-wrap").get_text(strip=True)
            score_box = page.find("div", class_='el-tooltip data-set')
            value_text = score_box.find('div', class_='value').text

            tag_names = [tag.get_text(strip=True)
                         for tag in page.find_all('a', class_='el-tooltip name')]
            tag_ranks = [tag.get_text(strip=True)
                         for tag in page.find_all('div', class_='rank')]
            genres = dict(zip(tag_names, tag_ranks))

            return title, des, genres, value_text
        except Exception as e:
            logging.error(f'Exception occurred in soup function: {e}')
            sleep(call / 5)
            call += 1

    logging.info(f"failsoup : {driver.current_url}")
    return "Error404", None, None, None

def scrape(website_url, driver):
    """Load an anime page and collect its data plus the anime links on it.

    Args:
        website_url: URL of the page to visit.
        driver: Selenium WebDriver used for navigation.

    Returns:
        ``(title, description, genres, average_score, urls)``. When the
        browser ends up on ``https://anilist.co/404``, or any unexpected
        error occurs, the result is ``("Error404", None, None, None, None)``;
        when only parsing failed the title is ``"Error404"`` and ``urls`` is
        still returned.
    """
    try:
        driver.get(website_url)
        sleep(2)
        logging.info("scrape")

        if driver.current_url == "https://anilist.co/404":
            logging.info("proc")
            return "Error404", None, None, None, None

        # Reload a few times, pausing a little longer each time, while the
        # page shows no cover links.
        links = driver.find_elements(By.CLASS_NAME, 'cover-link')
        attempt = 0
        while not links and attempt < 4:
            driver.get(website_url)
            sleep(attempt)
            links = driver.find_elements(By.CLASS_NAME, 'cover-link')
            logging.info(links)
            attempt += 1

        # get_attribute() gives None for an element without href; filtering
        # those out keeps one odd element from failing the whole page.
        hrefs = (link.get_attribute('href') for link in links)
        urls = [href for href in hrefs if href and "/anime/" in href]
        logging.info("links")

        # Click the spoiler toggle when there is one.
        try:
            driver.find_element(By.CLASS_NAME, "spoiler-toggle").click()
            sleep(1)
        except Exception as e:
            logging.error(f'Exception occurred: {e}')

        # soup() may hand back None when parsing never succeeded; keep the
        # links that were collected and flag the page with the failure marker.
        parsed = soup(driver)
        if parsed is None:
            parsed = ("Error404", None, None, None)
        title, des, genre, average_score = parsed
        return title, des, genre, average_score, urls
    except Exception as e:
        logging.error(f'Exception occurred in scrape function: {e}')
        return "Error404", None, None, None, None

def write_json(new_data, filename="Anime_data.json"):
    """Append ``new_data`` to the JSON array stored in ``filename``.

    A missing, empty or unparsable file is treated as an empty array, so the
    first call creates the file. Any other failure is logged and not raised.

    Args:
        new_data: JSON-serialisable record to append.
        filename: Path of the JSON file holding the array.
    """
    try:
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                existing_data = json.load(file)
        except (FileNotFoundError, ValueError):
            # Handled here so a not-yet-existing file no longer aborts the
            # write; ValueError covers json.JSONDecodeError.
            existing_data = []

        existing_data.append(new_data)

        with open(filename, "w", encoding='utf-8') as file:
            json.dump(existing_data, file, ensure_ascii=False, indent=4)
    except Exception as e:
        logging.error(f'Exception occurred in write_json function: {e}')

def crawler(driver, unvisited_sites, visited_sites):
    """Scrape every URL that is in ``unvisited_sites`` but not yet visited.

    Each successfully scraped page is appended to ``Anime_data.json``; the
    links found on it are added to ``unvisited_sites`` and the page itself to
    ``visited_sites``. Both sets are mutated in place. The pass stops at the
    first page reported as ``"Error404"``. Errors are logged, not raised.

    Args:
        driver: Selenium WebDriver passed through to ``scrape``.
        unvisited_sites: Set of discovered URLs.
        visited_sites: Set of URLs already scraped.
    """
    try:
        # Snapshot of the current frontier (one breadth-first level).
        to_visit = list(unvisited_sites - visited_sites)
        logging.info(to_visit)

        for url in to_visit:
            title, des, genres, average_score, urls = scrape(url, driver=driver)
            if title == "Error404":
                break

            Anime_data = {
                "Anime": {
                    "Anime_Title": title,
                    "Description": des,
                    "Tags": genres,
                    "Average_scores": average_score
                },
                "unvisited": urls,
                "visited": url
            }
            write_json(Anime_data, "Anime_data.json")

            # Truthiness covers both None and an empty list; the former chain
            # of `or` comparisons was always true.
            if urls:
                unvisited_sites.update(urls)
            visited_sites.add(url)
    except Exception as e:
        logging.error(f'Exception occurred in crawler function: {e}')

def load_files(filename='sites_data.json'):
    """Read the persisted crawl state from ``filename``.

    Returns:
        ``(visited_sites_list, unvisited_sites_list)`` taken from the keys
        ``"visited_sites"`` and ``"unvisited_sites"`` (each defaulting to an
        empty list). Any error is logged and ``([], [])`` is returned.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        return payload.get("visited_sites", []), payload.get("unvisited_sites", [])
    except Exception as e:
        logging.error(f'Exception occurred in load_files function: {e}')
        return [], []

def load():
    """Return the persisted crawl state as ``(visited, unvisited)`` sets.

    Any error is logged and two empty sets are returned instead.
    """
    try:
        visited_list, unvisited_list = load_files()
        return set(visited_list), set(unvisited_list)
    except Exception as e:
        logging.error(f'Exception occurred in load function: {e}')
        return set(), set()

def scraper():
    """Run the crawl in a Chrome session and persist progress on exit.

    Crawl passes repeat while there are discovered URLs that have not been
    visited. Whatever ends the run (completion, an error or Ctrl+C), the
    visited/unvisited sets are written to ``sites_data.json`` and the browser,
    if it was started, is shut down.
    """
    # Bound before the try-block so the cleanup below can always refer to them.
    visited_sites, unvisited_sites = load()
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.switch_to.window(driver.window_handles[0])

        while unvisited_sites - visited_sites:
            crawler(driver, unvisited_sites, visited_sites)
    except Exception as e:
        logging.error(f'Exception occurred in scraper function: {e}')
    finally:
        # finally (not except) so the state is also saved on KeyboardInterrupt
        # and on normal completion.
        try:
            site_data = {
                "visited_sites": list(visited_sites),
                "unvisited_sites": list(unvisited_sites)
            }
            with open('sites_data.json', 'w', encoding='utf-8') as file:
                json.dump(site_data, file)
        finally:
            if driver is not None:
                # quit() closes every window and ends the session.
                driver.quit()

if __name__ == "__main__":
    # Start the crawl; if it raises, log the error and start it once more.
    try:
        scraper()
    except Exception as error:
        logging.error(f'Exception occurred in main function: {error}')
        scraper()


In [8]:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import json
import traceback
import selenium.common.exceptions


class AnimeScraper:
    """Crawler that walks anime pages and records them in ``Anime_data.json``.

    The crawl state (which URLs were visited, which were discovered) is
    rebuilt from the records already stored in the JSON file, so an
    interrupted crawl resumes where it stopped.
    """

    def __init__(self):
        # The driver is created lazily by init_driver().
        self.driver = None
        self.visited_sites, self.unvisited_sites = self.load_sites_data()

    def load_sites_data(self, filename='Anime_data.json'):
        """Rebuild the visited/unvisited URL sets from the stored records.

        Args:
            filename: JSON file holding a list of records, each with a
                ``"visited"`` URL and an ``"unvisited"`` list of URLs.

        Returns:
            ``(visited_sites, unvisited_sites)`` as sets; two empty sets when
            the file is missing, unreadable or not a JSON list.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as anime_file:
                anime_data = json.load(anime_file)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError.
            return set(), set()

        if not isinstance(anime_data, list):
            return set(), set()

        # Real sets: "visited" is a single URL string (add), "unvisited" is a
        # list of URLs (update). Dicts here would raise on update().
        visited_sites = set()
        unvisited_sites = set()
        for entry in anime_data:
            if not isinstance(entry, dict):
                continue
            visited = entry.get("visited")
            unvisited = entry.get("unvisited")
            if visited:
                visited_sites.add(visited)
            if unvisited:
                unvisited_sites.update(unvisited)

        return visited_sites, unvisited_sites

    def init_driver(self):
        """Start a Chrome WebDriver session."""
        self.driver = webdriver.Chrome()

    def close_driver(self):
        """Quit the WebDriver session if one is open; safe to call twice."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                self.driver = None

    def visit_url(self, url):
        """Navigate to ``url`` and wait two seconds for the page to load."""
        if self.driver:
            self.driver.get(url)
            sleep(2)

    def is_404_page(self):
        """Return True when the browser is on ``https://anilist.co/404``."""
        if self.driver:
            return self.driver.current_url == "https://anilist.co/404"
        return False

    def find_links(self, max_tries):
        """Collect the ``/anime/`` links of all ``cover-link`` elements.

        Polls up to ``max_tries`` times, 0.1 s apart, until at least one link
        is found.

        Returns:
            List of URLs; empty when nothing was found or an error occurred.
        """
        if not self.driver:
            return []
        try:
            for _ in range(max_tries):
                elements = self.driver.find_elements(By.CLASS_NAME, 'cover-link')
                # get_attribute() gives None for an element without href.
                hrefs = (element.get_attribute('href') for element in elements)
                valid_urls = [href for href in hrefs if href and "/anime/" in href]
                if valid_urls:
                    return valid_urls
                sleep(0.1)
        except Exception as e:
            print(f"Error while finding links: {e}")
        return []

    def isdriveralive(self):
        """Return True when the driver still answers a ``current_url`` query."""
        try:
            self.driver.current_url
            return True
        except Exception:
            return False

    def click_spoiler_button(self):
        """Click the spoiler toggle when the page has one (best effort)."""
        if self.driver:
            try:
                self.driver.find_element(By.CLASS_NAME, "spoiler-toggle").click()
                sleep(1)
            except Exception:
                # No toggle on this page, or it is not clickable: nothing to do.
                pass

    def parse_popup(self):
        """Parse the currently loaded page.

        Returns:
            ``(title, description, genres, average_score, status_data)`` or
            ``None`` when an expected element is missing or malformed.
        """
        if not self.driver:
            return None
        try:
            page = BeautifulSoup(self.driver.page_source, 'html.parser')

            title = page.find("h1").get_text(strip=True)
            des = page.find("p", class_="description content-wrap").get_text(strip=True)
            average_score = page.find("div", class_='el-tooltip data-set').find('div', class_='value').text
            genre_tags = page.find_all('a', class_='el-tooltip name')
            rank_tags = page.find_all('div', class_='rank')

            status_data = {}
            for status_div in page.find_all('div', class_='status'):
                name = status_div.find('div', class_='name').get_text(strip=True)
                amount = status_div.find('div', class_='amount').get_text(strip=True).split()[0]
                # NOTE(review): the last five characters are dropped before
                # int(); presumably a trailing label such as "Users" - confirm.
                status_data[name] = int(amount[:-5])
        except (AttributeError, ValueError, IndexError):
            # Missing element (AttributeError) or an amount that is empty or
            # not numeric (IndexError / ValueError).
            return None

        if not (title and des and genre_tags and rank_tags):
            return None

        genres = dict(zip(
            [tag.get_text(strip=True) for tag in genre_tags],
            [rank.get_text(strip=True) for rank in rank_tags],
        ))
        return title, des, genres, average_score, status_data

    def scrape_url(self, url):
        """Visit ``url`` and scrape it.

        Returns:
            ``(title, description, genres, average_score, links, stats)`` or
            ``None`` when the page is the 404 page or could not be parsed.
        """
        self.visit_url(url)
        if self.is_404_page():
            # None (not a shorter tuple) so the caller's 6-way unpack is
            # never attempted on a failure.
            return None

        valid_links = self.find_links(200)
        self.click_spoiler_button()

        parsed = self.parse_popup()
        if parsed is None:
            return None
        title, des, genre, average_score, stats = parsed
        return title, des, genre, average_score, valid_links, stats

    def write_json(self, new_data, filename="Anime_data.json"):
        """Append ``new_data`` to the JSON array stored in ``filename``.

        A missing, unreadable or unparsable file is treated as an empty array.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                existing_data = json.load(file)
        except (OSError, ValueError):
            existing_data = []
        existing_data.append(new_data)

        with open(filename, "w", encoding='utf-8') as file:
            json.dump(existing_data, file, ensure_ascii=False, indent=4)

    def _store_result(self, url, result):
        """Persist one scraped page and fold its links into the crawl state."""
        title, des, genres, average_score, urls, stats = result
        Anime_data = {
            "Anime": {
                "Anime_Title": title,
                "Description": des,
                "Tags": genres,
                "Average_scores": average_score,
                "stats": stats
            },
            "unvisited": urls,
            "visited": url
        }
        self.write_json(Anime_data)
        if urls:
            self.unvisited_sites.update(urls)
        self.visited_sites.add(url)

    def crawl(self):
        """Crawl until no unvisited URL is left or the browser goes away.

        Pages that fail are marked visited so they are not retried in this
        run. A lost connection or a closed browser window stops the crawl and
        leaves the current URL unvisited. The driver is always closed at the
        end.
        """
        try:
            self.init_driver()
            self.visited_sites, self.unvisited_sites = self.load_sites_data()
            print(self.unvisited_sites, self.visited_sites)

            keep_going = True
            while keep_going and self.isdriveralive():
                # Recompute the frontier after every pass.
                to_visit = list(self.unvisited_sites - self.visited_sites)
                if not to_visit:
                    break

                for url in to_visit:
                    try:
                        result = self.scrape_url(url)
                        if result:
                            self._store_result(url, result)
                        else:
                            print(f"Error scraping URL: {url}")
                            self.visited_sites.add(url)
                            if not self.isdriveralive():
                                break
                    except selenium.common.exceptions.WebDriverException as e:
                        traceback.print_exc()
                        message = str(e)
                        if ("unknown error: net::ERR_INTERNET_DISCONNECTED" in message
                                or "Message: no such window: target window already closed" in message):
                            # Nothing more can be fetched; stop the crawl.
                            keep_going = False
                            break
                        # Any other driver error: skip this URL instead of
                        # retrying it forever.
                        self.visited_sites.add(url)
                    except Exception:
                        traceback.print_exc()
                        self.visited_sites.add(url)

        except Exception:
            traceback.print_exc()

        finally:
            self.close_driver()

def main():
    """Create an ``AnimeScraper`` and run its crawl.

    Any exception is printed with its traceback; if the scraper object was
    created, its driver is closed.
    """
    # Bound up front so the handler can tell whether construction succeeded.
    scraper = None
    try:
        scraper = AnimeScraper()
        scraper.crawl()
    except Exception:
        traceback.print_exc()
        if scraper is not None:
            scraper.close_driver()

# Run the crawl when this code is executed as the main module.
if __name__ == "__main__":
    main()

In [4]:
import json

def update_sites_data(anime_path='Anime_data.json', sites_path='sites_data.json'):
    """Rebuild the visited/unvisited URL lists from the scraped records.

    Args:
        anime_path: JSON file holding a list of records, each with a
            ``"visited"`` URL and an ``"unvisited"`` list of URLs.
        sites_path: JSON file to update. Its ``"visited_sites"`` and
            ``"unvisited_sites"`` keys are replaced, other keys are kept, and
            the file is created when it does not exist.

    Raises:
        FileNotFoundError: If ``anime_path`` does not exist.
        json.JSONDecodeError: If ``anime_path`` is not valid JSON.
    """
    with open(anime_path, 'r', encoding='utf-8') as anime_file:
        anime_data = json.load(anime_file)

    visited_sites = []
    unvisited_sites = []
    for entry in anime_data:
        visited = entry.get("visited")
        unvisited = entry.get("unvisited")

        if visited:
            visited_sites.append(visited)
        if unvisited:
            unvisited_sites.extend(unvisited)

    # The same URL is discovered from many pages; keep the first occurrence
    # of each so the file does not grow with duplicates.
    visited_sites = list(dict.fromkeys(visited_sites))
    unvisited_sites = list(dict.fromkeys(unvisited_sites))

    # Start from the existing file when there is a usable one.
    try:
        with open(sites_path, 'r', encoding='utf-8') as sites_file:
            sites_data = json.load(sites_file)
    except (FileNotFoundError, ValueError):
        sites_data = {}
    if not isinstance(sites_data, dict):
        sites_data = {}

    sites_data["visited_sites"] = visited_sites
    sites_data["unvisited_sites"] = unvisited_sites

    with open(sites_path, 'w', encoding='utf-8') as sites_file:
        json.dump(sites_data, sites_file, indent=4)

# Regenerate sites_data.json from the records in Anime_data.json.
update_sites_data()
