<a href="https://colab.research.google.com/github/HakureiPOI/Douban_Scraper/blob/main/AnimaIndex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
import json
import time
import re
import random
import logging
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Create the output directories used below: data/ holds the CSV written by
# Scraper, logs/ holds the log file opened by setup_logger (-p: no error if
# the directory already exists).
!mkdir -p data
!mkdir -p logs

In [None]:
def setup_logger(name=__name__, log_file='logs/log.txt', level=logging.DEBUG):
    """Return a logger named *name* that writes formatted records to *log_file*.

    The function is safe to call repeatedly (for example when a notebook cell
    is re-run): a file handler for *log_file* is attached at most once, so
    records are not written several times. The parent directory of *log_file*
    is created when it does not exist yet.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        log_file: Path of the file that receives the log records.
        level: Threshold level set on the logger.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    import os

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # FileHandler stores its target as an absolute path in ``baseFilename``;
    # compare against that to detect a handler attached by an earlier call.
    target = os.path.abspath(log_file)
    for handler in logger.handlers:
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == target:
            return logger

    os.makedirs(os.path.dirname(target), exist_ok=True)

    formatter = logging.Formatter('%(asctime)s - [%(levelname)s] - %(message)s')

    # No stream handler is attached here: this function sends records to the
    # log file only.
    # Explicit UTF-8 so non-ASCII text in messages is written the same way on
    # every platform.
    file_handler = logging.FileHandler(target, encoding='utf-8')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger

In [None]:
# Shared logger used by the classes below; with the default arguments it
# writes to logs/log.txt.
logger = setup_logger()

In [None]:
class Interface():
    """HTTP client built on a pooled ``requests.Session``.

    Every request carries a User-Agent picked at random from
    ``self.user_agent``, is retried on failure, and is followed by a short
    random pause when it succeeds.
    """

    # Upper bound, in seconds, of the random pause taken after a successful request.
    max_success_delay = 1.0
    # Pause, in seconds, between a failed attempt and the next one.
    retry_delay = 3

    def __init__(self):
        self.session = requests.Session()
        # One adapter with an enlarged connection pool, mounted for both schemes.
        adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.user_agent = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        ]

    def _request(self, method, url, retries, headers, timeout, **kwargs):
        """Send one request with retries; shared implementation of _get/_post.

        Args:
            method: Name of the session method to call ('get' or 'post').
            url: Target URL.
            retries: Maximum number of attempts.
            headers: Optional extra headers; they are merged over the random
                User-Agent, so a caller-supplied 'User-Agent' wins.
            timeout: Timeout in seconds handed to requests, so a stalled
                connection cannot block forever.
            **kwargs: Extra keyword arguments forwarded to the session method.

        Returns:
            The response on success; ``None`` on HTTP 403 or when every
            attempt failed.
        """
        send = getattr(self.session, method)

        for attempt in range(1, retries + 1):
            # Pick a fresh User-Agent for every attempt.
            merged_headers = {'User-Agent': random.choice(self.user_agent)}
            if headers:
                merged_headers.update(headers)

            try:
                response = send(url, headers=merged_headers, timeout=timeout, **kwargs)
                response.raise_for_status()

            except requests.exceptions.HTTPError as e:
                # 403 is treated as final: retrying is skipped.
                if e.response is not None and e.response.status_code == 403:
                    logger.warning(f'requests {method} error: {type(e).__name__}-{e}, skipping')
                    return None
                logger.warning(f'requests {method} error on attempt {attempt}, {type(e).__name__}-{e}')

            except Exception as e:
                # Deliberately broad: any failure of an attempt is logged and retried.
                logger.warning(f'requests {method} error on attempt {attempt}, {type(e).__name__}-{e}')

            else:
                # Short random pause between successful requests.
                time.sleep(random.random() * self.max_success_delay)
                return response

            # Wait before the next attempt; pointless after the last one.
            if attempt < retries:
                time.sleep(self.retry_delay)

        return None

    def _post(self, url, data, retries=3, headers=None, timeout=10):
        """POST *data* to *url*; see ``_request`` for retry and return semantics."""
        return self._request('post', url, retries, headers, timeout, data=data)

    def _get(self, url, retries=3, headers=None, timeout=10):
        """GET *url*; see ``_request`` for retry and return semantics."""
        return self._request('get', url, retries, headers, timeout)

In [None]:
# Single HTTP client instance; it is handed to the Scraper created below.
api = Interface()

In [None]:
class Scraper():
    """Collect ids from the Douban movie search page and save them as CSV.

    Ids are accumulated in ``self.dataframe`` (single column ``'index'``).
    """

    def __init__(self, api):
        # api: client object exposing ``_post(url, data)``; it must return a
        # response with a ``.text`` attribute, or None on failure.
        self.api = api
        self.dataframe = pd.DataFrame(columns=['index'])

    def __get__animas(self, keyword='动漫', max_num=1000, start=795):
        """Fetch search result pages until at least *max_num* ids are collected.

        Args:
            keyword: Search text sent to the search endpoint.
            max_num: Stop once ``self.dataframe`` holds at least this many rows.
            start: Offset added to the number of rows already collected to
                form the ``start`` value of each request.

        The loop also stops when a request yields no response or a page
        yields no ids; otherwise the same request would repeat forever,
        because the offset only advances when rows are added. Any exception
        is logged and ends the collection, keeping what was gathered so far.
        """
        url = 'https://search.douban.com/movie/subject_search'
        try:
            while len(self.dataframe) < max_num:
                payload = {
                    'search_text': keyword,
                    'start': len(self.dataframe) + start,
                }

                response = self.api._post(url, payload)
                if response is None:
                    logger.error('Get animas error: no response from search page, stopping')
                    break

                soup = BeautifulSoup(response.text, 'html.parser')

                # The ids are read from the first <script type="text/javascript"> tag.
                script = soup.find('script', {'type': 'text/javascript'})
                indexes = re.findall(r'"id":\s*(\d+)', script.text) if script is not None else []
                if not indexes:
                    logger.warning(f'No ids found at start={payload["start"]}, stopping')
                    break

                logger.debug(f'Page ids: {indexes}')
                self.dataframe = pd.concat(
                    [self.dataframe, pd.DataFrame(indexes, columns=['index'])],
                    ignore_index=True,
                )

                logger.info(f'Get {len(self.dataframe)} animas')

        except Exception as e:
            # Best effort: log and keep the partial result so run() can still save it.
            logger.error(f'Get animas error: {type(e).__name__}-{e}')

    def __save__animas(self, path='data/animas.csv'):
        """Write the collected ids to *path* as CSV, without the row index.

        The parent directory of *path* is created when it does not exist.
        """
        import os

        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        self.dataframe.to_csv(path, index=False)

    def run(self):
        """Collect the ids, then save them with the default settings."""
        self.__get__animas()
        self.__save__animas()

In [None]:
# Build the scraper on top of the shared HTTP client.
scraper = Scraper(api)

In [None]:
# Collect the ids and write them to the CSV file (default: data/animas.csv).
scraper.run()