<a href="https://colab.research.google.com/github/HakureiPOI/Douban_Scraper/blob/main/AnimaComment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/HakureiPOI/Douban_Scraper.git

Cloning into 'Douban_Scraper'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (5/5), 20.88 KiB | 3.48 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [55]:
!git config --global user.name "HakureiPOI"
!git config --global user.email "hakureipoi@qq.com"

In [25]:
import openpyxl
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

In [2]:
import pandas as pd
import requests
import json
import time
import re
import random
import logging
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

In [3]:
!mkdir -p data
!mkdir -p logs

In [4]:
def setup_logger(name=__name__, log_file='logs/log.txt', level=logging.DEBUG):
    logger = logging.getLogger(name)
    logger.setLevel(level)

    formatter = logging.Formatter('%(asctime)s - [%(levelname)s] - %(message)s')

    # stream_handler = logging.StreamHandler()
    # stream_handler.setFormatter(formatter)
    # logger.addHandler(stream_handler)

    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger

In [5]:
logger = setup_logger()

In [6]:
class Interface():
    def __init__(self):
        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_connections = 100, pool_maxsize = 100)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.user_agent = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        ]

    def _post(self, url, data, retries = 3, headers = None):
        for i in range(retries):
            try:
                headers = {
                    'User-Agent' : random.choice(self.user_agent)
                }

                response = self.session.post(url, data = data, headers = headers)
                response.raise_for_status()
                time.sleep(random.random())
                return response

            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403:
                    logger.warning(f'requests get error: {type(e).__name__}-{e}, skipping')
                    return None
                else:
                    logger.warning(f'requests get error on attempt {i + 1}, {type(e).__name__}-{e}')
                    time.sleep(3)

            except Exception as e:
                logger.warning(f'requests get error on attempt {i + 1}, {type(e).__name__}-{e}')
                time.sleep(3)

    def _get(self, url, retries = 3, headers = None):
        for i in range(retries):
            try:
                headers = {
                    'User-Agent' : random.choice(self.user_agent)
                }

                response = self.session.get(url, headers = headers)
                response.raise_for_status()
                time.sleep(random.random())
                return response

            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403:
                    logger.warning(f'requests get error: {type(e).__name__}-{e}, skipping')
                    return None
                else:
                    logger.warning(f'requests get error on attempt {i + 1}, {type(e).__name__}-{e}')
                    time.sleep(3)

            except Exception as e:
                logger.warning(f'requests get error on attempt {i + 1}, {type(e).__name__}-{e}')
                time.sleep(3)

In [7]:
api = Interface()

In [8]:
class Comment():
    def __init__(self, commentator, rating, content, datetime, form = 'short'):
        self.commentator = commentator
        self.rating = rating
        self.content = content
        self.datetime = datetime
        self.form = form

    def __str__(self):
        return f'{self.commentator}-{self.datetime}'

In [9]:
class Anima():
    def __init__(self, index, api):
        self.index = index
        self.url = f'https://movie.douban.com/subject/{index}/'
        self.api = api

        self.title = None
        self.shorts = []
        self.reviews = []

        self.dataframe = pd.DataFrame(columns = ['animal', 'commentator', 'datetime', 'rating', 'content'])

    def __str__(self):
        return f'{self.index}-{self.title}'

    def init(self, retries = 3):
        try:
            for i in range(retries):
                response = self.api._get(self.url)

                if not response:
                    logger.info(f'{self} init failed')
                    return False

                soup = BeautifulSoup(response.text, 'html.parser')
                title = soup.find('span', {'property' : 'v:itemreviewed'})

                if not title:
                    logger.info(f'{self} init waiting, retries : {i}')
                    time.sleep(random.randint(30, 60))
                    continue

                self.title = title.text
                logger.info(f'{self} init success')
                return True

            logger.warning(f'{self} init failed')
            return False

        except Exception as e:
            logger.warning(f'{self} init error, {type(e).__name__}-{e}')

    def __get_reviews(self, retries = 3):
        try:
            while True:
                logger.info(f'{self} get reviews, start : {len(self.reviews)}')

                data = {
                    'start': len(self.reviews),
                }
                url = f'https://movie.douban.com/subject/{self.index}/reviews'

                for i in range(retries):
                    response = self.api._post(url, data)

                    if not response:
                        logger.info(f'{self} get reviews success')
                        return True

                    soup = BeautifulSoup(response.text, 'html.parser')
                    reviews = soup.find_all('div', {'class' : 'main review-item'})

                    if not reviews:
                        logger.info(f'{self} get reviews waiting, retries : {i}')
                        time.sleep(random.randint(30, 60))
                        continue

                    for review in reviews:
                        commnet_id = review.attrs['id']
                        commentator = review.find('a', {'class' : 'name'}).text
                        rating = review.find('span', {'class' : ['allstar50', 'main-title-rating']})

                        if rating:
                            rating = rating.attrs['title']
                        else:
                            rating = None

                        datetime = review.find('span', {'class' : 'main-meta'}).text
                        full_comment = self.api._get(f'https://movie.douban.com/j/review/{commnet_id}/full')

                        if full_comment and full_comment.status_code == 200:
                            try:
                                content = json.loads(full_comment.text)['html']
                                content = BeautifulSoup(content, 'html.parser').text
                                self.reviews.append(Comment(commentator, rating, content, datetime, 'review'))
                            except json.JSONDecodeError as e:
                                logger.warning(f"Get comments error: Invalid JSON response - {e}")
                        else:
                            logger.warning(f"Get comments error: Request failed or empty response")

                    break

                if not reviews:
                    return False

        except Exception as e:
            logger.warning(f'{self} get comments error, {type(e).__name__}-{e}')

    def __get_shorts(self, retries = 3):
        try:
            while True:
                logger.info(f'{self} get shorts, start : {len(self.shorts)}')

                data = {
                    'start': len(self.shorts),
                }
                url = f'https://movie.douban.com/subject/{self.index}/comments'

                for i in range(retries):
                    response = self.api._post(url, data)

                    if not response:
                        logger.info(f'{self} get shorts success')
                        return True

                    soup = BeautifulSoup(response.text, 'html.parser')
                    comments = soup.find_all('div', {'class' : 'comment'})

                    if not comments:
                        logger.info(f'{self} get shorts waiting, retries : {i}')
                        time.sleep(random.randint(30, 60))
                        continue

                    for comment in comments:
                        comment_info = comment.find('span', {'class' : 'comment-info'})
                        commentator = comment_info.find('a').text
                        rating = comment_info.find('span', {'class' : ['allstar50', 'rating']})
                        if rating:
                            rating = rating.attrs['title']
                        else:
                            rating = None
                        datetime = comment_info.find('span', {'class' : 'comment-time'}).attrs['title']
                        content = comment.find('span', {'class' : 'short'}).text

                        self.shorts.append(Comment(commentator, rating, content, datetime, 'short'))

                    break

                if not comments:
                    return False

        except Exception as e:
            logger.warning(f'{self} get comments error, {type(e).__name__}-{e}')

    def get_comments(self, retries = 10):
        if not self.title:
            logger.warning(f'{self} get comments failed, init failed')
            return False

        self.__get_reviews()
        self.__get_shorts()

        for comment in self.reviews + self.shorts:
            self.dataframe = pd.concat([self.dataframe, pd.DataFrame([[self.title, comment.commentator, comment.datetime, comment.rating, comment.content]],
                                                                    columns = ['animal', 'commentator', 'datetime', 'rating', 'content'])])

        logger.info(f'{self} get comments success')
        return True

In [10]:
anima_df = pd.read_csv('/content/Douban_Scraper/animas.csv')

In [11]:
animas = []

In [12]:
for index, row in anima_df.head(50).iterrows():
    animas.append(Anima(row['index'], api))

In [13]:
with ThreadPoolExecutor(max_workers = 50) as executor:
    executor.map(lambda anima : anima.init(), animas)

INFO:__main__:25796222-玉子爱情故事 たまこラブストーリー init success
INFO:__main__:35691909-鬼灭之刃 浅草篇 鬼滅の刃 浅草編 init success
INFO:__main__:26653375-我的英雄学院 僕のヒーローアカデミア init success
INFO:__main__:35205803-堀与宫村 ホリミヤ init success
INFO:__main__:34456027-异度侵入 イド：インヴェイデッド ID:INVADED init success
INFO:__main__:35118256-赛博朋克：边缘行者 Cyberpunk: Edgerunners init success
INFO:__main__:4845425-吊带袜天使 パンティ&ストッキングwithガーターベルト init success
INFO:__main__:36779574-仙逆 年番 init success
INFO:__main__:26925611-你想活出怎样的人生 君たちはどう生きるか init success
INFO:__main__:25851655-四月是你的谎言 四月は君の嘘 init success
INFO:__main__:35258427-间谍过家家 第一季 SPY×FAMILY init success
INFO:__main__:33392923-86 -不存在的战区- 86―エイティシックス― init success
INFO:__main__:26752075-排球少年 第三季 ハイキュー!! 烏野高校 VS 白鳥沢学園高校 init success
INFO:__main__:34925662-回复术士的重来人生 回復術士のやり直し init success
INFO:__main__:35801594-迷宫饭 ダンジョン飯 init success
INFO:__main__:1310177-东京教父 東京ゴッドファーザーズ init success
INFO:__main__:1946839-寒蝉鸣泣之时 ひぐらしのなく頃に init success
INFO:__main__:27080656-普罗米亚 プロメア init success
INFO

In [14]:
with ThreadPoolExecutor(max_workers = 50) as executor:
    executor.map(lambda anima : anima.get_comments(), animas)

INFO:__main__:34456027-异度侵入 イド：インヴェイデッド ID:INVADED get reviews, start : 0
INFO:__main__:36440342-我心里危险的东西 第二季 僕の心のヤバイやつ 2期 get reviews, start : 0
INFO:__main__:36779574-仙逆 年番 get reviews, start : 0
INFO:__main__:35801594-迷宫饭 ダンジョン飯 get reviews, start : 0
INFO:__main__:36765646-蓦然回首 ルックバック get reviews, start : 0
INFO:__main__:36093351-葬送的芙莉莲 葬送のフリーレン get reviews, start : 0
INFO:__main__:10527275-来自新世界 新世界より get reviews, start : 0
INFO:__main__:36441528-青春猪头少年不会梦到红书包女孩 青春ブタ野郎はランドセルガールの夢を見ない get reviews, start : 0
INFO:__main__:25851655-四月是你的谎言 四月は君の嘘 get reviews, start : 0
INFO:__main__:26653375-我的英雄学院 僕のヒーローアカデミア get reviews, start : 0
INFO:__main__:34925662-回复术士的重来人生 回復術士のやり直し get reviews, start : 0
INFO:__main__:4925398-命运石之门 STEINS;GATE get reviews, start : 0
INFO:__main__:36638192-败北女角太多了！ 負けヒロインが多すぎる！ get reviews, start : 0
INFO:__main__:2149193-日在校园 School Days get reviews, start : 0
INFO:__main__:1441053-妄想代理人 get reviews, start : 0
INFO:__main__:35691909-鬼灭之刃 浅草篇 鬼滅の刃 浅草編 get re

In [15]:
comments = pd.concat([anima.dataframe for anima in animas])

In [16]:
comments

Unnamed: 0,animal,commentator,datetime,rating,content
0,异度侵入 イド：インヴェイデッド ID:INVADED,丘贝贝Kyubei,2020-03-19 21:04:19,还行,写作存在其共性：动画剧本也好，期刊论文与专著也罢，作者一定要有一个认知：不要想着在有限的空间...
0,异度侵入 イド：インヴェイデッド ID:INVADED,晓宇,2020-02-10 21:31:36,力荐,【写在最前边】感谢大家一直以来的点赞、回复、收藏、转发，一路走来大家辛苦了。13话的处理我是...
0,异度侵入 イド：インヴェイデッド ID:INVADED,zx2337,2020-01-27 11:58:34,力荐,把我写在B站的长评也搬过来一份吧看完第5集后，我在评论区发了关于JW真实身份的猜想，讨论了J...
0,异度侵入 イド：インヴェイデッド ID:INVADED,未来事务管理局,2020-04-08 22:17:59,力荐,“一部动画，能够在每集20分钟的螺蛳壳里做出如此多变的领域致敬，可以说是狂热粉丝和匠心制作...
0,异度侵入 イド：インヴェイデッド ID:INVADED,尤莲,2020-03-06 18:33:30,力荐,这两天被强烈安利了《异度侵入ID：INVADED》，男主史称话疗圣手。因而抽空看了。为了让自...
...,...,...,...,...,...
0,跃动青春 スキップとローファー,momo,2023-06-15 05:47:43,很差,看不下去，雌竞描写和引导太多了导致评论区都在为男主掐架，设定也很恶心。
0,跃动青春 スキップとローファー,坂口健大郎,2023-06-12 06:42:06,还行,全世界都流行糖水泡泡，世界人民是有多苦。本番女主的设定还附加刘姥姥进大观园的向下关怀感……
0,跃动青春 スキップとローファー,麦子麦子酱,2023-07-12 22:28:39,力荐,治愈啊治愈 中年人需要纯粹的青春！
0,跃动青春 スキップとローファー,晓晓一,2023-07-21 12:16:00,推荐,像片头的歌舞一样，是一部特别轻盈、过于耀眼的青春童话。中年人看的时候忍不住怀疑在现实中单纯能...


In [26]:
def remove_illegal_chars(value):
    if isinstance(value, str):
        return ILLEGAL_CHARACTERS_RE.sub(r'', value)
    else:
        return value

In [27]:
comments = comments.map(remove_illegal_chars)

  comments = comments.applymap(remove_illegal_chars)


In [59]:
comments.to_excel('/content/Douban_Scraper/comments.xlsx', index = False)

In [71]:
!git remote set-url git@github.com:HakureiPOI/Douban_Scraper.git

usage: git remote set-url [--push] <name> <newurl> [<oldurl>]
   or: git remote set-url --add <name> <newurl>
   or: git remote set-url --delete <name> <url>

    --push                manipulate push URLs
    --add                 add URL
    --delete              delete URLs



In [70]:
!git push --set-upstream origin main

fatal: unable to access 'https://HakureiPOI/Douban_Scraper.git/': Could not resolve host: HakureiPOI


In [61]:
!git add /content/Douban_Scraper
!git commit -m "update"
!git push

On branch main
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
  (commit or discard the untracked or modified content in submodules)
	[31mmodified:   Douban_Scraper[m (untracked content)

no changes added to commit (use "git add" and/or "git commit -a")
fatal: The current branch main has no upstream branch.
To push the current branch and set the remote as upstream, use

    git push --set-upstream origin main

