### 开发者笔记
作者：***HakureiPOI***

最后更新：2024/8/19

功能：根据关键词对ESJ轻小说进行下载

待实现功能：一大堆

使用手册：见实例化部分

---

### 准备工作

In [None]:
!pip install opencc



In [None]:
import pandas as pd
import requests
import logging
import json
import time
import os
import sys
import re
import bs4
import pickle
from opencc import OpenCC
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Make sure the directory that receives the log file exists before logging is configured.
os.makedirs('logs', exist_ok = True)

In [None]:
# Notebook-wide logger: every record is written to the console and to logs/log.txt.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Keep records away from the root logger; when the host environment installs
# its own root handler, propagation makes every message appear twice.
logger.propagate = False

formatter = logging.Formatter('%(asctime)s - [%(levelname)s] - %(message)s')

# Attach the handlers only once. Re-running this cell would otherwise stack
# additional handlers on the same logger and repeat every log line.
if not logger.handlers:
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(formatter)
    logger.addHandler(streamHandler)

    # Log messages contain CJK book titles, so the file encoding is fixed to
    # UTF-8 instead of depending on the platform default.
    fileHandler = logging.FileHandler('logs/log.txt', encoding = 'utf-8')
    fileHandler.setFormatter(formatter)
    logger.addHandler(fileHandler)

---
### 轻小说爬虫类

以后应该都会按照这个模板来实现

In [None]:
class AbstractScraper():
    """Template that site-specific light-novel scrapers are expected to follow.

    The constructor stores the credentials and the output location, creates
    the HTTP ``Interface`` and immediately calls ``login()``. Every other
    method is a no-op placeholder that a subclass overrides; the signatures
    mirror the ones used by the concrete ``Scraper`` so both can be called
    the same way.
    """

    def __init__(self, usr, pwd, domain_name, path):
        """Store the configuration, build the HTTP interface and log in.

        Args:
            usr: Account name used for the login request.
            pwd: Account password.
            domain_name: Base URL of the site.
            path: Directory in which scraped novels are stored.
        """
        self.usr = usr
        self.pwd = pwd
        self.domain_name = domain_name
        self.path = path
        self.api = Interface(usr, pwd, domain_name)
        self.keyword = ''       # search keyword; empty until set_keyword() is called
        self.novels = []        # novels currently held by the scraper
        self.simplify = False   # whether a Simplified-Chinese copy is requested
        self.login()

    def login(self):
        """Authenticate against the site (placeholder)."""
        pass

    def search(self, novel_type = 0, order_method = 1):
        """Run a keyword search (placeholder)."""
        pass

    def print_novels(self):
        """Print the novels currently held (placeholder)."""
        pass

    def set_keyword(self, keyword):
        """Set the search keyword (placeholder)."""
        pass

    def set_simplify(self, simplify = False):
        """Choose whether a Simplified-Chinese copy is produced (placeholder)."""
        pass

    def get_web_novels(self):
        """Fill the novel list from the website (placeholder)."""
        pass

    def get_local_novels(self, filter = False):
        """Fill the novel list from local storage (placeholder)."""
        pass

    def update_novels(self):
        """Update every novel in the list (placeholder)."""
        pass

    def update_1_novel(self):
        """Update a single novel chosen by the user (placeholder)."""
        pass

    def download_novels(self, novel = None):
        """Download the chapters of ``novel`` (placeholder)."""
        pass

    def save_novels(self):
        """Persist every novel in the list (placeholder)."""
        pass

In [None]:
class Scraper(AbstractScraper):
    """Keyword-driven scraper built on the ``AbstractScraper`` template.

    Every public method catches its own exceptions and reports them through
    ``logger`` instead of raising, so a failing step never aborts the caller.
    """

    def __init__(self, usr, pwd, domain_name, path):
        super().__init__(usr, pwd, domain_name, path)

    @staticmethod
    def __ensure_ok(response):
        """Raise ``ConnectionError`` unless ``response`` is an HTTP 200 response.

        The ``Interface`` request helpers return ``None`` once their retries
        are exhausted, so ``None`` is treated as a failure as well. An explicit
        check is used instead of ``assert`` because assertions are removed
        when Python runs with ``-O``.
        """
        if response is None or response.status_code != 200:
            raise ConnectionError('request failed or returned a non-200 status')

    @staticmethod
    def __parse_total_pages(soup):
        """Return the page count announced by the result page, or 1 if absent.

        The count is read from the second-to-last ``<script>`` element, where
        it appears as ``total: N,``.
        """
        scripts = soup.find_all('script')
        if len(scripts) >= 2:
            match = re.search(r"total:\s(\d+),", scripts[-2].text)
            if match:
                return int(match.group(1))
        # No pagination information found: only the page at hand is processed.
        return 1

    def login(self):
        """Log in through the HTTP interface and log the outcome."""
        try:
            self.__ensure_ok(self.api.login())
            logger.info(f'login success, {self.usr}')
        except Exception as e:
            logger.warning(f'login error, {type(e).__name__}-{e}')

    def set_keyword(self, keyword):
        """Store ``keyword`` (converted to ``str``) as the search keyword."""
        try:
            self.keyword = str(keyword)
            logger.info(f'set keyword to {self.keyword}')

        except Exception as e:
            logger.warning(f'set keyword error, {type(e).__name__}-{e}')

    def set_simplify(self, simplify):
        """Store whether a Simplified-Chinese copy is written when saving."""
        try:
            self.simplify = bool(simplify)
            logger.info(f'set simplify to {self.simplify}')

        except Exception as e:
            logger.warning(f'set simplify error, {type(e).__name__}-{e}')

    def search(self, novel_type = 0, order_method = 1):
        """Request the first result page for the current keyword.

        Returns:
            The response object on success, ``None`` when the request failed
            (the failure is logged).
        """
        try:
            response = self.api.search(self.keyword, novel_type, order_method)
            self.__ensure_ok(response)
            logger.info(f'search success, {self.keyword}')
            return response
        except Exception as e:
            logger.warning(f'search error, {type(e).__name__}-{e}')
            return None

    def print_novels(self):
        """Print a numbered list of the novels currently held."""
        try:
            print('novels are as follows:')
            for index, novel in enumerate(self.novels, start = 1):
                print(f'{index :<4} : {novel}')
            print(f'altogether, there are {len(self.novels)} novels.')

        except Exception as e:
            logger.warning(f'print novels error, {type(e).__name__}-{e}')

    def get_web_novels(self):
        """Replace the novel list with every search result for the keyword.

        Walks all result pages; novels gathered before a failing page remain
        in ``self.novels``.
        """
        try:
            self.novels.clear()
            novel_type, order_method = 0, 1
            response = self.search(novel_type, order_method)
            self.__ensure_ok(response)

            soup = BeautifulSoup(response.text, 'html.parser')
            # Falls back to a single page when the page count cannot be read,
            # so the first page is still collected.
            total_pages = self.__parse_total_pages(soup)
            current_page = 1

            while True:
                for card in soup.find_all('div', class_ = 'card-body'):
                    url = card.find('a')['href']
                    bookname = card.find('h5', class_ = 'card-title').text.strip()
                    author = card.find('div', class_ = 'card-author').text.strip()
                    self.novels.append(Novel(url, bookname, author))

                current_page += 1
                if current_page > total_pages:
                    break

                next_page_url = f'/tags-{novel_type}{order_method}/{self.keyword}/{current_page}.html'
                response = self.api.open(next_page_url)
                self.__ensure_ok(response)
                soup = BeautifulSoup(response.text, 'html.parser')

                logger.info(f'get page success, next page url : {next_page_url}')

            logger.info(f'get novels success, {len(self.novels)} novels')

        except Exception as e:
            logger.warning(f'get novels error, {type(e).__name__}-{e}')

    def get_local_novels(self, filter = False):
        """Replace the novel list with the novels stored under ``self.path``.

        Args:
            filter: When true, keep only novels whose title, author,
                introduction or tags match ``self.keyword``. The keyword is
                passed to ``re.search`` and is therefore treated as a regular
                expression.
        """
        try:
            self.novels.clear()
            for bookname in os.listdir(self.path):
                pickle_path = f'{self.path}/{bookname}/{bookname}.pkl'
                # Stray files and folders without a pickle are not novels.
                if not os.path.isfile(pickle_path):
                    continue

                novel = Novel('', '', '')
                novel.load(pickle_path)
                # Novel.load() only logs on failure and leaves the object
                # blank; such placeholders must not end up in the list.
                if not (novel.url or novel.bookname):
                    continue

                if filter:
                    detail = novel.bookname + novel.author + novel.introduction + ''.join(novel.tags)
                    if not re.search(self.keyword, detail):
                        continue

                self.novels.append(novel)

            logger.info(f'get local novels success, {len(self.novels)} novels')

        except Exception as e:
            logger.warning(f'get local novels error, {type(e).__name__}-{e}')

    def update_novels(self):
        """Update every novel in the list, using a thread pool."""
        try:
            if not self.novels:
                logger.warning('novels is empty')
                return

            # Leaving the ``with`` block waits for all submitted updates.
            with ThreadPoolExecutor(max_workers = 50) as executor:
                executor.map(self.__update_helper, self.novels)

            logger.info(f'update novels success, {len(self.novels)} novels')

        except Exception as e:
            logger.warning(f'update novels error, {type(e).__name__}-{e}')

    def update_1_novel(self):
        """Let the user pick one novel from the printed list and update it."""
        try:
            if not self.novels:
                logger.warning('novels is empty')
                return

            self.print_novels()

            selected = 0
            while not 1 <= selected <= len(self.novels):
                try:
                    selected = int(input('select novel to update : '))
                except ValueError:
                    # Non-numeric input is handled like an out-of-range number.
                    selected = 0
                if not 1 <= selected <= len(self.novels):
                    print('invalid input')

            self.__update_helper(self.novels[selected - 1])

        except Exception as e:
            logger.warning(f'update 1 novel error, {type(e).__name__}-{e}')

    def __update_helper(self, novel):
        """Refresh the metadata of ``novel`` and re-download it if it changed.

        The update date shown on the novel page is compared with
        ``novel.last_update``; only when they differ are the introduction,
        tags and chapter list rebuilt and the chapters downloaded.
        """
        try:
            response = self.api.open(novel.url)
            self.__ensure_ok(response)
            soup = BeautifulSoup(response.text, 'html.parser')

            last_update = soup.find('strong', string = '更新日期:').parent.text
            match = re.search(r'(\d{4}-\d{2}-\d{2})', last_update)
            if match:
                last_update = match.group(1)

            if last_update == novel.last_update:
                logger.info(f'novel not updated, {novel}')
                return

            previous_update = novel.last_update
            novel.last_update = last_update
            novel.introduction = soup.find('div', class_ = 'description').text.strip()
            novel.tags = [tag.text for tag in soup.find_all('a', class_ = 'tag')]

            # Rebuild the chapter table from scratch so that rows of chapters
            # that no longer exist on the site do not linger.
            anchors = soup.find_all('a', {'data-title' : re.compile(r'.+')})
            rows = [[index, anchor['data-title'], anchor['href']] for index, anchor in enumerate(anchors)]
            novel.chapters = pd.DataFrame(rows, columns = ['index', 'title', 'url'])

            if self.download_novels(novel):
                logger.info(f'update novel success, {novel}')
            else:
                # Restore the old date so that the next run retries this novel.
                novel.last_update = previous_update
                logger.warning(f'update novel failed, {novel}')

        except Exception as e:
            logger.warning(f'update helper error, {novel} : {type(e).__name__}-{e}')

    def download_novels(self, novel):
        """Download every chapter of ``novel`` and save the result.

        The text is collected in a fresh list and assigned to ``novel.text``
        only after all chapters were fetched, so repeated downloads do not
        accumulate duplicates and a failed run leaves the old text untouched.

        Returns:
            ``True`` when all chapters were fetched and ``novel.save`` was
            called, ``False`` when an error interrupted the download.
        """
        try:
            text = []
            chapters = zip(novel.chapters['title'], novel.chapters['url'])
            for number, (chapter_title, chapter_url) in enumerate(chapters, start = 1):
                title = f'第 {number} 章 - {chapter_title}\n'
                text.append(title)

                response = self.api.direct_open(chapter_url)
                self.__ensure_ok(response)
                soup = BeautifulSoup(response.text, 'html.parser')
                container = soup.find('div', class_ = re.compile(r'forum'))

                # Page without the expected content container: keep the
                # heading only and move on to the next chapter.
                if container is None:
                    continue

                text.extend(p.text for p in container.find_all('p', string = re.compile(r'.+')))

                logger.info(f'download chapter success, {novel.bookname} : {title[:-1]}')

            novel.text = text
            novel.save(self.path, self.simplify)
            return True

        except Exception as e:
            logger.warning(f'download novels error, {type(e).__name__}-{e}')
            return False

    def save_novels(self):
        """Save every novel in the list, honouring the ``simplify`` setting."""
        try:
            for novel in self.novels:
                # Novel.save() logs its own success or failure, so no extra
                # (and possibly wrong) success message is emitted here.
                novel.save(self.path, self.simplify)

        except Exception as e:
            logger.warning(f'save novels error, {type(e).__name__}-{e}')

---
### 小说类
感觉如果要定期维护更新需要把这个类写下来或者把 url 和 last_update 写进小说的本地文件中啊

或者说保存的时候保存到一个文件夹，里面存点别的东西

In [None]:
class Novel():
    """A single novel: its metadata, chapter table and downloaded text.

    Attributes are plain instance attributes; ``chapters`` is a DataFrame
    with the columns ``index``, ``title`` and ``url`` and ``text`` is a list
    of text lines.
    """

    def __init__(self, url, bookname, author):
        self.url = url
        self.bookname = bookname
        self.author = author
        self.last_update = ''
        self.introduction = ''
        self.tags = []
        self.chapters = pd.DataFrame(columns = ['index', 'title', 'url'])
        self.text = []

    def __str__(self):
        return f'{self.bookname} - {self.author}'

    def __storage_name(self):
        """Return ``bookname`` with characters that break file paths replaced.

        Only characters that cannot be part of a file name on the running
        platform are replaced, so names that could be stored before keep
        exactly the same location.
        """
        unsafe = '\\/:*?"<>|' if os.name == 'nt' else '/'
        return self.bookname.translate(str.maketrans({char: '_' for char in unsafe}))

    def load(self, path):
        """Restore this object's attributes from the pickle file at ``path``.

        Errors are logged, not raised; the object is left unchanged then.
        """
        try:
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # load files that this program wrote itself.
            with open(path, 'rb') as f:
                novel = pickle.load(f)
            self.__dict__.update(novel.__dict__)

        except Exception as e:
            logger.warning(f'load novel error, {type(e).__name__}-{e}')

    def save(self, path, simplify = False):
        """Write the novel below ``path`` in a folder named after the book.

        Creates ``<name>.pkl`` (the pickled object) and ``<name>.txt`` (title,
        author, introduction and text). With ``simplify`` true an additional
        ``<name>（简中）.txt`` holds the text converted with OpenCC ``t2s``.
        Errors are logged, not raised.

        Args:
            path: Directory that contains one folder per novel.
            simplify: Whether to also write the converted copy.
        """
        try:
            name = self.__storage_name()
            folder = f'{path}/{name}'
            os.makedirs(folder, exist_ok = True)

            with open(f'{folder}/{name}.pkl', 'wb') as f:
                pickle.dump(self, f)

            content = f'{self.bookname}\n{self.author}\n{self.introduction}\n' + '\n'.join(self.text)
            with open(f'{folder}/{name}.txt', 'w', encoding = 'utf-8') as f:
                f.write(content)

            if simplify:
                # Convert the text already in memory instead of reading the
                # file that was just written back in.
                simplified_text = OpenCC('t2s').convert(content)
                with open(f'{folder}/{name}（简中）.txt', 'w', encoding = 'utf-8') as f:
                    f.write(simplified_text)

            logger.info(f'save novel success, {self}')

        except Exception as e:
            logger.warning(f'save novel error, {type(e).__name__}-{e}')

---
### 接口类

主要完成获取 response 的功能

In [None]:
class Interface():
    """HTTP layer of the scraper: one shared session plus retrying requests.

    All request methods return the response object on success and ``None``
    when every attempt failed; failures are logged, never raised.
    """

    def __init__(self, usr, pwd, domain_name, timeout = 30):
        """Create the session used for every request.

        Args:
            usr: Account name sent with the login request.
            pwd: Account password.
            domain_name: Base URL of the site, with or without trailing slash.
            timeout: Seconds to wait for a response before an attempt counts
                as failed. Without a timeout a stalled connection would block
                the calling thread indefinitely.
        """
        self.usr = usr
        self.pwd = pwd
        self.domain_name = domain_name
        self.timeout = timeout
        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_connections = 100, pool_maxsize = 100)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
        }

    def __url(self, path):
        """Join ``domain_name`` and ``path`` with exactly one slash between them."""
        return f"{self.domain_name.rstrip('/')}/{str(path).lstrip('/')}"

    def __attempt(self, send, verb, retries):
        """Call ``send`` up to ``retries`` times and return the first good response.

        A response counts as good when ``raise_for_status()`` does not raise.
        Returns ``None`` when all attempts failed.
        """
        for i in range(retries):
            try:
                response = send()
                response.encoding = response.apparent_encoding
                response.raise_for_status()
                return response
            except Exception as e:
                logger.warning(f'requests {verb} error on attempt {i + 1}, {type(e).__name__}-{e}')
                # Pause between attempts only; there is nothing to wait for
                # after the final one.
                if i + 1 < retries:
                    time.sleep(1)
        return None

    def __post(self, url, data, retries = 3):
        """POST ``data`` to ``url`` with retries."""
        return self.__attempt(
            lambda: self.session.post(url, data = data, headers = self.headers, timeout = self.timeout),
            'post',
            retries,
        )

    def __get(self, url, retries = 3):
        """GET ``url`` with retries."""
        return self.__attempt(
            lambda: self.session.get(url, headers = self.headers, timeout = self.timeout),
            'get',
            retries,
        )

    def login(self):
        """Send the login form with the stored credentials."""
        data = {
            'email': self.usr,
            'pwd': self.pwd,
            'remember_me' : 'on'
        }
        return self.__post(self.__url('inc/mem_login.php'), data)

    def search(self, keyword, novel_type, order_method):
        """Fetch the first result page of the tag search for ``keyword``."""
        return self.__get(self.__url(f'tags-{novel_type}{order_method}/{keyword}/'))

    def open(self, url):
        """Fetch a site-relative ``url`` (a leading slash is optional)."""
        return self.__get(self.__url(url))

    def direct_open(self, url):
        """Fetch ``url`` exactly as given, without prefixing the domain."""
        return self.__get(url)

---
### 实例化

***使用方法：***
* Step1: 使用 **usr, pwd, domain_name, path** 实例化一个 Scraper 对象

|<font size='3'>变量名</font>|<font size='3'>含义</font>|
|---|---|
|<font size='3'>usr</font>|<font size='3'>你的用户名（ESJ是邮箱登录）</font>|
|<font size='3'>pwd</font>|<font size='3'>你的密码</font>|
|<font size='3'>domain_name</font>|<font size='3'>ESJ网站域名（https://www.esjzone.cc/）</font>|
|<font size='3'>path</font>|<font size='3'>爬取的数据保存地址</font>|

* Step2: 调用 ***scraper.set_keyword()*** 方法指定搜索关键词（不指定的情况下默认全部）

* Step3: *（可选）*调用 ***scraper.set_simplify()*** 方法来指定是否自动下载简中副本

* Step4: 调用 ***scraper.get_web_novels()*** 或者 ***scraper.get_local_novels()*** 获得小说列表

* Step5: *（可选）*调用 ***scraper.print_novels()*** 查看现在爬虫获取的轻小说列表

(***scraper.get_local_novels()*** 可以指定参数 **filter** 来决定是否使用关键词对本地小说进行筛选)

* Step6: 调用 ***scraper.update_novels()*** 或 ***scraper.update_1_novel()*** 来对列表中全部小说或者指定小说进行更新

* Step7: 如果使用 **Colab** 运行此程序，可以从左侧的文件中自行下载需要的文件或者自行挂载 **Google Drive** 保存文件  

以下是一个简单的实例:

In [None]:
# Credentials are taken from the environment (ESJ_USER / ESJ_PASSWORD) or asked
# for interactively. They must never be written into the notebook itself,
# because the notebook is shared together with its source.
from getpass import getpass

usr = os.environ.get('ESJ_USER') or input('ESJ account email : ')
pwd = os.environ.get('ESJ_PASSWORD') or getpass('ESJ password : ')
domain_name = 'https://www.esjzone.cc/'
path = 'novels'

In [None]:
# Create the output directory for scraped novels if it does not exist yet.
os.makedirs(path, exist_ok = True)

In [None]:
# Build the scraper; the base-class constructor also calls login().
scraper = Scraper(usr, pwd, domain_name, path)

2024-08-19 05:38:08,469 - [INFO] - login success, hakureipoi@qq.com
INFO:__main__:login success, hakureipoi@qq.com


In [None]:
# Store the keyword that later searches and local filtering use.
scraper.set_keyword('R18')

2024-08-19 05:38:08,507 - [INFO] - set keyword to R18
INFO:__main__:set keyword to R18


In [None]:
# Ask for an additional Simplified-Chinese copy when downloaded novels are saved.
scraper.set_simplify(True)

2024-08-19 05:38:08,532 - [INFO] - set simplify to True
INFO:__main__:set simplify to True


In [None]:
# Fill the scraper's novel list from the site's search results for the keyword.
scraper.get_web_novels()

2024-08-19 05:38:10,060 - [INFO] - search success, R18
INFO:__main__:search success, R18
2024-08-19 05:38:11,248 - [INFO] - get page success, next page url : /tags-01/R18/2.html
INFO:__main__:get page success, next page url : /tags-01/R18/2.html
2024-08-19 05:38:12,898 - [INFO] - get page success, next page url : /tags-01/R18/3.html
INFO:__main__:get page success, next page url : /tags-01/R18/3.html
2024-08-19 05:38:14,212 - [INFO] - get page success, next page url : /tags-01/R18/4.html
INFO:__main__:get page success, next page url : /tags-01/R18/4.html
2024-08-19 05:38:15,378 - [INFO] - get page success, next page url : /tags-01/R18/5.html
INFO:__main__:get page success, next page url : /tags-01/R18/5.html
2024-08-19 05:38:17,006 - [INFO] - get page success, next page url : /tags-01/R18/6.html
INFO:__main__:get page success, next page url : /tags-01/R18/6.html
2024-08-19 05:38:18,384 - [INFO] - get page success, next page url : /tags-01/R18/7.html
INFO:__main__:get page success, next 

In [None]:
# Show the numbered list of novels currently held by the scraper.
scraper.print_novels()

novels are as follows:
1    : 恐怖遊戲女僕生存記 - 김욤뇸(KimYomYum)
2    : 誘惑禁止条例　※但性犯罪受害不算违反条例 - 大中小太郎
3    : 在贞操逆转世界被单亲妈妈收养 - 九头龙八壹
4    : 為尋找對御宅族溫柔的辣妹而走遍三千里。初次交往的女朋友，竟然是我絕對不可能想到的淫娃辣妹。～如今每天沉浸在甜蜜做愛中被懇求授孕的日子～ - 孕間せん
5    : 西方岛Berde·冰雪边陲的女儿 - Frandica_Alanzo
6    : 不要走進那個夜晚 - 借鑒貳下,九月籠城
7    : 一覺醒來變成妻子的我居然還懷孕了 - 加布
8    : 在里世界获取我亲爱的私有物品 - 绯红树叶
9    : 在淫靡的洞窟深處 - 東雲マサキ
10   : 约会大作战：关于Bad End线的五河士道重生的那些事 - 虚无圣母
11   : 我對妹妹的感情，到底為何？ - 虚拟沉默（阿虚）
12   : 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 - 被輪巫女
13   : 特殊技能十分色情 - 被輪巫女
14   : 反逆的勇者〜套路垃圾異世界召喚與日本逆傳送〜 - 川崎悠
15   : 照顾家里蹲的妹妹，不知为何变成了中出美少女 - ドピュリスト村尾
16   : 这是一个关于利用时空跳跃努力改变同学残酷命运的故事。 - Madhatter
17   : 转生为将军的儿子，然后得到反派大小姐。 - さささのよし
18   : 異世界收藏家 - 星見宇佐
19   : 淫邪神的神子姬 ～转生后总是给神明背黑锅不知何时被当成邪神对待而且收到的全都是淫秽的祈祷，比起这个更重要的是为什么转生后是女孩子啊～ - v-sx
20   : 冰糖做的翅膀 - 英俊侠
21   : 魅魔的我 - 百合图书
22   : 哥哥、我那邊的毛已經剃掉了。 - 落花生
23   : 絲襪教in異世界 - 玲音
24   : 在異世界勇者背後暗中活躍的男人 - まじかり
25   : 屠杀鬼~发誓要复仇的野兽的绝伦性爱苦闷的女人们和只属于我的女神~ - 赤だし滑子
26   : 洛丽塔的甘美是由荷叶边和蝴蝶结组成的 ～直到异世界充满少女～ - 論理的楽観主義
27   : 很抱歉，矢代君。 - NJA/rKrf
28   : 怀旧的

In [None]:
# Print the list, prompt for one entry and update (download) that novel.
scraper.update_1_novel()

novels are as follows:
1    : 恐怖遊戲女僕生存記 - 김욤뇸(KimYomYum)
2    : 誘惑禁止条例　※但性犯罪受害不算违反条例 - 大中小太郎
3    : 在贞操逆转世界被单亲妈妈收养 - 九头龙八壹
4    : 為尋找對御宅族溫柔的辣妹而走遍三千里。初次交往的女朋友，竟然是我絕對不可能想到的淫娃辣妹。～如今每天沉浸在甜蜜做愛中被懇求授孕的日子～ - 孕間せん
5    : 西方岛Berde·冰雪边陲的女儿 - Frandica_Alanzo
6    : 不要走進那個夜晚 - 借鑒貳下,九月籠城
7    : 一覺醒來變成妻子的我居然還懷孕了 - 加布
8    : 在里世界获取我亲爱的私有物品 - 绯红树叶
9    : 在淫靡的洞窟深處 - 東雲マサキ
10   : 约会大作战：关于Bad End线的五河士道重生的那些事 - 虚无圣母
11   : 我對妹妹的感情，到底為何？ - 虚拟沉默（阿虚）
12   : 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 - 被輪巫女
13   : 特殊技能十分色情 - 被輪巫女
14   : 反逆的勇者〜套路垃圾異世界召喚與日本逆傳送〜 - 川崎悠
15   : 照顾家里蹲的妹妹，不知为何变成了中出美少女 - ドピュリスト村尾
16   : 这是一个关于利用时空跳跃努力改变同学残酷命运的故事。 - Madhatter
17   : 转生为将军的儿子，然后得到反派大小姐。 - さささのよし
18   : 異世界收藏家 - 星見宇佐
19   : 淫邪神的神子姬 ～转生后总是给神明背黑锅不知何时被当成邪神对待而且收到的全都是淫秽的祈祷，比起这个更重要的是为什么转生后是女孩子啊～ - v-sx
20   : 冰糖做的翅膀 - 英俊侠
21   : 魅魔的我 - 百合图书
22   : 哥哥、我那邊的毛已經剃掉了。 - 落花生
23   : 絲襪教in異世界 - 玲音
24   : 在異世界勇者背後暗中活躍的男人 - まじかり
25   : 屠杀鬼~发誓要复仇的野兽的绝伦性爱苦闷的女人们和只属于我的女神~ - 赤だし滑子
26   : 洛丽塔的甘美是由荷叶边和蝴蝶结组成的 ～直到异世界充满少女～ - 論理的楽観主義
27   : 很抱歉，矢代君。 - NJA/rKrf
28   : 怀旧的

2024-08-19 05:40:19,288 - [INFO] - download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 1 章 - 人物介紹（主要角色）
INFO:__main__:download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 1 章 - 人物介紹（主要角色）
2024-08-19 05:40:20,118 - [INFO] - download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 2 章 - 人物介紹（其他角色）
INFO:__main__:download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 2 章 - 人物介紹（其他角色）
2024-08-19 05:40:21,635 - [INFO] - download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 3 章 - 01.回憶1
INFO:__main__:download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 3 章 - 01.回憶1
2024-08-19 05:40:22,045 - [INFO] - download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 4 章 - 02.回憶2
INFO:__main__:download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 4 章 - 02.回憶2
2024-08-19 05:40:23,021 - [INFO] - download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 5 章 - 03.回憶3（雙視角）
INFO:__main__:download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 5 章 - 03.回憶3（雙視角）
2024-08-19 05:40:23,415 - [INFO] - download chapter success, 奇怪的我無法回頭～貞操逆轉世界的異樣家庭 : 第 6 章