In [1]:
import requests
import unicodedata
from bs4 import BeautifulSoup, NavigableString
from fake_useragent import UserAgent
import time
import random


def convert_fullwidth_to_halfwidth(text):
    """Return ``text`` after Unicode NFKC normalization.

    NFKC replaces compatibility characters with their canonical equivalents,
    which is what folds full-width letters, digits and punctuation into their
    half-width forms.
    """
    normalized = unicodedata.normalize('NFKC', text)
    return normalized


def get_page_info(url, timeout=10):
    """Download ``url`` and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (connection error, timeout or HTTP error status). The error is
        printed in that case.
    """
    try:
        # A randomly chosen User-Agent string is sent with every request.
        headers = {'User-Agent': UserAgent().random}
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # The parser receives the raw bytes and detects the encoding itself.
        # Assigning response.encoding would have no effect here, because that
        # attribute only influences response.text, never response.content.
        return BeautifulSoup(response.content, 'html.parser')

    except requests.RequestException as e:
        print(f"Error: {e}")
        return None


def extract_novel_info(li_elements):
    """Collect chapter titles and links from parsed list items.

    Args:
        li_elements: Iterable of parsed elements; each one is searched for
            its first ``<a>`` tag.

    Returns:
        A list of ``{"title": ..., "href": ...}`` dicts, one for every
        element that has an ``<a>`` tag, non-empty text and a link other
        than ``"#"``. Titles are converted from full-width to half-width
        characters.
    """
    novel_info = []

    for element in li_elements:
        a_tag = element.find('a')
        if a_tag is None:
            continue

        # The title is the text of the whole element, not only of the link.
        title = element.get_text(strip=True)
        # .get() instead of ['href']: an anchor without an href attribute is
        # skipped rather than aborting the whole scan with a KeyError.
        href = a_tag.get('href')
        if title and href and href != "#":
            novel_info.append({"title": convert_fullwidth_to_halfwidth(title), "href": href})

    return novel_info


def cleanup(html_content):
    """Flatten a parsed element into plain text.

    Text fragments are joined with newlines and surrounding whitespace is
    stripped. ``None`` (no element) yields an empty string.
    """
    if html_content is not None:
        joined_text = html_content.get_text(separator='\n')
        return joined_text.strip()
    return ""


def remove_before_second_occurrence(text, substring):
    """Trim ``text`` so that it starts at the second occurrence of ``substring``.

    Fallbacks: when ``substring`` occurs only once the result starts at that
    single occurrence; when it does not occur at all ``text`` is returned
    unchanged. The search for the second occurrence begins one character
    after the first, so overlapping matches count.
    """
    first = text.find(substring)
    if first < 0:
        return text

    second = text.find(substring, first + 1)
    start = first if second < 0 else second
    return text[start:]


def extract_content(novel_info):
    """Download and clean the text of every chapter in ``novel_info``.

    Args:
        novel_info: Sized sequence of dicts with ``"title"`` and ``"href"``
            keys; ``"href"`` is the URL that gets fetched.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts in input order.
        A chapter whose page could not be fetched is kept with empty content,
        so one failed request no longer aborts the crawl and throws away the
        chapters already downloaded.
    """
    novel_content = []
    total = len(novel_info)

    for index, info in enumerate(novel_info, start=1):
        soup = get_page_info(info["href"])

        if soup is None:
            # get_page_info() has already printed the request error.
            text = ""
            status = "failed"
        else:
            text = cleanup(soup.find(class_="txtnav"))
            # Keep the text from the second pair of em spaces (U+2003)
            # onwards, then drop the em-space pairs themselves.
            text = remove_before_second_occurrence(text, "\u2003\u2003")
            text = text.replace("\u2003\u2003", "")
            text = text.replace("\n\n", "\n")
            status = "finished"

        novel_content.append({"title": info["title"], "content": text})
        print(info["title"], f"---({index}/{total}){status}")

        # Random pause between requests (presumably to avoid hammering the site).
        time.sleep(random.uniform(0.5, 1.5))

    return novel_content


def generate_txt(filename, novel_content):
    """Append all chapters to ``<filename>.txt`` as UTF-8 text.

    Each chapter contributes two lines: its title, then its content. The
    file is opened in append mode, so existing content is preserved and
    repeated calls keep adding to the same file.
    """
    target_path = filename + ".txt"
    with open(target_path, "a", encoding='utf-8') as out:
        for chapter in novel_content:
            out.write(chapter["title"] + "\n")
            out.write(chapter["content"] + "\n")

In [2]:
# Table-of-contents page of the novel and the base name of the output file.
url = 'https://www.69shu.pro/book/53686/'
filename = "只想让玩家省钱的我却被氪成首富"

soup = get_page_info(url)
# get_page_info() returns None when the request fails. Stop here with a clear
# message instead of reporting success and then crashing on None.find_all.
if soup is None:
    raise RuntimeError(f"failed to fetch the table of contents: {url}")
print("get page info successful")

# Every <li> on the page is a candidate chapter entry; extract_novel_info()
# keeps only those that carry a usable link.
lists = soup.find_all("li")
novel_infos = extract_novel_info(lists)
print("get novel info successful")


get page info successful
get novel info successful


In [None]:
# Display the collected chapter entries (dicts with "title" and "href").
novel_infos

In [None]:

# Download every chapter listed in novel_infos (one request per entry, with a
# pause between requests), then append all of them to "<filename>.txt".
novel_contents = extract_content(novel_infos)
print("get page content successful")
generate_txt(filename, novel_contents)
print("generate txt successful")


In [3]:
# Fetch one individual page and display the parsed document for inspection.
# NOTE: this rebinds `soup`, which until this cell held the table-of-contents
# page fetched earlier in the notebook.
urls = 'https://www.69shu.pro/txt/53686/34787587'
soup = get_page_info(urls)
soup

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0, viewport-fit=cover" name="viewport">
<title>只想让玩家省钱的我却被氪成首富-第43章 游戏首周流水出炉！-69书吧</title>
<meta content="正文 第43章 游戏首周流水出炉！,只想让玩家省钱的我却被氪成首富最新章节,只想让玩家省钱的我却被氪成首富在线阅读" name="keywords"/>
<meta content="只想让玩家省钱的我却被氪成首富最新章节无错/无弹窗广告阅读" name="description"/>
<link href="https://www.69shu.pro/txt/53686/34787588" rel="prefetch"/>
<link href="https://cdn.shucdn.com/css/yuedu.css" rel="stylesheet" type="text/css"/>
<link href="https://cdn.shucdn.com/favicon.ico" rel="shortcut icon">
<meta content="all" name="robots"/> <script src="https://cdn.shucdn.com/js/202403/ad.js"></script>
<script>
        var bookinfo = {
            pageType: 3,
            pageVer: '202308002',
            siteName: '69书吧',
            site: 'https://www.69shu.pro',
            articleid: '53686',
            chapterid: '34787587',
            articlename: '只想让玩家省钱的我却被氪成首富',
            chaptername: '第43章 游戏首周流水出炉！',


In [4]:
# Run the first extraction steps of extract_content() by hand on the current
# `soup`: locate the element with class "txtnav", flatten it to plain text,
# then keep the text from the second pair of em spaces (U+2003) onwards.
text = soup.find(class_="txtnav")
text = cleanup(text)
text = remove_before_second_occurrence(text, "\u2003\u2003")
text

'\u2003\u2003第43章 游戏首周流水出炉！\n\n\r\n\u2003\u2003氪佬们最开始想的办法，是想找一个比较高明的黑客，入侵一下头号玩家公司的后台端口，然后改写充值界面的程序，直接从根源解决问题。\n\n\r\n\u2003\u2003不过由于这种行为已经足够触碰到违法的门槛了，氪佬们也觉得太过于卑鄙。\n\n\r\n\u2003\u2003毕竟，《天际线》这游戏的发酵到目前口碑还是不错的。\n\n\r\n\u2003\u2003有人肯做好游戏，这点无论是平民还是氪佬都不会有非议。\n\n\r\n\u2003\u2003“……跑了。”刻竣顿时磕磕绊绊，辩解似得伸手比比划划，顾左右而言他：“但……跑了……不代表我没钓到过啊。”\n\n\r\n\u2003\u2003刻晋知道。\n\n\r\n\u2003\u2003好在结果是喜人的。\n\u2003\u2003\n\r\n\u2003\u2003上架第一周的流水。\n\n\r\n\u2003\u2003但很快，这个方法也败下阵来。\n\n\r\n\u2003\u2003虽然肯定不及直接开放充值界面赚的多，但也算是浅浅弥补了一下这游戏目前的0收益。\n\n\r\n\u2003\u2003“那就上饿扁了点一条送过来！”刻竣悲愤道。\n\n\r\n\u2003\u2003一看自己老婆要生气，刻竣急忙轻咳一声，不打算触其霉头，一边端起茶水轻抿，一边把话茬抛给了刻雨：“小雨，你想去哪里？”\n\n\r\n\u2003\u2003B计划失败，C计划就会跟着失败。\n\n\r\n\u2003\u2003在临时创建的圣战小组里面，有懂行的专业人士在尝试了几番之后，把失败的截图发到群里。\n\n\r\n\u2003\u2003虽然说，在蓝星上游戏法跟常规的民法刑法不同，它更像是一个维护市场环境的东西，更类似于校规，公司规定这类，没有绝对的权威性，由蓝星特设的部门游戏监管局把控。\n\n\r\n\u2003\u2003“你这不还是吃。”刻竣很无奈的扶了扶眼镜，明显不太能理解女人的脑回路。\n\n\r\n\u2003\u2003同时也侧面检验了自己这一波计划的无懈可击。\n\n\r\n\u2003\u2003动用了最后一种方法。\n\n\r\n\u2003\u2003作为集家中千万宠爱于一身的小公主。\n\n\r\n\

In [None]:
# Dump the extracted text to a scratch file for inspection. `text` is the
# value produced by the manual extraction cell; no variable named `texts` is
# assigned anywhere in the notebook, so it raised NameError on a fresh kernel.
# Mode "a" appends, so repeated runs accumulate in test.txt.
with open("test.txt", "a", encoding='utf-8') as file:
    file.write(text)

In [None]:
def remove_before_second_occurrence(text, substring):
    """Drop everything that precedes the FIRST occurrence of ``substring``.

    NOTE: running this cell replaces the function of the same name defined
    in the first cell. Despite the name, this variant cuts at the first
    occurrence, not the second, so extract_content() behaves differently
    once this cell has been executed.

    Returns:
        ``text`` from the first occurrence of ``substring`` onwards, or
        ``text`` unchanged when ``substring`` does not occur. Returning the
        input (rather than an error message string) matches the first
        definition and keeps a diagnostic from being stored as content.
    """
    first_occurrence = text.find(substring)
    if first_occurrence == -1:
        return text

    return text[first_occurrence:]

In [None]:
# Try a more specific chapter-start marker on the extracted text. `text`
# comes from the manual extraction cell; `texts` is not defined anywhere in
# the notebook.
remove_before_second_occurrence(text, "\u2003\u2003\t\t第")