In [3]:
import random
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, NavigableString
from fake_useragent import UserAgent


def get_page_info(url, timeout=10):
    """Download ``url`` and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (connection error, timeout, or an HTTP error status). The error
        is printed in that case.
    """
    try:
        # A random User-Agent is sent with every request.
        headers = {'User-Agent': UserAgent().random}
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error: {e}")
        return None

    # The raw bytes are handed to the parser, which detects the character
    # encoding itself; setting ``response.encoding`` would have no effect here
    # because it only influences ``response.text``.
    return BeautifulSoup(response.content, 'html.parser')


def extract_novel_info(url, li_elements):
    """Turn chapter-list ``<li>`` elements into title/link records.

    Args:
        url: Base URL that the chapter links are resolved against.
        li_elements: Iterable of elements exposing ``find('a')`` and
            ``get_text(strip=True)`` (e.g. BeautifulSoup tags).

    Returns:
        A list of ``{"title": ..., "href": ...}`` dicts, one per element that
        contains a link. Elements without an ``<a>`` (such as section
        headers) or whose ``<a>`` has no ``href`` are skipped.
    """
    novel_info = []

    for element in li_elements:
        a_tag = element.find('a')
        if a_tag is None:
            continue

        # .get avoids a KeyError on anchors that carry no href attribute.
        href = a_tag.get('href')
        if not href:
            continue

        novel_info.append({
            "title": element.get_text(strip=True),
            # urljoin resolves root-relative, page-relative and absolute
            # hrefs correctly; plain concatenation only handles the first.
            "href": urljoin(url, href),
        })

    return novel_info


def cleanup(html_content):
    """Return the text of ``html_content`` with surrounding whitespace removed.

    The text fragments of the node are joined with newlines before the result
    is trimmed at both ends.
    """
    raw_text = html_content.get_text(separator='\n')
    return raw_text.strip()


def extract_content(novel_info):
    """Download the text of every chapter listed in ``novel_info``.

    Args:
        novel_info: Iterable of dicts with ``"title"`` and ``"href"`` keys.
            Entries where either value is empty are ignored.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts for the chapters
        that were fetched successfully. Chapters whose page could not be
        downloaded, or whose page has no element with class ``content``, are
        reported and skipped so that one failure does not abort the whole run.
    """
    novel_content = []
    for info in novel_info:
        if not (info["title"] and info["href"]):
            continue

        page = get_page_info(info["href"])
        # get_page_info returns None on a failed request, and find() returns
        # None when the page lacks the expected content node.
        body = page.find(class_="content") if page is not None else None

        if body is None:
            print(info["title"], "---skipped (content not found)")
        else:
            novel_content.append({"title": info["title"], "content": cleanup(body)})
            print(info["title"], "---finished")

        # Pause a random 1-3 seconds after every request, successful or not.
        time.sleep(random.uniform(1, 3))
    return novel_content


def generate_txt(filename, novel_content):
    """Append all chapters to ``<filename>.txt`` as UTF-8 text.

    For each entry the title and then the content are written, each preceded
    by a newline. The file is opened in append mode, so existing content is
    kept.
    """
    target = filename + ".txt"
    with open(target, "a", encoding='utf-8') as out:
        for chapter in novel_content:
            for key in ("title", "content"):
                out.write("\n" + chapter[key])

In [4]:
# Chapter index page of the novel and the base name of the output file.
url = 'https://www.5200xiaoshuo.com/38_38251/all.html'
filename = "从前有座灵剑山"

soup = get_page_info(url)
# get_page_info returns None when the request fails; stop here instead of
# reporting success and failing later with an unrelated AttributeError.
if soup is None:
    raise RuntimeError(f"could not fetch chapter index: {url}")
print("get page info successful")



get page info successful


In [8]:
# Echo the parsed index page so its markup can be inspected.
soup

<!DOCTYPE html>

<html lang="zh-cmn-Hans">
<head>
<meta charset="utf-8"/>
<title>从前有座灵剑山最新章节-国王陛下-从前有座灵剑山txt下载_小说5200阅读网</title>
<meta content="从前有座灵剑山,从前有座灵剑山最新章节" name="keywords"/>
<meta content="从前有座灵剑山最新章节由网友提供，《从前有座灵剑山》情节跌宕起伏、扣人心弦，是一本情节与文笔俱佳的，小说5200阅读网免费提供从前有座灵剑山最新清爽干净的文字章节在线阅读。" name="description"/>
<meta content="novel" property="og:type"/>
<meta content="从前有座灵剑山" property="og:title"/>
<meta content="灵剑派成立于九州历四二三三年，几千年来始终致力于为行业提供一流的修仙人才，如今位列万仙盟五大超品宗派之一，掌门风吟真人担任万仙盟七大常务长老，修为盖世。灵剑派坚持和平与发展的主题，门派核心价值理念是求真、求善、求种。为进一步扩充门派力量，补充新鲜血液，拟于近期召开升仙大会，诚邀各路精英前来。
还是原来的节操
还是一样的搞笑" property="og:description"/>
<meta content="http://www.5200xiaoshuo.com/cover/38/38251/38251s.jpg" property="og:image"/>
<meta content="http://www.5200xiaoshuo.com/38_38251/info.html" property="og:url"/>
<meta content="穿越" property="og:novel:category"/>
<meta content="国王陛下" property="og:novel:author"/>
<meta content="从前有座灵剑山" property="og:novel:book_name"/>
<meta content="http://www.5200xiaoshuo.com/38_38251/all.html" prop

In [10]:
# Alternative anchor, kept for reference:
#   soup.find('li', class_='col3', string='正文卷')
# The <li> whose text is exactly '全部章节' is used as the starting point.
marker = soup.find('li', string='全部章节')

# Every <li> sibling that follows the marker; empty when the marker is absent.
desired_list = list(marker.find_next_siblings('li')) if marker else []

lists = desired_list
lists

[<li class="col3">升仙大会</li>,
 <li class="col3"><a href="/38_38251/11222451.html">序幕：天外飞仙＋第一章：客栈柴房温暖如春</a></li>,
 <li class="col3"><a href="/38_38251/11222452.html">第二章：来自家乡的土特产</a></li>,
 <li class="col3"><a href="/38_38251/11222453.html">第三章　我的灵根大如萝卜</a></li>,
 <li class="col3"><a href="/38_38251/11222454.html">第四章：老板娘的萝卜</a></li>,
 <li class="col3"><a href="/38_38251/11222455.html">第五章：逆袭的师弟</a></li>,
 <li class="col3"><a href="/38_38251/11222456.html">第六章：倒贴的小海</a></li>,
 <li class="col3"><a href="/38_38251/11222457.html">第七章：这样的弟子才不要</a></li>,
 <li class="col3"><a href="/38_38251/11222458.html">第八章：师弟你是了解我的</a></li>,
 <li class="col3"><a href="/38_38251/11222459.html">第九章：德智体全面发展的优秀弟子</a></li>,
 <li class="col3"><a href="/38_38251/11222460.html">第十章：信不信老子举报你？</a></li>,
 <li class="col3"><a href="/38_38251/11222461.html">第十一章：玩坏了也不心疼啊</a></li>,
 <li class="col3"><a href="/38_38251/11222462.html">第十二章：干奶奶和干奶奶二号之间的修罗场</a></li>,
 <li class="col3"><a href="/38_38251/11222463.html">第十三章：

In [None]:
# Collect a title/link record for every <li> gathered above that has a link.
novel_infos = extract_novel_info("https://www.5200xiaoshuo.com", lists)
print("get novel info successful")
# Download the chapters one by one (one request per chapter, with a pause
# between requests, so this step is slow for long novels).
novel_contents = extract_content(novel_infos)
print("get page content successful")
# Append the downloaded chapters to "<filename>.txt".
generate_txt(filename, novel_contents)
print("generate txt successful")