# Fiction Bot IV
### A notebook based automated fiction scraper + EPub generator
This notebook is able to scrape and download all chapters from a provided internet novel url (biquge.com.cn), then auto generate a well-formatted ePub ebook, with **Table Of Contents** of course!

In [1]:
from bs4 import BeautifulSoup
import requests
import os
import shutil

In [2]:
base_url = "xbiquge.so"
book_url = "https://www.xbiquge.so/book/49842/"
book_dir = "books/"
project_dir = os.getcwd()

In [3]:
project_dir

'D:\\Developer\\FictionBot-IV'

In [4]:
page = requests.get(book_url)
soup = BeautifulSoup(page.text, "html.parser")
book_title = soup.h1.text
author = soup.find("meta", attrs={"property":"og:novel:author"})['content']

In [5]:
class MenuInfo:
    def __init__(self, url, chapter_title):
        self.url = url
        self.chapter_title = chapter_title
        self.id = None
        self.epub_link = None
    
    def get_title(self):
        return self.chapter_title
    
    def get_url(self):
        return self.get_url
    
    def __str__(self):
        return f"{self.id}: {self.chapter_title} - {self.url} - {self.epub_link}"

In [6]:
def encodeXMLText(text):
    text = text.replace("&", "&amp;")
    text = text.replace("\"", "&quot;")
    text = text.replace("'", "&apos;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    return text

Fetch all HTML tags of the menu entries, store in `menu_raw`

In [7]:
menu_raw = soup.find_all('dd')
menu_raw = menu_raw[12:]

Then parse href and chapter titiles from each HTML tag, store the information in `menu_info`.

In [8]:
menu_info = []
for index, data in enumerate(menu_raw, 1):
    try:
        m = MenuInfo(url = data.a['href'], chapter_title = data.text)
        menu_info.append(m)
    except:
        pass

In [9]:
for c, d in enumerate(menu_info, 1):
    try:
        d.id = f'chapter_{c}'
        d.epub_link = f'contents/chapter_{c}.html'
    except:
        print(d)
    
for i in range(0, 3):
    print(str(menu_info[i]))

chapter_1: 第一章 多出的24小时 - 31348841.html - contents/chapter_1.html
chapter_2: 第二章 正态分布选书法 - 31348842.html - contents/chapter_2.html
chapter_3: 第三章 静止的世界 - 31348843.html - contents/chapter_3.html


=======================================================================

=======================================================================

### Create Folder Structure for EPub
A sample EPUB archive structure may loook like this
```
mimetype
META-INF/
   container.xml
OEBPS/
  content.opf
  title.html
  contents/
      content.html
  stylesheet.css
  toc.ncx
  images/
     cover.png
```

In [10]:
output_dir = f"./{book_dir}{book_title}"

In [11]:
try:
    os.mkdir(output_dir)
except:
    shutil.rmtree(f"{output_dir}/")
    os.mkdir(output_dir)
    print(f"Cannot find directory {output_dir}, current dir is: {os.getcwd()}")
    pass

try:
    os.mkdir(f"{output_dir}/META-INF")
    os.mkdir(f"{output_dir}/OEBPS")
    os.mkdir(f"{output_dir}/OEBPS/images")
    os.mkdir(f"{output_dir}/OEBPS/contents")
except:
    print(f"Cannot find directory {output_dir}, current dir is: {os.getcwd()}")
    pass

Cannot find directory ./books/我的一天有48小时, current dir is: D:\Developer\FictionBot-IV


### Step 1: Create `mimetype`

Write `application/epub+zip` to the mimetype file

In [12]:
with open(f"{output_dir}/mimetype", "w") as tmp:
    tmp.write("application/epub+zip")

### Step 2: Write below contents to `META-INF/container.xml`


In [13]:
container_xml_content ='''<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf"
     media-type="application/oebps-package+xml" />
  </rootfiles>
</container>
'''

In [14]:
with open(f"{output_dir}/META-INF/container.xml", "w", encoding="utf-8") as container:
    container.write(container_xml_content)

### Step 3: Create OPF content with meta data

In [15]:
import uuid

unique_id = str(uuid.uuid1())

unique_id

'81a0dabd-24df-11ed-97a7-10a51d0178df'

In [16]:
opfcontent = f'''<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf" xmlns:svg="http://www.w3.org/2000/svg" 
            unique-identifier="{unique_id}" version="2.0">   
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
%(metadata)s
        <meta name="cover" content="cover-image"/>
    </metadata>
    <manifest>
        <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
        <item id="cover" href="title.html" media-type="application/xhtml+xml"/>
        <item id="cover-image" href="images/cover.jpg" media-type="image/jpeg"/>
%(contents)s
    </manifest>
    <spine toc="ncx">
        <itemref idref="cover" linear="no"/>
%(ncx)s
    </spine>
    <guide>
        <reference href="title.html" type="cover" title="Cover"/>
    </guide>
    
</package>
'''

dc = '\t\t<dc:%(tag)s>%(value)s</dc:%(tag)s>'
dc_id = '\t\t<dc:%(tag)s id="%(id)s" opf:scheme="uuid">%(value)s</dc:%(tag)s>'
item = "\t\t<item id='%(id)s' href='%(epub_href)s' media-type='application/xhtml+xml'/>"
itemref = "\t\t<itemref idref='%(id)s'/>"
metadata = '\n'.join([
    dc % {'tag': 'title', 'value': encodeXMLText(book_title)},
    dc % {'tag': 'creator', 'value': encodeXMLText(author)},
    dc % {'tag': 'language', 'value': "zh"},
    dc_id % {'tag': 'identifier', 'value': encodeXMLText(unique_id), 'id': 'bookid'},
    dc % {'tag': 'description', 'value': "本文档由 Fiction Bot IV 生成。脚本作者 Jack Zhao"},
])

In [17]:
manifest = []
ncx = []
# navpoints = []
for m in menu_info:
    manifest.append(item % {'id': encodeXMLText(m.id), 'epub_href': encodeXMLText(m.epub_link)})
    ncx.append(itemref % {'id': encodeXMLText(m.id)})
#     navpoints.append(navpoint % (m.epub_link, m.id, m.chapter_title, m.epub_link))
print(manifest[0])
print(ncx[0])

		<item id='chapter_1' href='contents/chapter_1.html' media-type='application/xhtml+xml'/>
		<itemref idref='chapter_1'/>


In [18]:
with open(f"{output_dir}/OEBPS/content.opf", "w", encoding="utf-8") as container:
    container.write(opfcontent % {
        'metadata': metadata,
        'contents': '\n'.join(manifest),
        'ncx': '\n'.join(ncx),
    })

### Step 4: NCX content

In [19]:
ncxcontent = f'''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx version="2005-1" xmlns="http://www.daisy.org/z3986/2005/ncx/">
    <head>
      <meta name="dtb:uid" content="{encodeXMLText(unique_id)}"/>
      <meta name="dtb:depth" content="1"/>
      <meta name="dtb:totalPageCount" content="0"/>
      <meta name="dtb:maxPageNumber" content="0"/>
    </head>
    <docTitle>
        <text>%(title)s</text>
    </docTitle>
    <docAuthor>
        <text>%(creator)s</text>
    </docAuthor>
    <navMap>
%(navpoints)s
    </navMap>
</ncx>
'''
navpoint = '''\t\t<navPoint id="%s" playOrder="%d">
\t\t    <navLabel>
\t\t        <text>%s</text>
\t\t    </navLabel>
\t\t    <content src="%s"/>
\t\t</navPoint>'''

In [20]:
navpoints = []
navpoints.append(navpoint % ("cover_page", 1, "封面", "title.html"))
for i, m in enumerate(menu_info,2):
    navpoints.append(navpoint % (encodeXMLText(m.id), i, encodeXMLText(m.chapter_title), encodeXMLText(m.epub_link)))

In [21]:
with open(f"{output_dir}/OEBPS/toc.ncx", "w", encoding="utf-8") as container:
    container.write(ncxcontent % {
        'title': book_title,
        'creator': author,
        'navpoints': '\n'.join(navpoints)
    })

### Save cover image

In [22]:
cover_img = soup.find("img")
if cover_img:
    cover_img = cover_img['src']
    img = requests.get(cover_img, stream=True)
    if img.status_code == 200:
        try:
            with open(f"{output_dir}/OEBPS/images/cover.jpg", "wb") as f:
                shutil.copyfileobj(img.raw, f)
        except:
            print("Image failed")
    del img
else:
    print("No cover image was found")

### Write `title.html`


In [23]:
title_html=f'''<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>{book_title}</title>
    <style type="text/css">
      h2 \u007btext-align: center;\u007d
      p \u007btext-align: center;\u007d
      div \u007btext-align: center;\u007d
    </style>
  </head>
  <body>
    <div><img id="cover-image" src="images/cover.jpg" alt="Cover Page"/></div>
    <h2>{encodeXMLText(book_title)}</h2>
    <p>{encodeXMLText(author)}</p>
  </body>
</html>'''

with open(f"{output_dir}/OEBPS/title.html", "w", encoding="utf-8") as writer:
    writer.write(title_html)

## Download!

In [24]:
os.getcwd()

'D:\\Developer\\FictionBot-IV'

In [25]:
template = '''<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>%(title)s</title>
</head>
<body>
<h2>%(title)s</h2>
<div>
%(content)s
</div>
</body>
</html>
'''

In [26]:
from tqdm import tqdm

for  ch in tqdm(menu_info):
    t = ch.epub_link
#     print("正在下载：" + t)
    source = requests.get(book_url + ch.url)
    soup = BeautifulSoup(source.text, "html.parser")
    sentences = soup.find("div", attrs={'id':'content'}).findAll(text=True)
    contents = []
    for s in sentences:
        tmp = s.replace('\xa0', '')
        contents.append(f'<p>{encodeXMLText(tmp)}</p>')
        
    # Remove ads from the first row in contents
    if base_url in contents[0]:
        contents = contents[1:]
    
    # Write data into an html file
    with open(f'{output_dir}/OEBPS/{t}', 'w', encoding="utf-8") as f:
        f.write(template % {
            'title': encodeXMLText(ch.chapter_title),
            'content': '\n'.join(contents)
        })

100%|██████████████████████████████████████████████████████████████████████████████| 1455/1455 [02:12<00:00, 10.97it/s]


## Pack the EPub Book!

In this step, we will zip the folder then turn it into a \*.epub package.

In [27]:
# Collect all files in the folder
print(project_dir)
print(output_dir)
try:
    os.chdir(output_dir)
except:
    pass

D:\Developer\FictionBot-IV
./books/我的一天有48小时


In [28]:
file_paths = []
for root, directories, files in os.walk('.'): 
    for filename in files: 
        # join the two strings in order to form the full filepath. 
        filepath = os.path.join(root, filename) 
        file_paths.append(filepath)

In [29]:
from zipfile import ZipFile
with ZipFile(f"../{book_title}.epub", "w") as z:
    for f in file_paths:
        z.write(f)
        
print(f"Congratulations, {book_title}.epub has been freshly made!")

Congratulations, 我的一天有48小时.epub has been freshly made!


In [30]:
os.chdir(project_dir)

*Reference: https://www.cnblogs.com/linlf03/archive/2011/12/15/2285953.html*
## The End