In [1]:
# !conda install beautifulsoup4

In [2]:
# !conda install urllib3

In [3]:
import urllib3
from bs4 import BeautifulSoup
import re

In [4]:
url = "https://www.gsb.stanford.edu/business-podcasts/think-fast-talk-smart-podcast"

# 获取TFTS主页
http = urllib3.PoolManager()
r = http.request('GET', url)
print("获取TFTS主页状态：", r.status)

# 解析主页
soup = BeautifulSoup(r.data, 'html.parser')
soup.title.string

# 定位大体位置
target_pos = soup.find('div', class_='views-field views-field-title')

# 解析标题和网址
the_episode_title = str(target_pos.a.string)
the_episode_url = "https://www.gsb.stanford.edu/" + str(target_pos.a["href"])
print("文章标题：", the_episode_title)
print("文章网址：", the_episode_url)

获取TFTS主页状态： 200
文章标题： Quick Thinks: AI Has Entered the Chat — a “Conversation” with ChatGPT
文章网址： https://www.gsb.stanford.edu//insights/quick-thinks-ai-has-entered-chat-conversation-chatgpt


In [5]:
# 获取文章页面
page = http.request("GET", the_episode_url)
print("获取文章页面状态：", page.status)

# 解析文章页面
page_soup = BeautifulSoup(page.data)

# 定位总体位置
pos = page_soup.find("div", class_='announcement-stories__wrapper-information')

获取文章页面状态： 200


In [6]:
# 获取标题
title_pos = pos.find('h1', class_='heading has-icon icon-audio-before has-key-taxonomy-above')
title = str(title_pos.string.strip())

# 获取总结
summary_pos = pos.find('p', class_='intro')
summary = str( summary_pos.string.strip() )

# 获取时间和作者列表
infor_pos = pos.find('div', class_='author-info-wrapper')
infor_time = str(infor_pos.p.string) 
infor_author = [str(x.string) for x in infor_pos.find_all('span', class_='author')]

# 获取介绍和正文
content_pos = pos.find('div', class_='announcement-stories__idea-story-description as-description')
intro_pos, conver_pos = content_pos.find_all('div', class_='field__item field--item-text_block')

intro = ''.join( map(str, intro_pos.find_all('p')) )
intro = re.sub('<p.*?>', '', str(intro))
intro = re.sub('</p>', '\n\n', intro)

conver = ''.join( map(str, conver_pos.find_all("p")) )
conver = re.sub('<p.*?>', '', str(conver))
conver = re.sub('</p>', '\n\n', conver)

print("获取标题、总结、时间、作者、介绍以及正文完成！等待写入tex文件")

获取标题、总结、时间、作者、介绍以及正文完成！等待写入tex文件


In [7]:
# tex文件模板
temple = """\\documentclass{article}
\\usepackage[UTF8]{ctex}
\\usepackage{hyperref}

\\linespread{1.3}
\\setlength{\parskip}{0.5em}
\\setlength{\parindent}{0em}

\\begin{document}

\\section*{%s}

%s

\\begin{flushright}
    \\textit{%s}
\\end{flushright}

%s

\\end{document}
"""

# 格式化内容并写入subscript.tex
infor = infor_time + ' | by ' + ' '.join(infor_author)

content = intro +'''\\vspace{1em}
\\large{\\textbf{TRANSCRIPT}}
\\vspace{1em}
\n\n
''' +conver
content = re.sub("<em>(.*?)</em>", "\\\\textsl{\\1}", content)
content = re.sub("<strong>(.*?)</strong>", "\\\\textbf{\\1}", content)
content = re.sub('<a href=\"(.*?)\">(.*?)</a>', "\\\\href{\\1}{\\2}", content)

tex = temple % (title, summary, infor, content)

with open("./subscript.tex", "w+", encoding="utf-8") as f:
    f.write(tex)
    
print("将内容写入tex文件成功！等待编译pdf")

将内容写入tex文件成功！等待编译pdf


In [8]:
!xelatex subscript.tex
print("编译subscript.pdf成功！等待发送邮箱")

This is XeTeX, Version 3.14159265-2.6-0.99998 (TeX Live 2017/W32TeX) (preloaded format=xelatex)
 restricted \write18 enabled.
entering extended mode
(./subscript.tex
LaTeX2e <2017-04-15>
Babel <3.10> and hyphenation patterns for 6 language(s) loaded.
(d:/App/install/texlive2017/2017/texmf-dist/tex/latex/base/article.cls
Document Class: article 2014/09/29 v1.4h Standard LaTeX document class
(d:/App/install/texlive2017/2017/texmf-dist/tex/latex/base/size10.clo))
(d:/App/install/texlive2017/2017/texmf-dist/tex/latex/ctex/ctex.sty
(d:/App/install/texlive2017/2017/texmf-dist/tex/latex/l3kernel/expl3.sty
(d:/App/install/texlive2017/2017/texmf-dist/tex/latex/l3kernel/expl3-code.tex)
(d:/App/install/texlive2017/2017/texmf-dist/tex/latex/l3kernel/l3xdvipdfmx.def)
)
(d:/App/install/texlive2017/2017/texmf-dist/tex/latex/l3packages/xparse/xparse.
sty)
(d:/App/install/texlive2017/2017/texmf-dist/tex/latex/l3packages/l3keys2e/l3key
s2e.sty)
(d:/App/install/texlive2017/2017/texmf-dist/tex/latex/ctex/