In [1]:
%load_ext lab_black

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
from douban_exporter_lite.douban_exporter import DoubanExporter
from douban_exporter_lite.misc import HEADERS

In [4]:
# Visit the movie collection page first and keep the cookies it sets; they are
# sent along with the book page request in a later cell.
# An explicit timeout is passed because requests otherwise waits indefinitely,
# which would hang the kernel if the site stops responding.
cookies = requests.get(
    "https://movie.douban.com/people/ahbei/collect", headers=HEADERS, timeout=10
).cookies

In [5]:
# Fetch the book collection page, reusing the cookies obtained earlier.
url = "https://book.douban.com/people/ahbei/collect"
# The timeout keeps the kernel from hanging on an unresponsive server.
r = requests.get(url, cookies=cookies, headers=HEADERS, timeout=10)
# Fail here on an HTTP error status instead of parsing an error page and
# hitting a confusing failure in a later cell.
r.raise_for_status()

In [6]:
# Parse the response body into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=r.text, features="lxml")

In [7]:
# Collect every <li class="subject-item"> element; each one is one entry of the list.
book_items = soup.find_all("li", class_="subject-item")

In [8]:
# Use the first entry as the sample for the parsing steps below.
# Raises IndexError when the page yielded no entries.
item = book_items[0]

In [9]:
# Show the raw HTML of the sample entry to see which elements it contains.
item

<li class="subject-item">
<div class="pic">
<a class="nbg" href="https://book.douban.com/subject/26832406/" onclick="moreurl(this,{i:'0',query:'',subject_id:'26832406',from:'book_subject_search'})">
<img class="" src="https://img3.doubanio.com/view/subject/m/public/s28908982.jpg" width="90"/>
</a>
</div>
<div class="info">
<h2 class="">
<a href="https://book.douban.com/subject/26832406/" onclick="moreurl(this,{i:'0',query:'',subject_id:'26832406',from:'book_subject_search'})" title="Idea Makers">

    Idea Makers


    
      <span style="font-size:12px;"> : Personal Perspectives on the Lives &amp; Ideas of Some Notable People </span>
</a>
</h2>
<div class="pub">
        
  
  Stephen Wolfram / Wolfram Media, Inc. / 2016-7-7 / USD 22.95

      </div>
<div class="short-note">
<div>
<span class="rating5-t"></span>
<span class="date">2016-07-22
      读过</span>
</div>
<p class="comment">
      Steven Wolfram给Mathematica、NKS有个人或精神关系的前人写的学术传记和个人唁文，或者说着迷于"简单规则，复杂现象"的松散的一群人的精神族谱。每篇都像是NKS的软文，但这

In [10]:
# Field names of one exported record. The list order is also the order in
# which the values are emitted when the record is flattened into a list.
info_keys = [
    # Fields taken from the heading and the "pub" line of an entry.
    "title",
    "writer",
    "publishing_company",
    "publish_date",
    # Fields taken from the "short-note" part of an entry.
    "mark_date",
    "rating",
    "comment",
    "tags",
    # Link to the subject page of the entry.
    "douban_link",
]

In [11]:
# Start with every field present and set to None; later cells fill in what they find.
info_dict = {key: None for key in info_keys}

In [12]:
# The first <a> inside the entry carries the subject URL in its href attribute.
info_dict["douban_link"] = item.find("a")["href"]

In [13]:
# Text of the <h2> heading with only the outer whitespace trimmed; the
# whitespace between title and subtitle is still present at this point.
title = item.h2.get_text().strip()
title

'Idea Makers\n\n\n    \n       : Personal Perspectives on the Lives & Ideas of Some Notable People'

In [14]:
# When the heading contains a colon, split it on " : ", trim each part and
# re-join with ": " so the surrounding whitespace collapses; otherwise keep
# the heading text unchanged.
if ":" in title:
    info_dict["title"] = ": ".join(part.strip() for part in title.split(" : "))
else:
    info_dict["title"] = title

In [15]:
# The "pub" line packs several fields separated by " / "; split it and trim
# the whitespace around each field.
meta_data_list = [
    field.strip() for field in item.find("div", {"class": "pub"}).text.split(" / ")
]

In [16]:
# Inspect the fields extracted from the "pub" line.
meta_data_list

['Stephen Wolfram', 'Wolfram Media, Inc.', '2016-7-7', 'USD 22.95']

In [17]:
# Fields that begin with a digit are treated as publication date candidates.
# Slicing with [:1] instead of indexing [0] means an empty field is skipped
# rather than raising IndexError.
[meta_data for meta_data in meta_data_list if meta_data[:1].isdigit()]

['2016-7-7']

In [18]:
# Take the first field that begins with a digit as the publication date, or
# None when no field qualifies.
# Slicing with [:1] instead of indexing [0] means an empty field is skipped
# rather than raising IndexError.
publish_date = next(
    (meta_data for meta_data in meta_data_list if meta_data[:1].isdigit()), None
)
info_dict["publish_date"] = publish_date

In [19]:
# The publisher is taken to be the field immediately before the publication date.
# When the date is the first field there is nothing before it: index - 1 would
# be -1 and wrap around to the last field, so None is stored in that case.
# NOTE(review): none of the cells shown here assigns info_dict["writer"], so
# that field stays None.
date_index = meta_data_list.index(publish_date) if publish_date else 0
info_dict["publishing_company"] = (
    meta_data_list[date_index - 1] if date_index > 0 else None
)

In [20]:
# The date span holds the date on its first line, followed by further text
# on the next line; keep only what precedes the first newline.
date_text = item.find("span", class_="date").get_text()
info_dict["mark_date"] = date_text.partition("\n")[0]

In [21]:
# Elements that come before the date span inside the same parent; in the
# sample entry this is the span whose class name encodes the rating.
rating = item.find("span", class_="date").find_previous_siblings()
if rating:
    # The first CSS class of the first sibling returned is handed to the
    # exporter helper, which turns it into the stored rating value.
    info_dict["rating"] = DoubanExporter.get_rating(rating[0]["class"][0])

In [22]:
# The comment paragraph is optional; the field stays None when it is absent.
comment = item.find("p", class_="comment")
if comment:
    # Only the first child node of the paragraph is used, with whitespace trimmed.
    info_dict["comment"] = comment.contents[0].strip()

In [23]:
# The tags span is optional; the field stays None when it is absent.
tags = item.find("span", class_="tags")
if tags:
    # Drop the first three characters before trimming - presumably a fixed
    # label prefix in the span text; TODO confirm against the page markup.
    info_dict["tags"] = tags.get_text()[3:].strip()

In [24]:
# Review the assembled record for the sample entry.
info_dict

{'title': 'Idea Makers: Personal Perspectives on the Lives & Ideas of Some Notable People',
 'writer': None,
 'publishing_company': 'Wolfram Media, Inc.',
 'publish_date': '2016-7-7',
 'mark_date': '2016-07-22',
 'rating': 5,
 'comment': 'Steven Wolfram给Mathematica、NKS有个人或精神关系的前人写的学术传记和个人唁文，或者说着迷于"简单规则，复杂现象"的松散的一群人的精神族谱。每篇都像是NKS的软文，但这是他一辈子的主线，是诚恳的。',
 'tags': None,
 'douban_link': 'https://book.douban.com/subject/26832406/'}

In [25]:
# Flatten the record into a list of values ordered like info_keys.
list(map(info_dict.__getitem__, info_keys))

['Idea Makers: Personal Perspectives on the Lives & Ideas of Some Notable People',
 None,
 'Wolfram Media, Inc.',
 '2016-7-7',
 '2016-07-22',
 5,
 'Steven Wolfram给Mathematica、NKS有个人或精神关系的前人写的学术传记和个人唁文，或者说着迷于"简单规则，复杂现象"的松散的一群人的精神族谱。每篇都像是NKS的软文，但这是他一辈子的主线，是诚恳的。',
 None,
 'https://book.douban.com/subject/26832406/']