-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlpaser_item_to_information.py
57 lines (47 loc) · 1.91 KB
/
htmlpaser_item_to_information.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
import re
from lxml import etree
from tables import TVdrama
from html_downloader import HtmlDownloader
def _first_year(label, text):
    """Return the first 4-digit year that follows *label* in *text*, or None.

    Optional whitespace between the label and the digits is skipped.
    """
    match = re.search(label + r"\s*?(\d{4})", text)
    return int(match.group(1)) if match else None


def Main(keyword):
    """Scrape the Baidu Baike entry for *keyword* into a ``TVdrama`` record.

    The page's "basic info" box is flattened to plain text and the fields
    are pulled out of that text with regular expressions.

    Args:
        keyword: Entry path below ``https://baike.baidu.com/item/``,
            e.g. ``'白狼/231469'`` (title, optionally followed by ``/<id>``).

    Returns:
        A ``TVdrama`` with ``url``, ``name`` and ``area`` always assigned.
        ``original_release_year`` and ``production_year`` are assigned only
        when the corresponding year is found in the info box.

    Raises:
        Whatever ``HtmlDownloader.download`` or the parsing steps raise;
        nothing is caught or translated here.
    """
    url = 'https://baike.baidu.com/item/' + urllib.parse.quote(keyword)
    # The downloader's result must expose ``.xpath`` (presumably an lxml
    # tree -- confirm against HtmlDownloader).
    html = HtmlDownloader().download(url)
    information_list = html.xpath('//div[contains(@class,"basic-info cmn-clearfix")]//text()')
    information_str = ''.join(information_list)

    t = TVdrama()

    # NOTE(review): the ``url`` field stores the keyword, not the full URL
    # built above -- confirm this is intended.
    t.url = keyword

    # Title: the text before the "/" in the keyword; fall back to the whole
    # keyword when there is no "/" (or nothing precedes it).
    keyword_name = ''.join(re.findall(r"(.*?)\/", keyword))
    t.name = keyword_name or keyword

    # Production region ("制片地区"): the rest of the line after the label.
    t.area = ''.join(re.findall(r"制片地区\s*(.*)\s", information_str))

    # Original release year: prefer the premiere date ("首播时间"), then fall
    # back to the broadcast date ("播出时间"). Left unassigned if neither exists.
    release_year = _first_year("首播时间", information_str)
    if release_year is None:
        release_year = _first_year("播出时间", information_str)
    if release_year is not None:
        t.original_release_year = release_year

    # Production year ("出品时间"). Left unassigned if not present.
    production_year = _first_year("出品时间", information_str)
    if production_year is not None:
        t.production_year = production_year

    return t
# Debug entry point: scrape one sample entry and print the parsed fields.
if __name__ == '__main__':
    keyword = '白狼/231469'
    result = Main(keyword)
    summary = '结果是:' + result.name
    print(summary, result.original_release_year, result.production_year)