# 一、初见网络爬虫

## 1.1 网络连接

In [2]:
from urllib.request import urlopen

# Fetch the sample page and print its raw body (a bytes object).
# The `with` block closes the HTTP response as soon as the body has been
# read, instead of leaving the connection open until garbage collection.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


## 1.2 BeautifulSoup简介

### 1.2.1 BeautifulSoup基础使用

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the sample page and parse it with the built-in 'html.parser'.
# The `with` block closes the HTTP response once its body has been read.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Reach the heading through four different attribute paths; for this page
# each of them prints the same <h1> markup.
print(f"{bs.h1=}")
print(f"{bs.html.body.h1=}")
print(f"{bs.body.h1=}")
print(f"{bs.html.h1=}")

bs.h1=<h1>An Interesting Title</h1>
bs.html.body.h1=<h1>An Interesting Title</h1>
bs.body.h1=<h1>An Interesting Title</h1>
bs.html.h1=<h1>An Interesting Title</h1>


### 1.2.2 异常处理

In [6]:
from urllib.request import urlopen
from urllib.error import HTTPError

# Skeleton for guarding a request: an HTTPError raised by urlopen is caught
# and reported instead of crashing the cell.
try:
    html = urlopen('http://pythonscraping.com/pages/page1.html')
except HTTPError as http_err:
    # Report the failure; a real program would stop here (return/break/exit).
    print(http_err)
else:
    # The request succeeded; the rest of the program would continue from here.
    pass

In [8]:
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

# Request a page on a made-up host to exercise both failure modes.
try:
    html = urlopen('http://pythonscrapingwrong.com/this/url/does/not/exist')
except HTTPError as http_err:
    # HTTPError is a subclass of URLError, so it has to be handled first or
    # the URLError branch below would catch it as well.
    print(http_err, 'HTTPError: this is a HTTPError')
except URLError:
    # Nothing from the exception object is used in the message.
    print('The server could not be found!')
else:
    print('It worked!')

HTTP Error 502: Bad Gateway HTTPError: this is a HTTPError


In [9]:
# Demonstrate the two ways a tag lookup on `bs` can fail.
# Per the Beautiful Soup docs, accessing a tag that does not exist yields
# None, so chaining a second lookup onto it raises AttributeError.
try:
    badContent = bs.nonExistingTag.anotherTag
except AttributeError:
    # The outer tag was missing, so the chained lookup ran against None.
    print('Tag was not found')
else:
    # The outer tag exists but the inner one may not. Test identity with
    # `is None` rather than `== None`, which would go through the object's
    # own __eq__ implementation.
    if badContent is None:
        print('Tag was not found')
    else:
        print(badContent)

Tag was not found


  badContent = bs.nonExistingTag.anotherTag


In [12]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTitle(url):
    """Return the first <h1> inside <body> of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The value of ``bs.body.h1`` for the parsed page, or None when the
        request fails with an HTTPError or the page has no <body>
        (the chained lookup then raises AttributeError). A page with a
        <body> but no <h1> also yields None via the lookup itself.

    Other exceptions raised by ``urlopen`` are not caught here and
    propagate to the caller.
    """
    try:
        # Read the body inside a context manager so the HTTP response is
        # closed even though only its bytes are needed afterwards.
        with urlopen(url) as response:
            markup = response.read()
    except HTTPError as e:
        print(e)
        return None
    try:
        bs = BeautifulSoup(markup, 'html.parser')
        title = bs.body.h1
    except AttributeError:
        # bs.body was None, so the .h1 lookup failed.
        return None
    return title

# Look up the heading of the sample page; getTitle returns None on failure.
title = getTitle('http://pythonscraping.com/pages/page1.html')
# `is None` is the idiomatic singleton test and does not depend on how the
# returned object implements equality.
if title is None:
    print('Title could not be found')
else:
    print(title)

<h1>An Interesting Title</h1>
