# 空氣污染監測網 網路爬蟲實作練習


* 能夠利用 selenium + BeautifulSoup 撰寫爬蟲，並存放到合適的資料結構


## 作業目標

根據範例，完成以下問題：

* ① 取出 台北市士林區 2018/01 – 2018/08 的 SO2 資料
* ② 取出 台北市士林區 2018/01 – 2018/08 的 SO2、CO 資料





In [1]:
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

In [2]:
# Address of the monthly-average query page that the browser will open.
url = 'http://taqm.epa.gov.tw/taqm/tw/MonthlyAverage.aspx'

# Start a Chrome session using the chromedriver binary at this relative path,
# then load the query page in it.
# NOTE(review): `executable_path` is deprecated in Selenium 4 - confirm which
# Selenium version this notebook is meant to run on.
browser = webdriver.Chrome(executable_path='../Chromedriver/chromedriver')
browser.get(url)

In [3]:
# Fill in the query form (site and year dropdowns) and submit it.
#
# Elements are located with find_element(By.ID, ...) because the
# find_element_by_* helpers were removed in Selenium 4.3; the By-locator form
# works on both Selenium 3 and Selenium 4.
#
# NOTE(review): site value '11' is presumably the Shilin station that the
# assignment asks for - verify against the options of the site dropdown.
selectSite = Select(browser.find_element(By.ID, 'ctl05_ddlSite'))
selectSite.select_by_value('11')
selectDate = Select(browser.find_element(By.ID, 'ctl05_ddlYear'))
selectDate.select_by_value('2018')
# Clicking the query button submits the form with the selections above.
browser.find_element(By.ID, 'ctl05_btnQuery').click()

### ① 取出 台北市士林區 2018/01 – 2018/08 的 SO2 資料

In [4]:
# Fixed two-second pause before reading the page - presumably to give the
# query submitted above time to load its result.
time.sleep(2)

# Snapshot of the page currently shown in the browser, as an HTML string.
html_source = browser.page_source

# Parse the snapshot with the lxml parser and print it for inspection.
soup = BeautifulSoup(html_source, 'lxml')
print(soup)

<html lang="zh-TW" xmlns="http://www.w3.org/1999/xhtml"><head id="Head1"><base href="https://taqm.epa.gov.tw/taqm/tw/"/><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><title>行政院環境保護署－空氣品質監測網 - 月平均值查詢</title><meta content="空氣品質監測網(TAQM)" name="title"/>
<!--link rel="govtwsimpledc.xsd" href="http://www.gov.tw/schema/dc/" title="Dublin Core" /-->
<link href="/taqm/tw/style/all.min.css?v=20180508c" rel="stylesheet" type="text/css"/>
<script async="" src="https://www.google-analytics.com/analytics.js" type="text/javascript"></script><script src="/taqm/script/jquery.min.js" type="text/javascript"></script>
<script src="/taqm/script/jquery.myslide2.min.js" type="text/javascript"></script>
<script type="text/javascript">
function DownloadReport(url) {
	var win = window.open("DownloadReport.aspx?file="+escape(url),"download" /*,"modal=yes,menubar=no,scrollbars=no,status=no,titlebar=no,toolbar=no,height=200px,width=300px"

In [5]:
# Pick the first <table> whose CSS class is TABLE_G (the query result grid)
# and print it to inspect its row/cell layout before parsing.
table = soup.find('table', class_='TABLE_G')
print(table)

<table align="Center" border="1" cellpadding="3" cellspacing="0" class="TABLE_G" id="ctl05_gv" rules="all" style="border-color:Black;border-width:1px;border-style:None;border-collapse:collapse;">
<tbody><tr style="color:Black;font-weight:normal;">
<th scope="col">監測項目</th><th scope="col">單位</th><th scope="col">監測日期</th><th scope="col">監測值</th><th scope="col">標註</th>
</tr><tr style="color:Black;">
<td class="no-alt" rowspan="12" style="white-space:nowrap;" valign="top">SO2</td><td class="no-alt" rowspan="12" valign="top">ppb</td><td>2018/01</td><td>1.80</td><td> </td>
</tr><tr class="ALT" style="color:Black;">
<td>2018/02</td><td>1.90</td><td> </td>
</tr><tr style="color:Black;">
<td>2018/03</td><td>2.20</td><td> </td>
</tr><tr class="ALT" style="color:Black;">
<td>2018/04</td><td>2.30</td><td> </td>
</tr><tr style="color:Black;">
<td>2018/05</td><td>3.10</td><td> </td>
</tr><tr class="ALT" style="color:Black;">
<td>2018/06</td><td>2.70</td><td> </td>
</tr><tr style="color:Black;">
<td>

In [6]:
# Build d = {pollutant: {month: value}} from the result table.
#   pollutant - text of the item column, e.g. 'SO2'
#   month     - time.struct_time parsed from the 'YYYY/MM' date column
#   value     - the reading, kept as the string shown on the page
#
# The item and unit cells carry a rowspan covering all rows of a pollutant,
# so the first row of each pollutant has 5 <td> cells
# (item, unit, date, value, flag) and the rows after it have only 3
# (date, value, flag).
d = {}
for tr in table.find_all('tr')[1:]:  # [1:] skips the header row
    # Collect the cell texts once per row instead of re-querying per cell.
    cells = [td.text for td in tr.find_all('td')]

    if len(cells) == 5:
        # First row of a pollutant: remember which pollutant the following
        # 3-cell rows belong to.
        obs = cells[0]
        d.setdefault(obs, {})
        date_text, value = cells[2], cells[3]
    elif len(cells) == 3:
        date_text, value = cells[0], cells[1]
    else:
        # Neither of the two known row layouts; nothing to record.
        continue

    # A blank date cell (the page fills it with a non-breaking space) has no
    # month to file the value under. Skip the row rather than reuse the
    # previous row's date or let strptime raise on the blank text.
    if not date_text.strip():
        continue

    date = time.strptime(date_text, '%Y/%m')
    d[obs][date] = value

In [7]:
def fetchData(data, fromDate, toDate, items):
    """Pick the readings of the requested items that fall inside a date range.

    Args:
        data: Mapping of item name to a mapping of date key -> value.
        fromDate: Lower bound of the range (inclusive).
        toDate: Upper bound of the range (inclusive).
        items: Iterable of item names to extract; each must be a key of
            ``data``.

    Returns:
        A new dict mapping every requested item to a dict that holds only
        the entries whose key lies between ``fromDate`` and ``toDate``.

    Raises:
        KeyError: If a requested item is not present in ``data``.
    """
    return {
        name: {
            when: reading
            for when, reading in data[name].items()
            if fromDate <= when <= toDate
        }
        for name in items
    }

In [8]:
# Range bounds as struct_time values, the same type used for the keys of d,
# so fetchData can compare them directly. Both bounds are inclusive.
fromDate = time.strptime('2018/01', '%Y/%m')
toDate = time.strptime('2018/08', '%Y/%m')

# Question 1: SO2 readings only.
ans1 = fetchData(d, fromDate, toDate, ['SO2'])

# Print the item name, then one "YYYY/MM value" line per month.
print('SO2')
for k, v in ans1['SO2'].items():
    print(time.strftime('%Y/%m', k), v)

SO2
2018/01 1.80
2018/02 1.90
2018/03 2.20
2018/04 2.30
2018/05 3.10
2018/06 2.70
2018/07 2.20
2018/08 2.40


### ② 取出 台北市士林區 2018/01 – 2018/08 的 SO2、CO 資料

In [9]:
# Same inclusive 2018/01 - 2018/08 range as in question 1.
fromDate = time.strptime('2018/01', '%Y/%m')
toDate = time.strptime('2018/08', '%Y/%m')

# Question 2: SO2 and CO readings together, keyed by item name.
ans2 = fetchData(d, fromDate, toDate, ['SO2','CO'])

In [10]:
# For every item in ans2, print its name followed by one "YYYY/MM value"
# line per month.
for k, v in ans2.items():
    print(k)
    for subk, subv in v.items():
        print(time.strftime('%Y/%m', subk), subv)

SO2
2018/01 1.80
2018/02 1.90
2018/03 2.20
2018/04 2.30
2018/05 3.10
2018/06 2.70
2018/07 2.20
2018/08 2.40
CO
2018/01 0.34
2018/02 0.44
2018/03 0.40
2018/04 0.38
2018/05 0.34
2018/06 0.29
2018/07 0.21
2018/08 0.30


In [11]:
# Scraping is finished: shut down the browser session opened earlier.
browser.quit()