In [3]:
# 股票爬蟲
# 介紹套件
# requests：http客戶端資料庫，用於編寫爬蟲及處理網頁資料
# StringIO：從本機內部資源讀寫string
# pandas： 處理載入數據並將其視覺化，常用於數據分析
# numpy：用於數學運算及資料處理，可將數據轉成array的型式
# 爬蟲來源
# 今天要爬的資料是從以下路徑…
# 台灣證券交易所 > 交易資訊 > 盤後資訊 > 每日收盤行情 >全部(不含權證、牛熊證、可展延牛熊證) > CSV下載
import requests
from io import StringIO
import pandas as pd
import numpy as np

In [16]:
# 2. CSV網址連結的『&date=』後方日期用指定變數的方式置入
# 3. 用『requests.get』方式將網路資料先下載到本機的記憶體裡
#利用requests下載資料
# step1. import package 
import requests
import pandas as pd
import numpy as np
from io import StringIO

# step2. 進入目標網站,爬取盤後資訊
date = '20190321'
r = requests.post('http://www.twse.com.tw/exchangeReport/MI_INDEX?response=csv&date=' + date + '&type=ALL')

# step3. 篩選出個股盤後資訊
# 先使用r.text印出剛剛爬下來的資料，觀察到每個欄位是以逗號分隔、而每一筆資料是以\r\n分隔。
str_list = []
for i in r.text.split('\n'):
    if len(i.split('",')) == 17 and i[0] != '=':       
        i = i.strip(",\r\n")
        str_list.append(i)      

# step4. 印出選股資訊
# 現在各股的盤後資料已經被切割成一筆一筆的串列，為了後續的選股方便，利用pandas的read_csv( )將資料轉換成dataframe格式。不過須特別注意的是
# ，因為前面字串切割時已經把換行字元去除，因此得先透過join( )把串列透過換行符號再次合併起來。

df = pd.read_csv(StringIO("\n".join(str_list))) 

# 顯示的資料的筆數若超過100筆，pandas默認只會顯示前後100筆，可能會造成數據顯示不完整的狀況。如果希望數據能完整顯示的話，可以將pandas的列數限制設成null。
pd.set_option('display.max_rows', None) #設定最大能顯示1000rows
df.head(150)

Unnamed: 0,證券代號,證券名稱,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌(+/-),漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
0,1101,台泥,20677619,8130,800039881,38.05,38.9,38.05,38.85,+,0.8,38.85,2,38.90,252,10.67
1,1101B,台泥乙特,17250,18,891775,51.7,51.7,51.7,51.7,,0.0,51.7,2,51.80,15,0.0
2,1102,亞泥,16124722,6971,630128210,38.05,39.9,38.0,39.5,+,1.5,39.45,26,39.50,253,12.08
3,1103,嘉泥,750946,259,10526935,13.95,14.1,13.9,14.1,+,0.15,14.05,1,14.10,99,4.98
4,1104,環泥,161915,160,3191269,19.8,19.8,19.65,19.7,-,0.1,19.7,14,19.75,2,11.26
5,1108,幸福,120151,52,917201,7.6,7.66,7.6,7.65,+,0.05,7.65,16,7.67,4,0.0
6,1109,信大,92507,51,1277044,13.8,13.95,13.7,13.85,+,0.05,13.8,9,13.85,1,6.53
7,1110,東泥,91050,33,1491172,16.4,16.45,16.3,16.35,-,0.15,16.35,13,16.40,8,65.4
8,1201,味全,739848,363,17257363,23.7,23.7,23.15,23.3,-,0.25,23.3,23,23.35,33,7.37
9,1203,味王,50770,67,1327777,26.4,26.4,26.1,26.15,-,0.25,26.15,10,26.20,8,14.45


In [17]:
# 利用股票代號/股票名稱選取指定的股票
# 股票代號
index = list(df['證券代號']).index('9943')
df.loc[index:index]
# 股票名稱
index = list(df['證券名稱']).index('台積電')
df.loc[index:index]

Unnamed: 0,證券代號,證券名稱,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌(+/-),漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
272,2330,台積電,27514502,11071,6710510004,242.5,245.5,241.0,245.5,+,3.5,245.0,109,245.5,991,18.13


In [18]:
# 挑選符合特定條件的股票
# 挑選本益比小於10的股票
# to_numeric 将参数转换为数字类型
# errors  ： {'ignore'，'raise'，'coerce'}，

# 默认为'raise'

# 如果为‘raise’，

# 则无效的解析将引发异常

# 如果为 ‘coerce’，

# 则将无效解析设置为NaN

# 如果为 ‘ignore’，

# 则无效的解析将返回输入

df[(pd.to_numeric(df['本益比'], errors='coerce') < 10) &
   (pd.to_numeric(df['本益比'], errors='coerce') > 0)]

Unnamed: 0,證券代號,證券名稱,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌(+/-),漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
3,1103,嘉泥,750946,259,10526935,13.95,14.1,13.9,14.1,+,0.15,14.05,1,14.10,99,4.98
6,1109,信大,92507,51,1277044,13.8,13.95,13.7,13.85,+,0.05,13.8,9,13.85,1,6.53
8,1201,味全,739848,363,17257363,23.7,23.7,23.15,23.3,-,0.25,23.3,23,23.35,33,7.37
32,1305,華夏,2416397,987,55163831,22.65,22.9,22.65,22.9,+,0.25,22.9,27,22.95,132,7.58
35,1309,台達化,719159,306,7488033,10.45,10.55,10.3,10.5,+,0.05,10.45,29,10.50,16,5.5
37,1312,國喬,2640269,1036,63707128,24.15,24.2,24.0,24.1,,0.0,24.1,256,24.15,18,5.85
40,1314,中石化,12997651,3035,146850376,11.2,11.4,11.2,11.35,+,0.2,11.3,996,11.35,381,5.97
42,1316,上曜,7314048,2448,115640448,15.6,15.95,15.5,15.95,+,1.45,15.95,1799,--,0,2.89
51,1339,昭輝,40238,35,1439668,36.0,36.0,35.7,35.8,-,0.1,35.75,2,35.80,6,7.12
52,1340,勝悅-KY,486194,277,8984691,18.5,18.7,18.35,18.65,+,0.2,18.55,15,18.65,9,6.16


In [19]:
# 依特定欄位做排序
# 依照收盤價由低至高排序
df['收盤價'] = df['收盤價'].apply(pd.to_numeric, errors='coerce') 
df.sort_values(by='收盤價')

Unnamed: 0,證券代號,證券名稱,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌(+/-),漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
906,910482,聖馬丁-DR,189484,19,52209,0.26,0.28,0.26,0.27,,0.0,0.27,16,0.28,75,0.0
913,911619,耀傑-DR,60000,1,24000,0.40,0.40,0.40,0.4,-,0.04,--,0,0.40,165,0.0
376,2475,華映,12975297,540,5851492,0.45,0.46,0.45,0.45,-,0.05,--,0,0.45,2107,0.0
919,9157,陽光能源-DR,7999,7,5618,0.71,0.71,0.71,0.71,,0.0,0.67,13,0.71,24,0.0
912,911616,杜康-DR,242000,43,218990,0.93,0.93,0.89,0.92,-,0.01,0.89,21,0.92,38,0.0
811,6289,華上,156751,55,169411,1.11,1.13,1.03,1.11,+,0.01,1.08,14,1.11,6,6.94
911,911608,明輝-DR,113000,18,127270,1.13,1.14,1.11,1.13,-,0.01,1.13,20,1.15,20,0.0
904,9103,美德醫療-DR,175000,34,200800,1.15,1.15,1.14,1.15,-,0.01,1.15,28,1.16,77,0.0
915,911868,同方友友-DR,19000,4,22320,1.17,1.21,1.17,1.19,+,0.01,1.18,1,1.19,1,0.0
593,3383,新世紀,170796,26,234729,1.40,1.40,1.34,1.35,-,0.03,1.35,29,1.37,12,0.0


In [21]:
import requests
from bs4 import BeautifulSoup


# 網址後方加上 MARKET:STOCK_ID 即為個股資訊. e.g, TPE:2330
G_FINANCE_URL = 'https://www.google.com/search?q='


def get_web_page(url, stock_id):
    ##浏览器请求头（大部分网站没有这个请求头会报错、请务必加上哦）
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/66.0.3359.181 Safari/537.36'}
    resp = requests.get(url + stock_id, headers=headers)
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    else:
        return resp.text


def get_stock_info(dom):
    soup = BeautifulSoup(dom, 'html.parser')
    stock = dict()

    sections = soup.find_all('g-card-section')
    
    # 第 2 個 g-card-section, 取出公司名及即時股價資訊
    stock['name'] = sections[1].div.text
    # recursive=False不使用遞迴搜尋，僅尋找次一層的子節點
    spans = sections[1].find_all('div', recursive=False)[1].find_all('span', recursive=False)
    stock['current_price'] = spans[0].text
    stock['current_change'] = spans[1].text

    # 第 4 個 g-card-section, 有左右兩個 table 分別存放股票資訊
    for table in sections[3].find_all('table'):
        for tr in table.find_all('tr')[:3]:
            key = tr.find_all('td')[0].text.lower().strip()
            value = tr.find_all('td')[1].text.strip()
            stock[key] = value

    return stock


if __name__ == '__main__':
    page = get_web_page(G_FINANCE_URL, 'TPE:2330')
    if page:
        stock = get_stock_info(page)
        for k, v in stock.items():
            print(k, v)

name 台灣積體電路製造TPE: 2330
current_price 已收盤: 3月26日 上午11:29 [GMT+8] · 
current_change 免責聲明
開盤 279.50
最高 280.00
最低 275.50
殖利率 3.41%
上次收盤價 277.00
52 週高點 346.00


In [22]:
import requests
from bs4 import BeautifulSoup

def append_list_pm25():
    url = 'https://taqm.epa.gov.tw/pm25/tw/PM25A.aspx?area=1'
    html = requests.get(url)
    sp = BeautifulSoup(html.text, 'html.parser')
    rs = sp.find_all("tr", {"align": "center", "style": "border-width:1px;border-style:Solid;"})
    for r in rs:
        name = r.find('a')
        pm25 = r.find_all('span')
        dic = {}
        dic.setdefault('name',   name.text.strip())
        dic.setdefault('pm25',   pm25[0].text.strip())
        dic.setdefault('pm25_1', pm25[1].text.strip())
        list.append(dic)

def get_pm25(name):
    for d in list:
        if d.get('name') == name:
            return d

list = []
append_list_pm25()
print(list)

name = input('請輸入地區 ? (例如:林口, 桃園) : ')
d = get_pm25(name)
print(d)
print(d.get('pm25'))

[{'name': '富貴角', 'pm25': '22', 'pm25_1': '27'}, {'name': '萬里', 'pm25': '27', 'pm25_1': '30'}, {'name': '淡水', 'pm25': '15', 'pm25_1': '25'}, {'name': '林口', 'pm25': '28', 'pm25_1': '31'}, {'name': '三重', 'pm25': '28', 'pm25_1': '31'}, {'name': '菜寮', 'pm25': '21', 'pm25_1': '24'}, {'name': '汐止', 'pm25': '13', 'pm25_1': '24'}, {'name': '新莊', 'pm25': '17', 'pm25_1': '29'}, {'name': '永和', 'pm25': '17', 'pm25_1': '21'}, {'name': '板橋', 'pm25': '21', 'pm25_1': '25'}, {'name': '土城', 'pm25': '18', 'pm25_1': '22'}, {'name': '新店', 'pm25': '', 'pm25_1': '23'}, {'name': '陽明', 'pm25': '20', 'pm25_1': '18'}, {'name': '士林', 'pm25': '14', 'pm25_1': '22'}, {'name': '大同', 'pm25': '22', 'pm25_1': '27'}, {'name': '中山', 'pm25': '17', 'pm25_1': '26'}, {'name': '松山', 'pm25': '22', 'pm25_1': '26'}, {'name': '萬華', 'pm25': '23', 'pm25_1': '22'}, {'name': '古亭', 'pm25': '13', 'pm25_1': '15'}, {'name': '基隆', 'pm25': '20', 'pm25_1': '27'}, {'name': '大園', 'pm25': '21', 'pm25_1': '27'}, {'name': '觀音', 'pm25': '20', 'pm25