## 基本爬蟲 : 文章標題、作者、時間...


In [1]:
import requests

In [2]:
# Fetch the Soft_Job board index page from PTT.
url = 'https://www.ptt.cc/bbs/Soft_Job/index.html'
response = requests.get(url)

# Save the response body as an HTML file only when the server answered 200;
# otherwise just report that the page could not be fetched.
if response.status_code != 200:
  print('沒抓到網頁')
else:
  with open('output.html', mode='w', encoding='utf-8') as html_file:
    html_file.write(response.text)
  print('寫入成功')


寫入成功


- 反爬蟲
> 常見方法 : 檢查請求的 Header ( 偽裝成使用者 )、輸入驗證碼、滑動解鎖

In [3]:
# Some sites inspect the request headers, so send a browser User-Agent string.
url = 'https://www.ptt.cc/bbs/Soft_Job/index.html'
headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
}
response = requests.get(url, headers=headers)

# Overwrite output.html with the text of the returned page.
with open('output.html', mode='w', encoding='utf-8') as html_file:
  html_file.write(response.text)


- 使用 BeautifulSoup 解析網頁
> BeautifulSoup 可分析 HTML 或 XML ，也可拿來修復未閉合標籤等錯誤文件。
- 輸出 json 格式

In [4]:
# !pip install beautifulsoup4

In [5]:
from bs4 import BeautifulSoup
import json

url = 'https://www.ptt.cc/bbs/Soft_Job/index.html'
headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# Build one record per <div class="r-ent"> element found on the page.
# A missing sub-element falls back to a placeholder instead of raising.
data_list = []
for entry in soup.find_all('div', class_='r-ent'):
  title_div = entry.find('div', class_='title')
  nrec_div = entry.find('div', class_='nrec')
  date_div = entry.find('div', class_='date')

  data_list.append({
    # The title div may lack a link, hence the check on .a
    '標題': title_div.a.text if title_div and title_div.a else 'title not found',
    '人氣': nrec_div.span.text if nrec_div and nrec_div.span else 'N/A',
    '日期': date_div.text if date_div else 'N/A',
  })

# Serialize the records; ensure_ascii=False keeps non-ASCII text readable in the
# file, indent=4 pretty-prints it.
with open('ptt_soft_data.json', mode='w', encoding='utf-8') as json_file:
  json.dump(data_list, json_file, ensure_ascii=False, indent=4)
print('資料儲存成功 ptt_soft_data.json')


資料儲存成功 ptt_soft_data.json


- 輸出 excel 格式

In [6]:
# !pip install openpyxl

In [7]:
import pandas as pd

# Turn the scraped records into a table and export it as an Excel workbook,
# which is easier to read and to analyse than the raw list.
df = pd.DataFrame(data_list)
df.to_excel(
  'ptt_soft.xlsx',
  index=False,
  engine='openpyxl',
)

## 偽裝 cookie、圖片下載
撰寫爬蟲的第一步是**觀察網頁是如何取得資料的**！
- 爬蟲需使用 cookie 偽裝成使用者
> 進入網站前會先經過成年確認，透過 F12 開啟 Network 分頁可發現其中狀態碼為 302 ( 重新導向 )。進入 Application 分頁可進一步觀察 cookie ( 本例使用的是 `over18=1` )。
- 將文章中的圖片下載

In [8]:
import requests
from bs4 import BeautifulSoup

In [9]:
# Request the article with the over18=1 cookie attached to the request headers.
url = 'https://www.ptt.cc/bbs/Beauty/M.1686997472.A.FDA.html'
headers = {'Cookie': 'over18=1'}
response = requests.get(url, headers=headers)

# Parse the page and show it as indented HTML for inspection.
soup = BeautifulSoup(response.text, 'html.parser')
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   [正妹] 張景嵐 - 看板 Beauty - 批踢踢實業坊
  </title>
  <meta content="all" name="robots"/>
  <meta content="Ptt BBS 批踢踢" name="keywords"/>
  <meta content="https://i.imgur.com/eT8ofhi.jpg
https://i.imgur.com/OJrUPbr.jpg
https://i.imgur.com/MrZUajC.jpg
https://i.imgur.com/RrNJt1y.jpg
https://i.imgur.com/YGx57uU.jpg
" name="description"/>
  <meta content="Ptt 批踢踢實業坊" property="og:site_name"/>
  <meta content="[正妹] 張景嵐" property="og:title"/>
  <meta content="https://i.imgur.com/eT8ofhi.jpg
https://i.imgur.com/OJrUPbr.jpg
https://i.imgur.com/MrZUajC.jpg
https://i.imgur.com/RrNJt1y.jpg
https://i.imgur.com/YGx57uU.jpg
" property="og:description"/>
  <link href="https://www.ptt.cc/bbs/Beauty/M.1686997472.A.FDA.html" rel="canonical"/>
  <link href="//images.ptt.cc/bbs/v2.27/bbs-common.css" rel="stylesheet" type="text/css"/>
  <link href="//images.ptt.cc/bbs/v2.27/bb

In [10]:
# find_all is used because the page can contain several matching <span> elements.
spans = soup.find_all('span', class_='article-meta-value')

# Show the whole result list, then the third match as a tag and as plain text.
title_span = spans[2]
for shown in (spans, title_span, title_span.text):
  print(shown)

[<span class="article-meta-value">iwjwlmsc0707 (JEFF HSU)</span>, <span class="article-meta-value">Beauty</span>, <span class="article-meta-value">[正妹] 張景嵐</span>, <span class="article-meta-value">Sat Jun 17 18:24:28 2023</span>]
<span class="article-meta-value">[正妹] 張景嵐</span>
[正妹] 張景嵐


In [11]:
import os

# Use the text of the third article-meta-value span as the folder name.
title = spans[2].text
dir_name = f'images/{title}'  # folder that will hold the downloaded images
# exist_ok=True creates the folder (and parents) when missing and is a no-op when
# it already exists, so no separate existence check is needed.
os.makedirs(dir_name, exist_ok=True)

In [12]:
links = soup.find_all('a')  # every link on the page
allow_file_name = ['jpg','png','jpeg','gif']
for link in links:
  href = link.get('href')
  # link.get returns None for an <a> without href; skip it BEFORE calling
  # string methods on it, otherwise href.split raises AttributeError.
  if not href:
    continue
  extention = href.split('.')[-1].lower()  # text after the last dot, lower-cased
  if extention in allow_file_name:
    print(f'file: {extention}')
    print(f'url: {href}')

file: jpg
url: https://i.imgur.com/eT8ofhi.jpg
file: jpg
url: https://i.imgur.com/OJrUPbr.jpg
file: jpg
url: https://i.imgur.com/MrZUajC.jpg
file: jpg
url: https://i.imgur.com/RrNJt1y.jpg
file: jpg
url: https://i.imgur.com/YGx57uU.jpg
file: jpg
url: https://i.imgur.com/EaKmudk.jpg


In [13]:
#將上述抓取圖片的內容整合
import requests
from bs4 import BeautifulSoup
import os


def main():
  """Download the images linked from one PTT Beauty article.

  Fetches the hard-coded article URL with the ``over18=1`` cookie, takes the
  text of the third ``article-meta-value`` span as the title, creates
  ``images/<title>`` and passes every link whose extension is in the allowed
  list to ``download_img``.

  Raises:
    IndexError: if the page has fewer than three ``article-meta-value`` spans.
  """
  url = 'https://www.ptt.cc/bbs/Beauty/M.1686997472.A.FDA.html'

  headers = {'Cookie':'over18=1'}
  response = requests.get(url, headers=headers)
  soup = BeautifulSoup(response.text, 'html.parser')
  spans = soup.find_all('span', class_='article-meta-value')

  title = spans[2].text
  dir_name = f'images/{title}'
  # exist_ok=True replaces the check-then-create pattern: it creates the folder
  # when missing and does nothing when it is already there.
  os.makedirs(dir_name, exist_ok=True)

  allow_file_name = ['jpg','png','jpeg','gif']
  for link in soup.find_all('a'):
    href = link.get('href')
    if not href:
      continue  # <a> without an href attribute
    file_name = href.split('/')[-1]          # last path segment of the URL
    extention = href.split('.')[-1].lower()  # text after the last dot
    if extention in allow_file_name:
      print(f'file: {extention}')
      print(f'url: {href}')
      download_img(href, f'{dir_name}/{file_name}')


def download_img(url, save_path):
  """Download ``url`` and write the response body to ``save_path``.

  The file is written only when the server answers with status 200, so an
  error response is never saved under an image file name. A failed download
  is reported and skipped rather than raised, so the caller's loop continues.

  Args:
    url: address of the image to download.
    save_path: path of the file to write; its folder must already exist.
  """
  print(f'正在下載圖片: {url}')
  response = requests.get(url)
  if response.status_code == 200:
    # 'wb' = write binary; response.content is the raw bytes of the body.
    with open(save_path, 'wb') as file:
      file.write(response.content)
  else:
    print(f'下載失敗 ({response.status_code}): {url}')
  print('-'*30)


# Call main() only when this code is executed as the main program.
if __name__ == '__main__':
  main()

file: jpg
url: https://i.imgur.com/eT8ofhi.jpg
正在下載圖片: https://i.imgur.com/eT8ofhi.jpg
------------------------------
file: jpg
url: https://i.imgur.com/OJrUPbr.jpg
正在下載圖片: https://i.imgur.com/OJrUPbr.jpg
------------------------------
file: jpg
url: https://i.imgur.com/MrZUajC.jpg
正在下載圖片: https://i.imgur.com/MrZUajC.jpg
------------------------------
file: jpg
url: https://i.imgur.com/RrNJt1y.jpg
正在下載圖片: https://i.imgur.com/RrNJt1y.jpg
------------------------------
file: jpg
url: https://i.imgur.com/YGx57uU.jpg
正在下載圖片: https://i.imgur.com/YGx57uU.jpg
------------------------------
file: jpg
url: https://i.imgur.com/EaKmudk.jpg
正在下載圖片: https://i.imgur.com/EaKmudk.jpg
------------------------------


## ajax 爬蟲
AJAX即「Asynchronous JavaScript and XML」（非同步的JavaScript與XML技術），是 Web 應用程式開發技術的組合，可讓 Web 應用程式更快速地回應使用者互動。
1. 使用者訪問網址
2. 回傳 HTML ( 無資料 )
3. JS 發出 AJAX 請求
4. 伺服器回傳資料，JS 渲染畫面

--> 我們需要知道瀏覽器中的 JavaScript 是透過哪幾支 API 取得資料。

--> 將我們需要的 API 網址貼到線上的 JSON formatter 中，可以更清楚地觀察各個欄位。

In [14]:
import requests

url = 'https://api.hahow.in/api/products/search?category=COURSE&filter=PUBLISHED&limit=24&page=0&sort=TRENDING'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}

# Bind course_list before the request so that it is defined (as an empty list)
# even when the request fails; the Excel-export cell reads it unconditionally.
course_list = []

response = requests.get(url, headers=headers)
if response.status_code == 200:
  data = response.json()
  products = data['data']['courseData']['products']
  # Collect one row per course: [title, average rating, price, tickets sold].
  for product in products:
    course_list.append([
        product['title'],
        product['averageRating'],
        product['price'],
        product['numSoldTickets'],
    ])
  print(course_list)
else:
  print('無法取得網頁')

[['掌握鉤針編織邏輯：五種針法自造生活小物', 4.98, 2580, 3960], ['《電商群募流》電商人必學千萬銷售心法', 0, 7960, 761], ['戰爭來了怎麼辦？黑熊學院民防基礎線上課', 4.97, 2300, 3012], ['Notion 實戰課程：打造專屬數位工作術', 4.95, 3380, 11193], ['用 Python 理財：打造小資族選股策略', 4.96, 3200, 11747], ['產品設計實戰：用Figma打造絕佳UI/UX', 4.99, 4280, 7717], ['用戶x商業x數據 | UX‧三刀流產品體驗設計', 4.91, 3666, 1122], ['占星之門安格斯｜十二宮位找出人生攻略', 4.92, 2580, 3455], ['人人必修動力鏈-強化身體弱連結', 5, 4200, 2240], ['設計師接案學－業界求生必備守則', 4.96, 1800, 5379], ['讓插畫走入生活－Procreate 文創物實作！', 0, 2880, 1487], ['Python 入門特訓 - 基礎實作到證照攻略', 4.98, 1790, 5523], ['跟 YouTuber 莫彩曦學美國道地的說話習慣', 4.92, 1900, 29394], ['曾寶儀——從心開始的溝通術', 4.97, 1900, 1867], ['AI 時代人人必學：用 ChatGPT 全面升級你的工作流程', 4.8, 3280, 1205], ['邏輯表達力-周震宇的口語邏輯優化課', 4.96, 3680, 5274], ['Canva 入門到進階實戰｜零基礎做質感設計', 5, 2780, 1060], ['RyuuuTV看動漫看日劇，零到N4道地說日文', 4.97, 3500, 9196], ['聲入人心-周震宇的人聲必修課', 4.98, 3980, 8438], ['Excel 新手入門必修課：從 0 開始', 4.99, 990, 3568], ['直球對決你的工作焦慮！劉軒的 50 堂職場心理學', 5, 8000, 2027], ['電商人妻 IG 增粉攻略！三大領域經營術', 4.97, 3980, 5143], ['AutoCAD 電腦繪圖2D基礎篇－紮實入門', 4.96, 2500, 36

In [15]:
# Import under the alias this cell actually uses; a bare `import pandas` does not
# bind the name `pd`.
import pandas as pd

# pandas builds one table row from each inner list, so every course record must
# itself be a list.
df = pd.DataFrame(course_list, columns=['課程名稱','評價','價格','購買人數'])
df.to_excel('course.xlsx', index=False, engine='openpyxl')

## 電影爬蟲
- 點擊下一頁

In [54]:
import requests
from bs4 import BeautifulSoup
import time

# Accumulator filled by fetch_data(): one [name, date, length, view] row per movie.
data_list = []

def fetch_data(url):
  """Scrape the movie listing at ``url`` and every following page.

  For each movie block on a page, appends ``[name, date, length, view]`` to the
  module-level ``data_list``. Requests are sent with the module-level
  ``headers``. After each page the accumulated list is printed; when a next-page
  link is found, its href is printed and the function waits two seconds before
  requesting it.

  Pages are walked in a loop rather than by recursion, so the number of pages
  is not limited by the interpreter's recursion depth. Crawling stops when the
  pagination list is missing, has no sixth entry, that entry has no link, or
  the link points to a page that was already visited.

  Args:
    url: address of the first listing page to scrape.
  """
  # NOTE(review): replaces urllib3's default cipher string with one legacy cipher
  # suite. Presumably this was needed to connect to this server; whether it has
  # any effect depends on the installed urllib3 version. Confirm it is still
  # required before keeping it.
  requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'DES-CBC3-SHA'

  base_url = 'https://www.venice-cinemas.com.tw/movie.php'
  visited = set()
  while True:
    visited.add(url)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    for movie in soup.find_all('div', class_='row movie-list bottommargin-sm'):
      # strip() removes leading/trailing whitespace around the extracted text.
      name = movie.find('div', class_='col-md-10 col-sm-9').a.text.strip()
      info = movie.find('ul', class_='entry-meta clearfix').find_all('li')
      date = info[0].text.replace('上映日期：', '').strip()
      length = info[1].text.replace('片長：', '').strip()
      view = info[3].text.replace('瀏覽人數：', '').strip()
      data_list.append([name, date, length, view])
    print(data_list)

    # The next-page link is expected in the sixth <li> of the pagination list.
    # Check each step so a missing or short list ends the crawl instead of
    # raising AttributeError / IndexError.
    pagination = soup.find('ul', class_='pagination')
    page_items = pagination.find_all('li') if pagination else []
    next_link = page_items[5].a if len(page_items) > 5 else None
    next_url = next_link.get('href') if next_link else None
    if not next_url:
      break

    next_page = base_url + next_url
    if next_page in visited:
      break  # link leads back to a page already scraped; avoid looping forever

    print(f'正在爬取{next_url}')
    time.sleep(2)  # pause between page requests
    url = next_page


# Browser User-Agent used by fetch_data() for every page request.
headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
}

# First listing page; fetch_data() continues from here to the following pages.
url = 'https://www.venice-cinemas.com.tw/movie.php?state=1&webpage=2'
fetch_data(url)


[['還錢', '2024-02-08', '1時49分', '912人'], ['飛鴨向前衝', '2024-02-08', '1時32分', '502人'], ['小子', '2024-02-08', '1時41分', '599人'], ['青春養成記', '2024-02-08', '1時40分', '310人'], ['臨時劫案', '2024-02-08', '1時38分', '700人']]
正在爬取?state=1&page=2
[['還錢', '2024-02-08', '1時49分', '912人'], ['飛鴨向前衝', '2024-02-08', '1時32分', '502人'], ['小子', '2024-02-08', '1時41分', '599人'], ['青春養成記', '2024-02-08', '1時40分', '310人'], ['臨時劫案', '2024-02-08', '1時38分', '700人'], ['鬼太郎誕生：咯咯咯之謎', '2024-02-07', '1時45分', '765人'], ['Fukuyama Masaharu Live Film Kotodama no Sakiwau Natsu Boy Meets The Music @Nippon Budokan 2023', '2024-02-06', '2時17分', '471人'], ['愛愛愛上你', '2024-02-02', '1時43分', '1,153人'], ['劇場版SPY × FAMILY 間諜家家酒CODE: White', '2024-02-02', '1時50分', '8,973人'], ['機密特務：阿蓋爾', '2024-01-31', '2時19分', '1,539人']]
正在爬取?state=1&page=3
[['還錢', '2024-02-08', '1時49分', '912人'], ['飛鴨向前衝', '2024-02-08', '1時32分', '502人'], ['小子', '2024-02-08', '1時41分', '599人'], ['青春養成記', '2024-02-08', '1時40分', '310人'], ['臨時劫案', '2024-02-08', '1時38分', '700人'], ['鬼太郎誕生

In [55]:
# Import under the alias this cell actually uses; a bare `import pandas` does not
# bind the name `pd`.
import pandas as pd

# One table row per scraped movie, exported as an Excel workbook.
df = pd.DataFrame(data_list, columns=['片名','上映日期','片長','觀看人數'])
df.to_excel('上映中.xlsx', index=False, engine='openpyxl')