## 폴더에 파일 저장하기

In [1]:
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# Download a single file from a remote URL: locate the logo image on the
# front page and save it locally as logo.jpg.
html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
logoAnchor = bs.find('a', {'id':'logo'})
logoImage = logoAnchor.find('img')
imageLocation = logoImage['src']
urlretrieve(imageLocation, 'logo.jpg')

('logo.jpg', <http.client.HTTPMessage at 0x2257eb30730>)

In [17]:
import os

# Local directory prefix under which downloaded files are stored.
downloadDirectory = 'downloaded'
# Site root (without "www."); used to build absolute URLs and to tell
# internal links from external ones.
baseUrl = 'http://pythonscraping.com'

def getAbsoluteURL(baseUrl, source):
    """Normalize a ``src`` value into an absolute URL on the target site.

    Args:
        baseUrl: Site root without a ``www.`` prefix,
            e.g. ``'http://pythonscraping.com'``.
        source: Raw ``src`` attribute value (absolute, ``www.``-prefixed,
            or relative).

    Returns:
        The absolute URL with any leading ``www.`` host prefix removed, or
        ``None`` when the resulting URL does not contain ``baseUrl``
        (i.e. it points to an external site).
    """
    if source.startswith('http://www.'):
        # len('http://www.') == 11: drop the scheme and the "www." prefix.
        url = 'http://{}'.format(source[11:])
    elif source.startswith(('https://', 'http://')):
        url = source
    elif source.startswith('www.'):
        # Strip "www." and then add the scheme to the *stripped* value;
        # formatting the untouched source would keep "www." and make the
        # baseUrl check below reject an internal link.
        url = 'http://{}'.format(source[4:])
    else:
        # Relative path: avoid a double slash when source is root-relative.
        url = '{}/{}'.format(baseUrl, source.lstrip('/'))
    if baseUrl not in url:
        return None
    return url

In [18]:
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map an absolute URL to a local file path and create its directory.

    Args:
        baseUrl: Site root without ``www.``; removed from the URL so only
            the site-relative part remains.
        absoluteUrl: URL of the file to download.
        downloadDirectory: Local directory prefix prepended to the
            site-relative part.

    Returns:
        ``downloadDirectory`` concatenated with the site-relative part of
        ``absoluteUrl``. The parent directory of that path is created if it
        does not exist yet.
    """
    # Remove only the host's "www." prefix. Deleting every "www" would leave
    # "http://.host" (so baseUrl would no longer match) and would also
    # corrupt file names that happen to contain "www".
    path = absoluteUrl.replace('://www.', '://', 1)
    path = path.replace(baseUrl, '', 1)
    path = downloadDirectory + path
    directory = os.path.dirname(path)

    # dirname is '' when the path has no directory part; makedirs('') raises.
    if directory:
        os.makedirs(directory, exist_ok=True)

    return path

In [19]:
# Download every internal file referenced by a tag that has a src attribute.
# Each internal URL is printed; only URLs ending in ".jpg" are fetched.

html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
downloadList = bs.find_all(src=True)

for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download['src'])
    if fileUrl is None:
        # getAbsoluteURL returns None for links outside baseUrl.
        continue
    print(fileUrl)
    if fileUrl.endswith('.jpg'):
        localPath = getDownloadPath(baseUrl, fileUrl, downloadDirectory)
        urlretrieve(fileUrl, localPath)

http://pythonscraping.com/misc/jquery.js?v=1.4.4
http://pythonscraping.com/misc/jquery.once.js?v=1.2
http://pythonscraping.com/misc/drupal.js?q4na2g
http://pythonscraping.com/sites/all/themes/skeletontheme/js/jquery.mobilemenu.js?q4na2g
http://pythonscraping.com/sites/all/modules/google_analytics/googleanalytics.js?q4na2g
http://pythonscraping.com/sites/default/files/lrg_0.jpg
http://pythonscraping.com/img/lrg%20(1).jpg


## csv로 저장하기 

In [20]:
import csv

# Write a header row plus ten computed rows to test.csv.
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate "\n" (e.g. Windows) emit a blank line after
# every row. The with-block closes the file even if a write fails.
with open('test.csv', 'w+', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i+2, i*2))

In [24]:
# Build a CSV file from an HTML table.

html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')
bs = BeautifulSoup(html, 'html.parser')
# Use the first table with class "wikitable" on the page.
table = bs.findAll('table', {'class':'wikitable'})[0]
rows = table.findAll('tr')

# newline='' lets the csv module control line endings (no blank rows on
# Windows); the with-block guarantees the file is closed on any error.
with open('editors.csv', 'wt+', encoding='utf-8', newline='') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # One CSV row per <tr>, taking header and data cells alike.
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csvRow)

## MySQL과 연동하기

```sql
-- Create the database used by the scraping examples and make it current.
CREATE DATABASE scraping;

USE scraping;

-- pages: one row per scraped page.
--   id      auto-incrementing primary key
--   title   page title, up to 200 characters
--   content page text, up to 10,000 characters
--   created insertion time, defaults to the current timestamp
CREATE TABLE pages (
    id BIGINT(7) NOT NULL AUTO_INCREMENT,
    title VARCHAR(200),
    content VARCHAR(10000),
    created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY(id));
    
-- Seed one test row; id and created are filled in automatically.
INSERT INTO pages (title, content) VALUES ("Test page title", "This is some test page content. It can be up to 10,000 characters long.");

```

In [25]:
import pymysql

# Connect to the local MySQL server, switch to the scraping database and
# print the row of `pages` whose id is 1.
# NOTE(review): credentials are hard-coded for a local demo instance.
conn = pymysql.connect(host='127.0.0.1',
                      user='root', passwd='1234', db='mysql')
try:
    cur = conn.cursor()
    try:
        cur.execute("USE scraping")
        cur.execute("SELECT * FROM pages WHERE id=1")
        print(cur.fetchone())
    finally:
        # Release the cursor even when a query fails.
        cur.close()
finally:
    # Always close the connection so a failed query does not leak it.
    conn.close()

(1, 'test page title', 'this is some test page content. It can be up to 10,000 characters long.', datetime.datetime(2021, 7, 19, 15, 50, 15))


MySQL은 기본적으로 유니코드를 처리하지 않습니다. 다행히 이 기능을 켤 수는 있습니다.

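아래 명령은 데이터베이스, 테이블, 그리고 두 칼럼의 문자셋을 utf8mb4로, 콜레이션을 utf8mb4_unicode_ci로 바꿉니다. (MySQL의 utf8은 이름과 달리 문자당 3바이트까지만 지원해 유니코드 지원이 형편없으므로, 모든 유니코드 문자를 저장하려면 utf8mb4를 써야 합니다.)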

```sql
-- Set the database's character set and collation.
ALTER DATABASE scraping CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;

-- Convert the existing pages table to the same character set and collation.
ALTER TABLE pages CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

-- Redefine the title column with the same character set and collation.
ALTER TABLE pages CHANGE title title VARCHAR(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

-- Redefine the content column with the same character set and collation.
ALTER TABLE pages CHANGE content content VARCHAR(10000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
```

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import pymysql
import re

In [3]:
# Open the shared connection/cursor used by store() and the crawl loop.
# charset is utf8mb4 to match the tables: MySQL's "utf8" is the 3-byte
# utf8mb3 encoding and cannot carry every Unicode character.
# NOTE(review): credentials are hard-coded for a local demo instance.
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='1234', db='mysql', charset='utf8mb4')
cur = conn.cursor()
cur.execute('USE scraping')

def store(title, content):
    """Insert one row into ``pages`` and commit it.

    Args:
        title: Page title to store.
        content: Page text to store.
    """
    # The placeholders must NOT be wrapped in quotes: the driver escapes and
    # quotes each parameter itself, so '"%s"' would store the value with
    # extra quote characters around it.
    cur.execute('INSERT INTO pages (title, content) VALUES (%s, %s)',
                (title, content))
    cur.connection.commit()

In [4]:
def getLinks(articleUrl):
    """Fetch a Wikipedia article, store it, and return its article links.

    The page's first <h1> text and the text of the first <p> inside
    div#mw-content-text are passed to store(). The return value is the list
    of <a> tags inside div#bodyContent whose href starts with "/wiki/" and
    contains no colon.

    Args:
        articleUrl: Site-relative article path, e.g. '/wiki/Kevin_Bacon'.
    """
    page = urlopen('http://en.wikipedia.org' + articleUrl)
    soup = BeautifulSoup(page, 'html.parser')

    heading = soup.find('h1')
    title = heading.get_text()
    articleBody = soup.find('div', {'id':'mw-content-text'})
    firstParagraph = articleBody.find('p')
    store(title, firstParagraph.get_text())

    articleLink = re.compile('^(/wiki/)((?!:).)*$')
    body = soup.find('div', {'id':'bodyContent'})
    return body.findAll('a', href=articleLink)

In [38]:
# Random walk over Wikipedia starting at Kevin Bacon: store each visited
# page, then follow one randomly chosen article link, until a page has no
# article links (or the run is interrupted).

# Seed from the OS/time. Passing a datetime object is rejected with a
# TypeError on Python 3.11+, where only None/int/float/str/bytes/bytearray
# are accepted as seeds.
random.seed()

try:
    # The first fetch is inside the try so the connection is released even
    # when it fails.
    links = getLinks('/wiki/Kevin_Bacon')
    while links:
        newArticle = random.choice(links).attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()

/wiki/Mickey_Rooney
/wiki/The_Beast_of_the_City
/wiki/Mismates
/wiki/Red,_White_and_Blue_Blood
/wiki/Babette_(film)
/wiki/New_Morals_for_Old
/wiki/Charles_Brabin
/wiki/Theda_Bara
/wiki/Josephine_Baker
/wiki/Stork_Club
/wiki/Open_access
/wiki/National_Institutes_of_Health
/wiki/MRNA-1273
/wiki/Hib_vaccine
/wiki/Pneumonia
/wiki/COPD
/wiki/PMID_(identifier)
/wiki/Doi_(identifier)
/wiki/Publications_Office_(European_Union)
/wiki/Diplomatic_missions_of_the_European_Commission
/wiki/List_of_diplomatic_missions_of_Europe
/wiki/European_Union_Operations_Centre
/wiki/Kortenberg_building
/wiki/European_Organisation_of_Military_Associations
/wiki/Luxembourg_Army#Luxembourg_Army_Air_Force
/wiki/EUBG_2014_II
/wiki/Luxembourg
/wiki/Holy_Roman_Empire
/wiki/Francia
/wiki/Provisional_Government_of_the_French_Republic
/wiki/Free_France
/wiki/Timeline_of_World_War_II_(1943)
/wiki/Military_history_of_the_Philippines_during_World_War_II
/wiki/First_Indochina_War
/wiki/Greek_Civil_War
/wiki/Parataxis
/wiki/

KeyboardInterrupt: 

## 보다 효율적으로 크롤링하기

테이블 두 개 만들기
1. 페이지 목록
2. 두 페이지를 잇는 링크 목록

```sql
-- Create the database for the link-graph crawler and make it current.
create DATABASE wikipedia;
USE wikipedia;

-- pages: one row per discovered page URL.
CREATE TABLE pages (
	id INT NOT NULL auto_increment,
    url varchar(255) not null,
    created timestamp not null default current_timestamp,
    primary key(id));

-- links: one row per link between two pages; fromPageId and toPageId hold
-- pages.id values (no foreign-key constraint is declared).
create table links (
	id int not null auto_increment,
    fromPageId int null,
    toPageId int null,
    created timestamp not null default current_timestamp,
    primary key(id));
    
commit;
```

In [12]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql
from random import shuffle

In [13]:
# Open the shared connection and cursor used by the crawler helpers below,
# then select the wikipedia database.
conn = pymysql.connect(
    host='127.0.0.1',
    user='root',
    passwd='1234',
    db='mysql',
    charset='utf8',
)
cur = conn.cursor()
cur.execute('USE wikipedia')

0

In [14]:
# Store each newly discovered page exactly once (avoids duplicates).
def insertPageIfNotExists(url):
    """Return the id of the ``pages`` row for ``url``, inserting it if absent.

    Args:
        url: Page URL to look up or insert.

    Returns:
        The id of the existing row, or of the newly inserted (and
        committed) row.
    """
    # Parameters are passed as a real one-element tuple; "(url)" is just a
    # parenthesized string. Only the id column is needed, and fetching the
    # row directly avoids depending on cursor.rowcount for a SELECT.
    cur.execute('SELECT id FROM pages WHERE url = %s', (url,))
    row = cur.fetchone()
    if row is not None:
        return row[0]

    # New page: insert it and return the generated row id.
    cur.execute('INSERT INTO pages (url) VALUES (%s)', (url,))
    conn.commit()
    return cur.lastrowid

In [15]:
# Return every page stored in the database.
def loadPages():
    """Return the second column (url) of every row in ``pages`` as a list."""
    cur.execute('SELECT * FROM pages')
    urls = []
    for record in cur.fetchall():
        # Index 1 is the column after id.
        urls.append(record[1])
    return urls

In [16]:
# Record a link between two pages.
def insertLink(fromPageId, toPageId):
    """Insert a (fromPageId, toPageId) row into ``links`` unless one exists.

    Both ids are coerced with int() before being sent to the database. The
    insert is committed immediately.
    """
    lookupArgs = (int(fromPageId), int(toPageId))
    cur.execute('SELECT * FROM links WHERE fromPageId = %s AND toPageId = %s',
                lookupArgs)
    if cur.rowcount != 0:
        # The link is already recorded.
        return

    insertArgs = (int(fromPageId), int(toPageId))
    cur.execute('INSERT INTO links (fromPageId, toPageId) VALUES (%s, %s)',
                insertArgs)
    conn.commit()

In [17]:
# Check whether the links table has any row whose fromPageId is this page.
def pageHasLinks(pageId):
    """Return True if ``links`` contains at least one row from ``pageId``."""
    sourceId = int(pageId)
    cur.execute('SELECT * FROM links WHERE fromPageId = %s', sourceId)
    return cur.rowcount != 0

In [25]:
# Six degrees of Wikipedia.
# Recursion stops once recursionLevel exceeds 4.
def getLinks(pageUrl, recursionLevel, pages):
    """Record a page's outgoing article links and recurse into new pages.

    Args:
        pageUrl: Site-relative article path, e.g. '/wiki/Kevin_Bacon'.
        recursionLevel: Current depth; the call returns immediately when it
            is greater than 4.
        pages: List that each newly followed link is appended to.
    """
    if recursionLevel > 4:
        return

    pageId = insertPageIfNotExists(pageUrl)  # id used as fromPageId
    response = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    soup = BeautifulSoup(response, 'html.parser')

    # hrefs that start with /wiki/ and contain no colon
    articleLink = re.compile('^(/wiki/)((?!:).)*$')
    hrefs = [anchor.attrs['href'] for anchor in soup.findAll('a', href=articleLink)]

    for href in hrefs:
        linkId = insertPageIfNotExists(href)  # id used as toPageId
        insertLink(pageId, linkId)

        if pageHasLinks(linkId):
            # Outgoing links of this page are already recorded.
            continue

        print("PAGE HAS NO LINKS: {}".format(href))
        pages.append(href)
        getLinks(href, recursionLevel+1, pages)

In [26]:
# Crawl outward from Kevin Bacon, starting from the pages already stored.
# The finally-block releases the cursor and connection even when the crawl
# is interrupted (e.g. KeyboardInterrupt) or a fetch fails.
try:
    getLinks('/wiki/Kevin_Bacon', 0, loadPages())
finally:
    cur.close()
    conn.close()

PAGE HAS NO LINKS: /wiki/Kevin_Bacon_(disambiguation)
PAGE HAS NO LINKS: /wiki/Kevin_Bacon_(producer)
PAGE HAS NO LINKS: /wiki/Rotherham
PAGE HAS NO LINKS: /wiki/Rotherham_(disambiguation)
PAGE HAS NO LINKS: /wiki/Rotherham_(UK_Parliament_constituency)
PAGE HAS NO LINKS: /wiki/Metropolitan_Borough_of_Rotherham
PAGE HAS NO LINKS: /wiki/Rotherham,_New_Zealand
PAGE HAS NO LINKS: /wiki/Alan_Rotherham
PAGE HAS NO LINKS: /wiki/Arthur_Rotherham
PAGE HAS NO LINKS: /wiki/Gerard_Rotherham
PAGE HAS NO LINKS: /wiki/Hugh_Rotherham
PAGE HAS NO LINKS: /wiki/Joseph_Bryant_Rotherham
PAGE HAS NO LINKS: /wiki/Roland_Rotherham
PAGE HAS NO LINKS: /wiki/Thomas_Rotherham
PAGE HAS NO LINKS: /wiki/Edward_Rotheram
PAGE HAS NO LINKS: /wiki/Steve_Rotheram
PAGE HAS NO LINKS: /wiki/HMS_Rotherham_(H09)
PAGE HAS NO LINKS: /wiki/Baron_Rotherham
PAGE HAS NO LINKS: /wiki/Rotherham_United_F.C.
PAGE HAS NO LINKS: /wiki/Rotherham_child_sexual_exploitation_scandal
PAGE HAS NO LINKS: /wiki/Main_Page
PAGE HAS NO LINKS: /wiki/

PAGE HAS NO LINKS: /wiki/South_Yorkshire_Police
PAGE HAS NO LINKS: /wiki/Metropolitan_and_non-metropolitan_counties_of_England
PAGE HAS NO LINKS: /wiki/Ceremonial_counties_of_England
PAGE HAS NO LINKS: /wiki/Lord_Lieutenant_of_South_Yorkshire
PAGE HAS NO LINKS: /wiki/High_Sheriff_of_South_Yorkshire
PAGE HAS NO LINKS: /wiki/Derbyshire
PAGE HAS NO LINKS: /wiki/West_Yorkshire
PAGE HAS NO LINKS: /wiki/North_Yorkshire
PAGE HAS NO LINKS: /wiki/East_Riding_of_Yorkshire
PAGE HAS NO LINKS: /wiki/Lincolnshire
PAGE HAS NO LINKS: /wiki/Nottinghamshire
PAGE HAS NO LINKS: /wiki/Humberhead_Levels
PAGE HAS NO LINKS: /wiki/South_Yorkshire_Coalfield
PAGE HAS NO LINKS: /wiki/Peak_District
PAGE HAS NO LINKS: /wiki/Dark_Peak
PAGE HAS NO LINKS: /wiki/River_Dearne
PAGE HAS NO LINKS: /wiki/River_Rother,_South_Yorkshire
PAGE HAS NO LINKS: /wiki/River_Don,_South_Yorkshire
PAGE HAS NO LINKS: /wiki/South_and_West_Yorkshire_Green_Belt
PAGE HAS NO LINKS: /wiki/Green_belt_(United_Kingdom)
PAGE HAS NO LINKS: /wiki/Li

PAGE HAS NO LINKS: /wiki/Yorkshire_Wildlife_Park
PAGE HAS NO LINKS: /wiki/The_London_Gazette
PAGE HAS NO LINKS: /wiki/Wayback_Machine
PAGE HAS NO LINKS: /wiki/Office_of_the_Deputy_Prime_Minister
PAGE HAS NO LINKS: /wiki/Office_for_National_Statistics
PAGE HAS NO LINKS: /wiki/Office_for_National_Statistics
PAGE HAS NO LINKS: /wiki/Doi_(identifier)
PAGE HAS NO LINKS: /wiki/Doi_(identifier)
PAGE HAS NO LINKS: /wiki/Local_Government_Act_1972
PAGE HAS NO LINKS: /wiki/The_Times
PAGE HAS NO LINKS: /wiki/Wayback_Machine
PAGE HAS NO LINKS: /wiki/Wayback_Machine
PAGE HAS NO LINKS: /wiki/English_Heritage_Archive
PAGE HAS NO LINKS: /wiki/Curlie
PAGE HAS NO LINKS: /wiki/Ceremonial_counties_of_England
PAGE HAS NO LINKS: /wiki/City_of_Sheffield
PAGE HAS NO LINKS: /wiki/Metropolitan_Borough_of_Barnsley
PAGE HAS NO LINKS: /wiki/Metropolitan_Borough_of_Doncaster
PAGE HAS NO LINKS: /wiki/Metropolitan_Borough_of_Rotherham
PAGE HAS NO LINKS: /wiki/Askern
PAGE HAS NO LINKS: /wiki/Barnsley
PAGE HAS NO LINKS:

PAGE HAS NO LINKS: /wiki/Census_in_the_United_Kingdom
PAGE HAS NO LINKS: /wiki/Internet
PAGE HAS NO LINKS: /wiki/Office_for_National_Statistics
PAGE HAS NO LINKS: /wiki/England
PAGE HAS NO LINKS: /wiki/Wales
PAGE HAS NO LINKS: /wiki/General_Register_Office_for_Scotland
PAGE HAS NO LINKS: /wiki/Scotland
PAGE HAS NO LINKS: /wiki/Northern_Ireland_Statistics_and_Research_Agency
PAGE HAS NO LINKS: /wiki/Northern_Ireland
PAGE HAS NO LINKS: /wiki/UK_Statistics_Authority
PAGE HAS NO LINKS: /wiki/List_of_national_and_international_statistical_services
PAGE HAS NO LINKS: /wiki/John_Rickman_(parliamentary_official)
PAGE HAS NO LINKS: /wiki/Jil_Matheson
PAGE HAS NO LINKS: /wiki/Office_for_National_Statistics
PAGE HAS NO LINKS: /wiki/Lockheed_Martin_UK
PAGE HAS NO LINKS: /wiki/Census_Act_1920
PAGE HAS NO LINKS: /wiki/Lockheed_Martin_UK
PAGE HAS NO LINKS: /wiki/Aerospace_manufacturer
PAGE HAS NO LINKS: /wiki/Defense_(military)
PAGE HAS NO LINKS: /wiki/Information_security
PAGE HAS NO LINKS: /wiki/Lo

PAGE HAS NO LINKS: /wiki/Restrictions_on_geographic_data_in_China
PAGE HAS NO LINKS: /wiki/Geo_URI_scheme
PAGE HAS NO LINKS: /wiki/International_Terrestrial_Reference_System
PAGE HAS NO LINKS: /wiki/SRID
PAGE HAS NO LINKS: /wiki/Universal_Transverse_Mercator_coordinate_system
PAGE HAS NO LINKS: /wiki/Grid_reference
PAGE HAS NO LINKS: /wiki/Latitude
PAGE HAS NO LINKS: /wiki/Longitude
PAGE HAS NO LINKS: /wiki/Ordnance_Survey
PAGE HAS NO LINKS: /wiki/British_Isles
PAGE HAS NO LINKS: /wiki/Isle_of_Man
PAGE HAS NO LINKS: /wiki/Irish_grid_reference_system
PAGE HAS NO LINKS: /wiki/Ordnance_Survey_of_Ireland
PAGE HAS NO LINKS: /wiki/Ordnance_Survey_of_Northern_Ireland
PAGE HAS NO LINKS: /wiki/Universal_Transverse_Mercator_coordinate_system
PAGE HAS NO LINKS: /wiki/Channel_Islands
PAGE HAS NO LINKS: /wiki/Irish_Transverse_Mercator
PAGE HAS NO LINKS: /wiki/Military_Grid_Reference_System
PAGE HAS NO LINKS: /wiki/The_Wash
PAGE HAS NO LINKS: /wiki/North_Sea
PAGE HAS NO LINKS: /wiki/Lincolnshire
PAG

PAGE HAS NO LINKS: /wiki/Lieutenancy_area
PAGE HAS NO LINKS: /wiki/Counties_of_the_United_Kingdom
PAGE HAS NO LINKS: /wiki/List_of_counties_of_the_United_Kingdom
PAGE HAS NO LINKS: /wiki/Local_government_in_England
PAGE HAS NO LINKS: /wiki/History_of_local_government_in_England
PAGE HAS NO LINKS: /wiki/Subdivisions_of_England
PAGE HAS NO LINKS: /wiki/Regions_of_England
PAGE HAS NO LINKS: /wiki/Ceremonial_counties_of_England
PAGE HAS NO LINKS: /wiki/List_of_ceremonial_counties_of_England
PAGE HAS NO LINKS: /wiki/Metropolitan_and_non-metropolitan_counties_of_England
PAGE HAS NO LINKS: /wiki/Unitary_authorities_of_England
PAGE HAS NO LINKS: /wiki/List_of_English_districts
PAGE HAS NO LINKS: /wiki/Civil_parishes_in_England
PAGE HAS NO LINKS: /wiki/List_of_civil_parishes_in_England
PAGE HAS NO LINKS: /wiki/Historic_counties_of_England
PAGE HAS NO LINKS: /wiki/Local_government_in_Northern_Ireland
PAGE HAS NO LINKS: /wiki/Local_government_in_Northern_Ireland#History
PAGE HAS NO LINKS: /wiki/C

KeyboardInterrupt: 

## 이메일 활용하기

In [27]:
# SMTP 서버를 로컬에서 실행하기
import smtplib
from email.mime.text import MIMEText

# Build a plain-text message and hand it to an SMTP server on localhost.
msg = MIMEText('The body of the email is here')

msg['Subject'] = 'An Email Alert'
msg['From'] = 'ryan@pythonscraping.com'
msg['To'] = 'webmaster@pythonscraping.com'

# The context manager sends QUIT and closes the connection on exit, even
# when send_message raises.
with smtplib.SMTP('localhost') as s:
    s.send_message(msg)

ConnectionRefusedError: [WinError 10061] 대상 컴퓨터에서 연결을 거부했으므로 연결하지 못했습니다

In [28]:
# 크리스마스일 때 메일 보내기
import smtplib
from email.mime.text import MIMEText
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time

def sendMail(subject, body):
    """Send a plain-text e-mail through the SMTP server on localhost.

    Args:
        subject: Value for the Subject header.
        body: Plain-text message body.

    Raises:
        OSError: If the SMTP server cannot be reached (e.g.
            ConnectionRefusedError when nothing listens on localhost).
    """
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] ='christmas_alerts@pythonscraping.com'
    msg['To'] = 'ryan@pythonscraping.com'

    # The context manager sends QUIT and closes the connection on exit,
    # even when send_message raises.
    with smtplib.SMTP('localhost') as s:
        s.send_message(msg)

# Poll isitchristmas.com until the answer link's title is no longer 'NO',
# then send the alert e-mail.
while True:
    bs = BeautifulSoup(urlopen('https://isitchristmas.com/'), 'html.parser')
    if bs.find('a', {'id':'answer'}).attrs['title'] != 'NO':
        break
    print('It is not Christmas yet.')
    time.sleep(3600) # check the website once an hour

# The body names the site that was actually polled (the original text
# pointed at a different, misspelled domain).
sendMail('It\'s Christmas!',
         'According to https://isitchristmas.com, it is Christmas!')

ConnectionRefusedError: [WinError 10061] 대상 컴퓨터에서 연결을 거부했으므로 연결하지 못했습니다