## 폴더에 파일 저장하기

In [1]:
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# 원격 url의 파일 내려받기
html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
imageLocation = bs.find('a', {'id':'logo'}).find('img')['src']
urlretrieve(imageLocation, 'logo.jpg')

('logo.jpg', <http.client.HTTPMessage at 0x2257eb30730>)

In [17]:
import os

downloadDirectory = 'downloaded'
baseUrl = 'http://pythonscraping.com'

def getAbsoluteURL(baseUrl, source):
    if source.startswith('http://www.'):
        url = 'http://{}'.format(source[11:])
    elif source.startswith('https://') or source.startswith('http://'):
        url = source
    elif source.startswith('www.'):
        url = source[4:]
        url = 'http://{}'.format(source)
    else:
        url = '{}/{}'.format(baseUrl, source)
    if baseUrl not in url:
        return None
    return url

In [18]:
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    path = absoluteUrl.replace('www', '')
    path = path.replace(baseUrl, '')
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    
    if not os.path.exists(directory):
        os.makedirs(directory)
        
    return path

In [19]:
# src 속성이 있는 태그에 연결된 내부 파일을 모두 내려받습니다.

html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
downloadList = bs.find_all(src=True)

for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download['src'])
    if fileUrl is not None:
        print(fileUrl)
        if  fileUrl[-4:] == '.jpg':
            urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))

http://pythonscraping.com/misc/jquery.js?v=1.4.4
http://pythonscraping.com/misc/jquery.once.js?v=1.2
http://pythonscraping.com/misc/drupal.js?q4na2g
http://pythonscraping.com/sites/all/themes/skeletontheme/js/jquery.mobilemenu.js?q4na2g
http://pythonscraping.com/sites/all/modules/google_analytics/googleanalytics.js?q4na2g
http://pythonscraping.com/sites/default/files/lrg_0.jpg
http://pythonscraping.com/img/lrg%20(1).jpg


## csv로 저장하기 

In [20]:
import csv

csvFile = open('test.csv', 'w+')

try:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i+2, i*2))
finally:
    csvFile.close()

In [24]:
# html 테이블로 csv 파일 만들기

html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')
bs = BeautifulSoup(html, 'html.parser')
# 첫 번째 테이블로 만들기
table = bs.findAll('table', {'class':'wikitable'})[0]
rows = table.findAll('tr')

csvFile = open('editors.csv', 'wt+', encoding='utf-8')
writer = csv.writer(csvFile)

try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()

## MySQL과 연동하기

```sql
CREATE DATABASE scraping;

USE scraping;

CREATE TABLE pages (
    id BIGINT(7) NOT NULL AUTO_INCREMENT,
    title VARCHAR(200),
    content VARCHAR(10000),
    created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY(id));
    
INSERT INTO pages (title, content) VALUES ("Test page title", "This is some test page content. It can be up to 10,000 characters long.");

```

In [25]:
import pymysql

conn = pymysql.connect(host='127.0.0.1',
                      user='root', passwd='1234', db='mysql')
cur = conn.cursor()
cur.execute("USE scraping")
cur.execute("SELECT * FROM pages WHERE id=1")
print(cur.fetchone())

cur.close()
conn.close()

(1, 'test page title', 'this is some test page content. It can be up to 10,000 characters long.', datetime.datetime(2021, 7, 19, 15, 50, 15))


MySQL은 기본적으로 유니코드를 처리하지 않습니다. 다행히 이 기능을 켤 수는 있습니다.

유니코드긴 하지만 지원이 형편없는 utf8mb4에서 utf8mb4_unicode_ci로 바꿉니다.

```sql
ALTER DATABASE scraping CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;

ALTER TABLE pages CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

ALTER TABLE pages CHANGE title title VARCHAR(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

ALTER TABLE pages CHANGE content content VARCHAR(10000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
```

In [35]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import pymysql
import re

In [36]:
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='1234', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute('USE scraping')

def store(title, content):
    cur.execute('INSERT INTO pages (title, content) VALUES ("%s", "%s")',
               (title, content))
    cur.connection.commit()

In [37]:
def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org' + articleUrl)
    bs = BeautifulSoup(html, 'html.parser')
    title = bs.find('h1').get_text()
    content = bs.find('div', {'id':'mw-content-text'}).find('p').get_text()
    store(title, content)
    return bs.find('div', {'id':'bodyContent'}).findAll('a', href=re.compile('^(/wiki/)((?!:).)*$'))

In [38]:
random.seed(datetime.datetime.now())

links = getLinks('/wiki/Kevin_Bacon')
try:
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()

/wiki/Mickey_Rooney
/wiki/The_Beast_of_the_City
/wiki/Mismates
/wiki/Red,_White_and_Blue_Blood
/wiki/Babette_(film)
/wiki/New_Morals_for_Old
/wiki/Charles_Brabin
/wiki/Theda_Bara
/wiki/Josephine_Baker
/wiki/Stork_Club
/wiki/Open_access
/wiki/National_Institutes_of_Health
/wiki/MRNA-1273
/wiki/Hib_vaccine
/wiki/Pneumonia
/wiki/COPD
/wiki/PMID_(identifier)
/wiki/Doi_(identifier)
/wiki/Publications_Office_(European_Union)
/wiki/Diplomatic_missions_of_the_European_Commission
/wiki/List_of_diplomatic_missions_of_Europe
/wiki/European_Union_Operations_Centre
/wiki/Kortenberg_building
/wiki/European_Organisation_of_Military_Associations
/wiki/Luxembourg_Army#Luxembourg_Army_Air_Force
/wiki/EUBG_2014_II
/wiki/Luxembourg
/wiki/Holy_Roman_Empire
/wiki/Francia
/wiki/Provisional_Government_of_the_French_Republic
/wiki/Free_France
/wiki/Timeline_of_World_War_II_(1943)
/wiki/Military_history_of_the_Philippines_during_World_War_II
/wiki/First_Indochina_War
/wiki/Greek_Civil_War
/wiki/Parataxis
/wiki/

KeyboardInterrupt: 