# 데이터 크롤링과 정제
 크롤링 시작하기

In [1]:
# 임의의 위키 페이지에서 모든 링크 목록 가져오기
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')

url_links = bs.find_all('a')
print('Kevin Bacon 링크수:', len(url_links))

for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])

Kevin Bacon 링크수: 915
/wiki/Wikipedia:Protection_policy#semi
#mw-head
#searchInput
/wiki/Kevin_Bacon_(disambiguation)
/wiki/File:Kevin_Bacon_SDCC_2014.jpg
/wiki/Philadelphia,_Pennsylvania
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
#cite_note-1
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Wikipedia:Citation_needed
http://baconbros.com/
#cite_note-2
#cite_note-actor-3
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/X-Men:_First_Class
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/

In [2]:
# 항목 링크 찾기
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen('https://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
body_content = bs.find('div', {'id': 'bodyContent'})
#
# ^(정규식 시작)... $(정규식 끝)
# (/wiki/): '/wiki/를 포함
# ((?!:).)*: ':' 콜론이 없는 문자열 및 임의의 문자가 0회 이상 반복되는 문자열 검색
pattern = '^(/wiki/)((?!:).)*$'
for link in body_content.find_all('a', href=re.compile(pattern)):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia,_Pennsylvania
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/Streaming_television
/wiki/I_Love_Dick_(TV_series)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or

In [3]:
# 링크간 무작위 이동하기: 소스 코드
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen('https://en.wikipedia.org{}'.format(articleUrl))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id': 'bodyContent'}).find_all('a',
    href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
while (len(links)) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(datetime.datetime.now())


/wiki/Colin_Firth
/wiki/George_Clooney
/wiki/Robin_Williams
/wiki/Deconstructing_Harry
/wiki/Wild_Man_Blues
/wiki/Buena_Vista_Social_Club_(film)
/wiki/13th_(film)
/wiki/Robert_Glasper
/wiki/Charles_Fox_(composer)
/wiki/List_of_Daytona_500_broadcasters
/wiki/Danielle_Trotta
/wiki/NBC_Sports_Boston
/wiki/WVNY
/wiki/Boston_Red_Sox
/wiki/Tim_Wakefield
/wiki/Cliff_Chambers
/wiki/St._Louis_Cardinals
/wiki/Paul_DeJong
/wiki/Rawlings_Gold_Glove_Award
/wiki/Scott_Rolen
/wiki/Doug_Rader
/wiki/List_of_Chicago_White_Sox_managers
/wiki/1994_Chicago_White_Sox_season
/wiki/1911_Chicago_White_Sox_season
/wiki/2007_Chicago_White_Sox_season
/wiki/Joe_Borowski_(baseball)
/wiki/2003_World_Series
/wiki/The_Yankee_Years
/wiki/Bill_Dickey
/wiki/Bob_Grim_(baseball)
/wiki/Jos%C3%A9_Canseco
/wiki/Jeremy_Hellickson
/wiki/Major_League_Baseball_Rookie_of_the_Year_Award
/wiki/1956_Major_League_Baseball_season
/wiki/1908_Major_League_Baseball_season
/wiki/Run_(baseball)
/wiki/Golden_sombrero
/wiki/2018_American_Leag

/wiki/ISNI_(identifier)
/wiki/Process_Specification_Language
/wiki/ISO_128
/wiki/OCR-B
/wiki/Optical_Character_Recognition
/wiki/Action_selection
/wiki/Cognitive_science
/wiki/Evolution_of_human_intelligence
/wiki/ISBN_(identifier)
/wiki/Barcode_system
/wiki/Wm._Wrigley_Jr._Company
/wiki/Cardinals%E2%80%93Cubs_rivalry
/wiki/1981_Major_League_Baseball_strike
/wiki/1990_Major_League_Baseball_lockout
/wiki/Free_agent
/wiki/Notts_County_F.C.
/wiki/Gary_Liddle
/wiki/Coventry_City_F.C.
/wiki/Gerry_Daly
/wiki/Mike_Connell_(soccer)
/wiki/Alan_Hudson
/wiki/Mike_Connell_(soccer)
/wiki/Defender_(association_football)
/wiki/Henrik_Larsson
/wiki/Pontus_K%C3%A5mark
/wiki/Sweden
/wiki/Finland
/wiki/Vaasa
/wiki/Salo,_Finland
/wiki/Lohja
/wiki/H%C3%A4meenlinna
/wiki/Rauma,_Finland
/wiki/Nurmij%C3%A4rvi
/wiki/Oulu
/wiki/Piippola
/wiki/Oulainen
/wiki/Sievi
/wiki/Taivalkoski
/wiki/Paavola
/wiki/Time_zone
/wiki/Kingman_Reef
/wiki/United_States_Civil_Administration_of_the_Ryukyu_Islands
/wiki/29th_parallel_

URLError: <urlopen error [WinError 10060] 연결된 구성원으로부터 응답이 없어 연결하지 못했거나, 호스트로부터 응답이 없어 연결이 끊어졌습니다>

In [None]:
# 같은 페이지를 두 번 크롤링 하지 않기
# getLinks() 함수 수정: set 사용
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set() # 세트 선언
def getLinks(pageUrl):
    global pages
    html = urlopen('https://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href = re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks('')