## 단일 도메인 내 이동

In [1]:
# 위키백과의 여섯 다리: 
# 에릭 아이들의 페이지에서 시작해 케빈 베이컨의 페이지에 닿는 최소한의 클릭 수

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

In [3]:
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')

# 링크들은 id가 bodyContent인 div 안에 있으며, url에는 콜론이 없음
# url 모두 /wiki/로 시작함
for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia,_Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/Streaming_television
/wiki/I_Love_Dick_(TV_series)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy
/wiki/The_Guardian
/wi

In [9]:
## 새 페이지에 항목 링크가 없을 때까지 새 페이지 안에 있는 링크들 중 랜덤 선택
## 아래 코드의 경우 링크 10개만 열기

import datetime
import random

def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all(
        'a', href=re.compile('^(/wiki/)((?!:).)*$'))

In [10]:
random.seed(datetime.datetime.now())

links = getLinks('/wiki/Kevin_Bacon')
cnt = 0
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    cnt+=1
    if cnt == 10: break
    links = getLinks(newArticle)

/wiki/She%27s_Having_a_Baby
/wiki/This_Woman%27s_Work
/wiki/Music_genre
/wiki/Songwriter
/wiki/List_of_independent_UK_record_labels
/wiki/White_label
/wiki/Entertainment_law
/wiki/Concordat
/wiki/Concordat_of_2004
/wiki/Concordat_of_1940


## 전체 사이트 크롤링

In [17]:
pages = set()
cnt = 0
def getLinks(pageUrl):
    global pages
    global cnt
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if cnt == 10: return
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # 새로운 페이지
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                cnt += 1
                getLinks(newPage)

getLinks('')

/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions
/wiki/Wikipedia:Protection_policy#extended
/wiki/Wikipedia:Lists_of_protected_pages
/wiki/Wikipedia:Protection_policy
/wiki/Wikipedia:Perennial_proposals
/wiki/Wikipedia:Reliable_sources/Perennial_sources
/wiki/Wikipedia:Reliable_sources


In [18]:
# 제목, 첫 번째 문단, 편집 링크 출력하고 다음 페이지로 가기

pages = set()
cnt = 0
def getLinks(pageUrl):
    global pages
    global cnt
    
    html = urlopen('http://en.wikipedia.org'+pageUrl)
    bs = BeautifulSoup(html, 'html.parser')
    
    try:
        print(bs.h1.get_text())
        print(bs.find(id = 'mw-content-text').findAll('p')[0])
        print(bs.find(id = 'ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! No worries though!')
        
    for link in bs.findAll('a', href=re.compile('^(/wiki/)')):
        if cnt == 10: return
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print('-------------------\n' + newPage)
                pages.add(newPage)
                cnt += 1
                getLinks(newPage)

getLinks('')

Main Page
<p><b><a href="/wiki/Boeing_CH-47_Chinook_in_Australian_service" title="Boeing CH-47 Chinook in Australian service">In Australia, Boeing <span class="nowrap">CH-47</span> Chinook heavy-lift helicopters</a></b> have been operated by the <a href="/wiki/Australian_Defence_Force" title="Australian Defence Force">Australian Defence Force</a> (ADF) for most of the period since 1974. Twenty-six <a href="/wiki/Boeing_CH-47_Chinook" title="Boeing CH-47 Chinook">Chinooks</a> have entered Australian service, comprising twelve <span class="nowrap">CH-47C</span> variants, four <span class="nowrap">CH-47Ds</span> and ten <span class="nowrap">CH-47Fs</span>. They have been operated by both the <a href="/wiki/Royal_Australian_Air_Force" title="Royal Australian Air Force">Royal Australian Air Force</a> (RAAF) and the <a href="/wiki/Australian_Army" title="Australian Army">Australian Army</a>. Twelve <span class="nowrap">CH-47C</span> Chinooks were ordered in 1970 and entered service with the 

## 인터넷 크롤링

In [19]:
from urllib.parse import urlparse

In [20]:
# 페이지에서 발견된 내부 링크를 모두 목록으로 만듦
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    
    # /로 시작하는 링크 모두 찾기
    for link in bs.findAll('a', href=re.compile('^(/|.*' + includeUrl + ')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

In [21]:
# 페이지에서 발견된 외부 링크를 모두 목록으로 만듦
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    
    # 현재 url를 포함하지 않으면서 http나 www로 시작하는 링크 모두 찾기
    for link in bs.findAll('a',
                    href = re.compile('^(http|www)((?!' + excludeUrl + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

In [22]:
# 페이지의 외부링크 중 무작위로 가져옴 
# 없으면 아무 내부 링크가 들어가서 그 페이지의 외부링크 중 하나 가져옴
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print('No external links, looking around the site for one')
        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

In [23]:
cnt = 0
def followExternalOnly(startingSite):
    global cnt
    if cnt == 10: return
    
    externalLink = getRandomExternalLink(startingSite)
    print('Random external link is : {}'.format(externalLink))
    cnt += 1
    followExternalOnly(externalLink)

In [24]:
followExternalOnly('http://oreilly.com')

Random external link is : https://play.google.com/store/apps/details?id=com.safariflow.queue
Random external link is : https://payments.google.com/legaldocument?family=0.privacynotice&hl=en_US
Random external link is : https://www.google.com/policies/privacy/
Random external link is : https://policies.google.com/privacy?hl=ko&gl=kr
Random external link is : https://support.google.com/websearch?p=privpol_incognito&hl=ko
Random external link is : https://play.google.com/?hl=ko&tab=u8
Random external link is : https://support.google.com/googleplay?p=pff_parentguide
Random external link is : https://docs.google.com/spreadsheets/?usp=sheets_alc
Random external link is : https://www.youtube.com/channel/UC-VcHhNdm1soluxzo5Fm3fQ
Random external link is : https://policies.google.com/privacy?hl=ko
