# AAI614: Data Science & its Applications

*Notebook 3.1: Practice with Data Collections*

<a href="https://colab.research.google.com/github/harmanani/AAI614/blob/main/Week%203/Notebook3.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Source: Scraping with Python http://shop.oreilly.com/product/0636920034391.do

In [147]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

html = urlopen('https://en.wikipedia.org/wiki/Tesla,_Inc.')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Tesla%2C+Inc.
/w/index.php?title=Special:UserLogin&returnto=Tesla%2C+Inc.
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Tesla%2C+Inc.
/w/index.php?title=Special:UserLogin&returnto=Tesla%2C+Inc.
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#History
#Founding_(2003–2004)
#Roadst

## Retrieving Articles Only

In [149]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen('https://en.wikipedia.org/wiki/Tesla,_Inc.')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    print(link.attrs['href'])

/wiki/Geographic_coordinate_system
/wiki/Induction_motor
/wiki/AC_motor
/wiki/Tesla_a.s.
/wiki/Tesla_(disambiguation)
/wiki/Gigafactory_Texas
/wiki/Headquarters
/wiki/Austin,_Texas
/wiki/Public_company
/wiki/Ticker_symbol
/wiki/Nasdaq
/wiki/Nasdaq-100
/wiki/S%26P_100
/wiki/S%26P_500
/wiki/International_Securities_Identification_Number
/wiki/Automotive_industry
/wiki/Renewable_energy_industry
/wiki/San_Carlos,_California
/wiki/Martin_Eberhard
/wiki/Marc_Tarpenning
/wiki/Austin,_Texas
/wiki/Tesla_Supercharger
/wiki/Robyn_Denholm
/wiki/Chair_(officer)
/wiki/Elon_Musk
/wiki/Chief_executive_officer
/wiki/Tesla_Model_S
/wiki/Tesla_Model_X
/wiki/Tesla_Model_3
/wiki/Tesla_Model_Y
/wiki/Tesla_Semi
/wiki/Tesla_Cybertruck
/wiki/Tesla_Powerwall
/wiki/Tesla_Megapack
/wiki/Tesla_solar_panels
/wiki/Tesla_Solar_Roof
/wiki/Tesla_Supercharger
/wiki/United_States_dollar
/wiki/Earnings_before_interest_and_taxes
/wiki/Net_income
/wiki/Asset
/wiki/Equity_(finance)
/wiki/Subsidiary
/wiki/Tesla_Automation
/wi

## Random Walk

In [159]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

#the random seed initialization
random.seed(datetime.datetime.now().timestamp())

def getLinks(articleUrl):
    # Open the webpage
    html = urlopen(f'http://en.wikipedia.org{articleUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    
    # Find all relevant links
    return bs.find('div', {'id': 'bodyContent'}).find_all(
        'a', href=re.compile('^(/wiki/)((?!:).)*$')
    )

# Starting URL
links = getLinks('/wiki/Tesla,_Inc.')

# Limit the number of iterations to avoid infinite crawling
limit = 10
count = 0

# Crawl through links
while len(links) > 0 and count < limit:
    # Select a random link
    newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
    print(f"http://en.wikipedia.org{newArticle}")
    
    # Get new links from the selected article
    links = getLinks(newArticle)
    count += 1

http://en.wikipedia.org/wiki/Actuate_Corporation
http://en.wikipedia.org/wiki/United_States_Dollar
http://en.wikipedia.org/wiki/Euro
http://en.wikipedia.org/wiki/Eritrean_nakfa
http://en.wikipedia.org/wiki/Informal_economy
http://en.wikipedia.org/wiki/Open_access
http://en.wikipedia.org/wiki/State_ownership
http://en.wikipedia.org/wiki/Primitive_accumulation_of_capital
http://en.wikipedia.org/wiki/R._Palme_Dutt
http://en.wikipedia.org/wiki/Jersey_Communist_Party


## Recursively crawling an entire site

In [161]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Store visited pages
pages = set()

def getLinks(pageUrl, depth=0, max_depth=3, max_pages=100):
    """
    Crawl links up to a certain depth and page limit.
    
    Args:
        pageUrl (str): The starting Wikipedia URL.
        depth (int): Current recursion depth.
        max_depth (int): Maximum recursion depth allowed.
        max_pages (int): Maximum number of pages to crawl.
    """
    global pages
    # Stop crawling if limits are exceeded
    if depth > max_depth or len(pages) >= max_pages:
        return
    
    try:
        html = urlopen(f'http://en.wikipedia.org{pageUrl}')
        bs = BeautifulSoup(html, 'html.parser')
        
        # Find all valid links
        for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
            if 'href' in link.attrs:
                newPage = link.attrs['href']
                if newPage not in pages:
                    # Add the new page to the set of visited pages
                    pages.add(newPage)
                    print(newPage)
                    # Recursively crawl the new page
                    getLinks(newPage, depth + 1, max_depth, max_pages)
    except Exception as e:
        print(f"An error occurred: {e}")

# Start crawling from the main page with limits
getLinks('/wiki/Main_Page', max_depth=3, max_pages=50)


/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Special:Search
/wiki/Special:MyContributions
/wiki/Special:MyTalk
/wiki/Portal_talk:Current_events
/wiki/Special:WhatLinksHere/Portal:Current_events
/wiki/Special:RecentChangesLinked/Portal:Current_events
/wiki/Wikipedia:File_Upload_Wizard
/wiki/Special:SpecialPages
/wiki/Portal:Current_events/Edit_instructions
/wiki/Wikipedia:Protection_policy#full
/wiki/File:Darkgreen_flag_waving.svg
/wiki/Special:PrefixIndex/Portal:Current_events/
/wiki/Wikipedia:CAREFUL
/wiki/Wikipedia:ASSISTED
/wiki/Template:Portal_maintenance_status#How_to_update_the_maintenance_information_for_a_portal
/wiki/File:Ambox_globe_Americas.svg
/wiki/Portal:Current_events/Sports
/wiki/Deaths_in_2024
/wiki/Wikipedia:Top_25_Report
/wiki/File:Samantha_Harvey.jpg
/wiki/

## Collecting Data Across an Entire Site

In [163]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    html = urlopen(f'http://en.wikipedia.org{pageUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        #mw-parser-output
        bodyContent = bs.find('div', {'id':'bodyContent'}).find_all('p')
        if len(bodyContent):
            print(bodyContent[0])
        print(bs.find(id='ca-edit').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')

    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('/wiki/Lebanon')

Lebanon
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Main_Page
Main Page
<p><i><b><a href="/wiki/Atrociraptor" title="Atrociraptor">Atrociraptor</a></b></i> is a <a href="/wiki/Genus" title="Genus">genus</a> of <a href="/wiki/Dromaeosauridae" title="Dromaeosauridae">dromaeosaurid</a> dinosaur that lived during the <a href="/wiki/Late_Cretaceous" title="Late Cretaceous">Late Cretaceous</a> in what is now <a href="/wiki/Alberta" title="Alberta">Alberta</a>, Canada. The first specimen was discovered in 1995 by the <a href="/wiki/Fossil_collecting" title="Fossil collecting">fossil collector</a> Wayne Marshall in the <a href="/wiki/Horseshoe_Canyon_Formation" title="Horseshoe Canyon Formation">Horseshoe Canyon Formation</a>. In 2004, this became the <a href="/wiki/Holotype" title="Holotype">holotype</a> of the new genus and species <i>Atrociraptor marshalli</i>; the <a href="/wiki/Genus#Use" title="Genus">generic name</a> is <a href="/

HTTPError: HTTP Error 404: Not Found

## Crawling across the Internet

In [191]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random


#Retrieves a list of all Internal links found on a page
def getInternalLinks(bs, url):
    netloc = urlparse(url).netloc
    scheme = urlparse(url).scheme
    internalLinks = set()
    for link in bs.find_all('a'):
        if not link.attrs.get('href'):
            continue
        parsed = urlparse(link.attrs['href'])
        if parsed.netloc == '':
            internalLinks.add(f'{scheme}://{netloc}/{link.attrs["href"].strip("/")}')
        elif parsed.netloc == netloc:
            internalLinks.add(link.attrs['href'])
    return list(internalLinks)

#Retrieves a list of all external links found on a page
def getExternalLinks(bs, url):
    netloc = urlparse(url).netloc
    externalLinks = set()
    for link in bs.find_all('a'):
        if not link.attrs.get('href'):
            continue
        parsed = urlparse(link.attrs['href'])
        if parsed.netloc != '' and parsed.netloc != netloc:
            externalLinks.add(link.attrs['href'])
    return list(externalLinks)

def getRandomExternalLink(startingPage):
    bs = BeautifulSoup(urlopen(startingPage), 'html.parser')
    externalLinks = getExternalLinks(bs, startingPage)
    if not len(externalLinks):
        print('No external links, looking around the site for one')
        internalLinks = getInternalLinks(bs, startingPage)
        return getRandomExternalLink(random.choice(internalLinks))
    else:
        return random.choice(externalLinks)

def followExternalOnly(startingSite, depth=0, max_depth=3):
    if depth > max_depth:
        return
    externalLink = getRandomExternalLink(startingSite)
    print(f'Random external link is: {externalLink}')
    followExternalOnly(externalLink, depth + 1, max_depth)


followExternalOnly('https://www.oreilly.com/')


Random external link is: https://learning.oreilly.com/search/?query=author%3A%22Neal%20Ford%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false
Random external link is: https://www.oreilly.co.jp/index.shtml
Random external link is: https://makezine.jp/blog/2024/07/mft2024_posterandflyer.html
Random external link is: https://www.oreilly.co.jp/


## Collect all External Links from a Site

In [189]:
# Collects a list of all external URLs found on the site
allExtLinks = []
allIntLinks = []


def getAllExternalLinks(url):
    bs = BeautifulSoup(urlopen(url), 'html.parser')
    internalLinks = getInternalLinks(bs, url)
    externalLinks = getExternalLinks(bs, url)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.append(link)
            print(link)

    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.append(link)
            getAllExternalLinks(link)


allIntLinks.append('https://oreilly.com')
getAllExternalLinks('https://www.oreilly.com/')

https://learning.oreilly.com/search/?query=author%3A%22Sari%20Greene%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false
https://www.oreilly.co.jp/index.shtml
https://www.linkedin.com/company/oreilly-media
https://oreilly.id/
https://learning.oreilly.com/search/?query=author%3A%22Bruno%20Gon%C3%A7alves%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false
https://www.amazon.c

HTTPError: HTTP Error 404: Not Found