# Web Scraping

First, check the site's `robots.txt` file to see whether you're allowed to scrape it, though these rules are hard to enforce.

imports

In [120]:
import pprint

import requests
from bs4 import BeautifulSoup

Use `requests` to get the raw html.

In [121]:
website_url = 'https://news.ycombinator.com/news'
# Without a timeout, requests can wait forever on an unresponsive server.
# raise_for_status() turns HTTP error responses (4xx/5xx) into exceptions
# instead of letting an error page be treated as the real content.
response = requests.get(website_url, timeout=10)
response.raise_for_status()
raw_html = response.text
raw_html

'<html lang="en" op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?PnDm2YIo4rt9O6DeyUbm">\n        <link rel="icon" href="y18.svg">\n                  <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">\n        <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">\n        <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.svg" width="18" height="18" style="border:1px white solid; display:block"></a></td>\n                  <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>\n                            <a href="newest">new</a> | <a hr

Use `BeautifulSoup` to parse the raw html into something readable.

In [122]:
# parse the raw HTML string into a navigable tree using the 'html.parser' backend
soup = BeautifulSoup(raw_html, 'html.parser')

In [123]:
# the <body> element: the tag itself plus everything nested inside it
soup.body

<body><center><table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
<tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a></td>
<td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">jobs</a> | <a href="submit" rel="nofollow">submit</a> </span></td><td style="text-align:right;padding-right:4px;"><span class="pagetop">
<a href="login?goto=news">login</a>
</span></td>
</tr></table></td></tr>
<tr id="pagespace" style="height:10px" title=""></tr><tr><td><table border="0" cellpadding="0" cellspacing="0">
<tr c

In [124]:
# list of the parsed document's top-level children (elements, not text lines)
soup.contents

[<html lang="en" op="news"><head><meta content="origin" name="referrer"/><meta content="width=device-width, initial-scale=1.0" name="viewport"/><link href="news.css?PnDm2YIo4rt9O6DeyUbm" rel="stylesheet" type="text/css"/>
 <link href="y18.svg" rel="icon"/>
 <link href="rss" rel="alternate" title="RSS" type="application/rss+xml"/>
 <title>Hacker News</title></head><body><center><table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
 <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a></td>
 <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
 <a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> |

In [125]:
# every <a> (anchor) tag in the document, returned as a list
soup.find_all('a')

[<a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a>,
 <a href="news">Hacker News</a>,
 <a href="newest">new</a>,
 <a href="front">past</a>,
 <a href="newcomments">comments</a>,
 <a href="ask">ask</a>,
 <a href="show">show</a>,
 <a href="jobs">jobs</a>,
 <a href="submit" rel="nofollow">submit</a>,
 <a href="login?goto=news">login</a>,
 <a href="vote?id=42919502&amp;how=up&amp;goto=news" id="up_42919502"><div class="votearrow" title="upvote"></div></a>,
 <a href="item?id=42919502">Ask HN: Who is hiring? (February 2025)</a>,
 <a class="hnuser" href="user?id=whoishiring">whoishiring</a>,
 <a href="item?id=42919502">3 hours ago</a>,
 <a href="hide?id=42919502&amp;goto=news">hide</a>,
 <a href="item?id=42919502">167 comments</a>,
 <a href="vote?id=42919909&amp;how=up&amp;goto=news" id="up_42919909"><div class="votearrow" title="upvote"></div></a>,
 <a href="https://github.com/monasticacademy/httptap">Httptap: V

In [126]:
# only the first <a> tag in the document (find_all returns every match)
soup.find('a')

<a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a>

### Getting Our Data
Our goal is to get all of the articles on the page with a score of at least 100.

`select` allows us to query the soup with CSS selectors.

In [127]:
# '.score' is a CSS class selector: it matches every element with class="score"
soup.select('.score')

[<span class="score" id="score_42919502">197 points</span>,
 <span class="score" id="score_42919909">151 points</span>,
 <span class="score" id="score_42920962">39 points</span>,
 <span class="score" id="score_42918524">182 points</span>,
 <span class="score" id="score_42919597">87 points</span>,
 <span class="score" id="score_42920285">72 points</span>,
 <span class="score" id="score_42918902">191 points</span>,
 <span class="score" id="score_42921659">9 points</span>,
 <span class="score" id="score_42919500">31 points</span>,
 <span class="score" id="score_42918644">77 points</span>,
 <span class="score" id="score_42920119">34 points</span>,
 <span class="score" id="score_42918846">90 points</span>,
 <span class="score" id="score_42920921">28 points</span>,
 <span class="score" id="score_42920657">26 points</span>,
 <span class="score" id="score_42919221">80 points</span>,
 <span class="score" id="score_42918237">74 points</span>,
 <span class="score" id="score_42894939">11 points</s

In [128]:
# get all title elements (class="titleline")
soup.select('.titleline')

[<span class="titleline"><a href="item?id=42919502">Ask HN: Who is hiring? (February 2025)</a></span>,
 <span class="titleline"><a href="https://github.com/monasticacademy/httptap">Httptap: View HTTP/HTTPS requests made by any Linux program</a><span class="sitebit comhead"> (<a href="from?site=github.com/monasticacademy"><span class="sitestr">github.com/monasticacademy</span></a>)</span></span>,
 <span class="titleline"><a href="https://shalzuth.com/Blog/IFoundAGameExploit">Remote Code Execution in Marvel Rivals Game</a><span class="sitebit comhead"> (<a href="from?site=shalzuth.com"><span class="sitestr">shalzuth.com</span></a>)</span></span>,
 <span class="titleline"><a href="https://irreducible.io/blog/my-wasm-interpreter/">I Wrote a WebAssembly VM in C</a><span class="sitebit comhead"> (<a href="from?site=irreducible.io"><span class="sitestr">irreducible.io</span></a>)</span></span>,
 <span class="titleline"><a href="https://arxiv.org/abs/2501.19201">Efficient Reasoning with Hidden

In [129]:
# select returns a list, so index 0 is just the first title element
soup.select('.titleline')[0]

<span class="titleline"><a href="item?id=42919502">Ask HN: Who is hiring? (February 2025)</a></span>

### Making our lists

Let's do everything from scratch again.

In [130]:
website_url = 'https://news.ycombinator.com/news'
# timeout: never hang on an unresponsive server;
# raise_for_status: fail loudly instead of parsing an HTTP error page
response = requests.get(website_url, timeout=10)
response.raise_for_status()
raw_html = response.text
soup = BeautifulSoup(raw_html, 'html.parser')
links = soup.select('.titleline')  # one element per story title
subtext = soup.select('.subtext')  # the line under each title, where the score lives

In [147]:
# peek at the second title element to see its structure
links[1]

<span class="titleline"><a href="https://github.com/monasticacademy/httptap">Httptap: View HTTP/HTTPS requests made by any Linux program</a><span class="sitebit comhead"> (<a href="from?site=github.com/monasticacademy"><span class="sitestr">github.com/monasticacademy</span></a>)</span></span>

In [148]:
# text of the element and all its descendants; note it includes the trailing "(site)" part
links[1].getText()

'Httptap: View HTTP/HTTPS requests made by any Linux program (github.com/monasticacademy)'

In [132]:
# href of the first <a> inside the title element (None if it has no href attribute)
links[1].a.get('href', None)

'https://github.com/monasticacademy/httptap'

In [145]:
# The score text is "<n> points" (or the singular "1 point"), so take the
# leading number instead of stripping a fixed ' points' suffix.
int(subtext[1].select_one('.score').getText().split()[0])

151

In [165]:
def create_custom_hn(links, subtext, min_points=100):
    """Return the stories with at least ``min_points`` points, highest first.

    Args:
        links: Sequence of title elements (e.g. ``soup.select('.titleline')``).
            Each must provide ``getText()`` and an ``a`` attribute holding its
            first anchor (or None).
        subtext: Sequence of the matching subtext elements (e.g.
            ``soup.select('.subtext')``), in the same order as ``links``.
        min_points: Minimum score (inclusive) a story needs to be kept.
            Defaults to 100.

    Returns:
        A list of ``{'title', 'link', 'points'}`` dicts sorted by points in
        descending order. Entries without a score element are skipped.
    """
    hn = []

    # zip pairs each title with its subtext and stops at the shorter list,
    # so a length mismatch cannot raise IndexError
    for link, sub in zip(links, subtext):
        vote = sub.select_one('.score')
        if vote is None:  # this entry has no score element
            continue

        # the score text is "<n> points" or the singular "1 point";
        # taking the first token handles both forms
        score_tokens = vote.getText().split()
        if not score_tokens:
            continue
        points = int(score_tokens[0])
        if points < min_points:
            continue

        anchor = link.a  # None when the title element has no <a> child
        href = anchor.get('href', None) if anchor is not None else None
        hn.append({'title': link.getText(), 'link': href, 'points': points})

    # highest score first
    return sorted(hn, key=lambda story: story['points'], reverse=True)

In [171]:
# pprint wraps long entries across lines and sorts dict keys, which is easier to scan
pprint.pprint(create_custom_hn(links, subtext))

[{'link': 'https://timsh.org/tracking-myself-down-through-in-app-ads/',
  'points': 1745,
  'title': 'Everyone knows your location: tracking myself down through in-app '
           'ads (timsh.org)'},
 {'link': 'https://openai.com/index/openai-o3-mini/',
  'points': 940,
  'title': 'OpenAI O3-Mini (openai.com)'},
 {'link': 'https://www.404media.co/archivists-work-to-identify-and-save-the-thousands-of-datasets-disappearing-from-data-gov/',
  'points': 791,
  'title': 'Archivists work to save disappearing data.gov datasets '
           '(404media.co)'},
 {'link': 'https://substack.com/home/post/p-156004330',
  'points': 740,
  'title': 'Bypass DeepSeek censorship by speaking in hex (substack.com)'},
 {'link': 'https://www.theatlantic.com/health/archive/2025/01/cdc-dei-scientific-data/681531/',
  'points': 731,
  'title': 'CDC data are disappearing (theatlantic.com)'},
 {'link': 'https://lumon-industries.com/',
  'points': 690,
  'title': 'Macrodata Refinement (lumon-industries.com)'},
 {

## Retrieving Multiple Pages
Noting that the website's page URLs share the stem `'https://news.ycombinator.com/news?p='`, we can use a for loop to append the page number and retrieve that data.

In [172]:
url_stem = 'https://news.ycombinator.com/news?p='
num_pages = 5

# initialize the lists
links = []
subtext = []

# go through all pages (numbered from 1), accumulating titles and subtext rows
for page in range(1, num_pages + 1):
    url = f'{url_stem}{page}'
    # timeout: never hang on an unresponsive server;
    # raise_for_status: fail loudly instead of parsing an HTTP error page
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    raw_html = response.text
    soup = BeautifulSoup(raw_html, 'html.parser')
    links.extend(soup.select('.titleline'))
    subtext.extend(soup.select('.subtext'))


# NOTE(review): this name is also defined in an earlier cell; this later
# definition shadows it. Consider keeping a single definition.
def create_custom_hn(links, subtext, min_points=100):
    """Return the stories with at least ``min_points`` points, highest first.

    Args:
        links: Sequence of title elements (e.g. ``soup.select('.titleline')``).
            Each must provide ``getText()`` and an ``a`` attribute holding its
            first anchor (or None).
        subtext: Sequence of the matching subtext elements (e.g.
            ``soup.select('.subtext')``), in the same order as ``links``.
        min_points: Minimum score (inclusive) a story needs to be kept.
            Defaults to 100.

    Returns:
        A list of ``{'title', 'link', 'points'}`` dicts sorted by points in
        descending order. Entries without a score element are skipped.
    """
    hn = []

    # zip pairs each title with its subtext and stops at the shorter list,
    # so a length mismatch cannot raise IndexError
    for link, sub in zip(links, subtext):
        vote = sub.select_one('.score')
        if vote is None:  # this entry has no score element
            continue

        # the score text is "<n> points" or the singular "1 point";
        # taking the first token handles both forms
        score_tokens = vote.getText().split()
        if not score_tokens:
            continue
        points = int(score_tokens[0])
        if points < min_points:
            continue

        anchor = link.a  # None when the title element has no <a> child
        href = anchor.get('href', None) if anchor is not None else None
        hn.append({'title': link.getText(), 'link': href, 'points': points})

    # highest score first
    return sorted(hn, key=lambda story: story['points'], reverse=True)

# filter and sort the stories gathered from all fetched pages
create_custom_hn(links, subtext)

[{'title': 'Everyone knows your location: tracking myself down through in-app ads (timsh.org)',
  'link': 'https://timsh.org/tracking-myself-down-through-in-app-ads/',
  'points': 1745},
 {'title': 'OpenAI O3-Mini (openai.com)',
  'link': 'https://openai.com/index/openai-o3-mini/',
  'points': 940},
 {'title': 'Archivists work to save disappearing data.gov datasets (404media.co)',
  'link': 'https://www.404media.co/archivists-work-to-identify-and-save-the-thousands-of-datasets-disappearing-from-data-gov/',
  'points': 791},
 {'title': 'Bypass DeepSeek censorship by speaking in hex (substack.com)',
  'link': 'https://substack.com/home/post/p-156004330',
  'points': 740},
 {'title': 'CDC data are disappearing (theatlantic.com)',
  'link': 'https://www.theatlantic.com/health/archive/2025/01/cdc-dei-scientific-data/681531/',
  'points': 731},
 {'title': 'Macrodata Refinement (lumon-industries.com)',
  'link': 'https://lumon-industries.com/',
  'points': 690},
 {'title': "String of recent k

# Scrapy
Remember, you can use `scrapy` to scrape massive websites, as it has more features, such as web crawling.