# Scraper: Beer Recipes

python web scaper for [Brewer's Friend](https://www.brewersfriend.com/search/)

In [1]:
from bs4 import BeautifulSoup
from requests.structures import CaseInsensitiveDict
from typing import Callable
from xml.dom.minidom import parseString
import aiohttp
import asyncio
import itertools
import nest_asyncio
import pandas as pd
import random
import re
import requests
import time

nest_asyncio.apply()

MAIN_URL = "https://www.brewersfriend.com/search/index.php"


def make_headers() -> dict[str, str]:
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    ]
    headers = CaseInsensitiveDict()
    headers["Content-Type"] = "application/x-www-form-urlencoded"
    headers["User-Agent"] = random.choice(user_agent_list)
    # print("User Agent:", headers["User-Agent"])  # debug
    return headers


def soupify(html: str) -> BeautifulSoup:
    return BeautifulSoup(html, "html.parser")


async def test() -> str:
    page_number = random.choice(range(MAX_PAGE_NUMBER))
    print("getting page number", page_number)
    async with aiohttp.ClientSession() as session:
        html = await get_page_html(page_number, session)
        print("Body:", html[:100], "...")
    return html


async def init_max_page_number() -> int:
    async with aiohttp.ClientSession() as session:
        html = await get_page_html(1, session)
    with open("./data/index.html", "w") as fp:
        fp.write(html)
    soup = soupify(html)
    last_page = soup.select_one("#navrow > td.right > ul > li:nth-child(1) > a")
    last_page = last_page.text.strip()
    last_page = last_page.replace(",", "")
    last_page = last_page.rsplit(" ", 1)[-1]
    last_page = int(last_page)
    return last_page


async def get_page_html(page_number: int, session: aiohttp.ClientSession) -> str:
    assert 1 <= page_number <= MAX_PAGE_NUMBER, "Page number out of bounds!"
    headers = make_headers()
    data = make_data(page_number)
    html = await make_post_request(
        MAIN_URL, headers=headers, data=data, session=session
    )
    return html


def make_data(page_number: int = 1):
    return f"units=metric&page={page_number}"


def parse_links(html: str) -> pd.DataFrame:
    links = []
    soup = soupify(html)
    for link in soup.find_all("a", {"class": "recipetitle", "href": True}):
        links.append(link["href"])
    ids = [re.search(r"view/(\d+)/", link).group(1) for link in links]
    return pd.DataFrame({"links": links, "ids": ids})


def parse_metadata(html: str) -> pd.DataFrame:
    result = []
    for df in pd.read_html(html):
        # ugly af but it works
        if len(df) != 4:
            continue
        data: list[str] = (
            df.loc[1].tolist() + df.loc[2].tolist() + df.iloc[[0, 3], 0].tolist()
        )
        curr = dict()
        for entry in data:
            k, v = entry.split(":", 1)
            curr[k] = v.strip()
        result.append(curr)
    return pd.DataFrame(result)


async def make_get_request(url: str, session: aiohttp.ClientSession) -> str | None:
    async with session.get(url) as response:
        status_code = response.status
        if status_code == 200:
            html = await response.text()
            return html
        else:
            print(f"error: {status_code=} for {url=}")
            return


async def make_post_request(
    url: str,
    headers: dict[str, str],
    data: str,
    session: aiohttp.ClientSession,
) -> str | None:
    async with session.post(url, headers=headers, data=data) as response:
        status_code = response.status
        if status_code == 200:
            html = await response.text()
            return html
        else:
            print(f"error: {status_code=} for {url=}")
            return


async def test() -> str:
    page_number = random.choice(range(MAX_PAGE_NUMBER))
    print("getting page number", page_number)
    async with aiohttp.ClientSession() as session:
        html = await get_page_html(page_number, session)
        print("Body:", html[:100], "...")
    return html


MAX_PAGE_NUMBER = 37  # debug
MAX_PAGE_NUMBER = asyncio.run(init_max_page_number())
print(MAX_PAGE_NUMBER)
soup = asyncio.run(test())

5678
getting page number 57
Body: <thead>
<tr>
            <th class="thtitle" data-sort-option="title"
            data-sort-directio ...


In [2]:
def make_page_groups(
    max_page_number: int = MAX_PAGE_NUMBER, pages_per_group: int = 10
) -> list[list[int]]:
    page_groups = []
    page_numbers = range(1, max_page_number + 1)
    for _, g in itertools.groupby(page_numbers, lambda x: x // pages_per_group):
        page_groups.append(list(g))
    return page_groups


def wait_time_gen() -> int:
    return random.choices(range(1, 10), weights=[1, 2, 3, 4, 5, 4, 3, 2, 1], k=1)[0]


def consume_groups(
    page_groups: list[list[int]], consume_page_fn: Callable, wait_time_gen_fn: Callable
) -> None:
    i = 0
    while page_groups:
        current_pages = page_groups.pop(0)
        for p in current_pages:
            consume_page_fn(p)
        i += 1
        print("group", i, "consumed!")
        sleep_time = wait_time_gen_fn()
        print("sleeping for", sleep_time, "s...")
        time.sleep(sleep_time)


groups = make_page_groups(7, 3)
print("number of page groups:", len(groups))
tic = time.time()
consume_groups(groups, lambda p: print(p, end=" "), lambda: 1)
print("elapsed", round(time.time() - tic, 3), "s")

number of page groups: 3
1 2 group 1 consumed!
sleeping for 1 s...
3 4 5 group 2 consumed!
sleeping for 1 s...
6 7 group 3 consumed!
sleeping for 1 s...
elapsed 3.003 s


In [3]:
links = parse_links(soup)

print("links found:", len(links), ", last 3:")
print(links[-3:])

links found: 20 , last 3:
                                                links     ids
17            /homebrew/recipe/view/242488/tropic-ipa  242488
18    /homebrew/recipe/view/491275/cherry-belgian-ale  491275
19  /homebrew/recipe/view/463686/imperial-ipa-vixn...  463686


In [4]:
parse_metadata(soup)

Unnamed: 0,Boil Size,Boil Time,Boil Gravity,Efficiency,Mash Thickness,Sugar Scale,Brew Method,Pitch Rate,Primary Temp,Priming Method,Priming Amount,Creation Date,Author,Notes
0,11 Litres,20,1.08,35,,Specific Gravity,Extract,0.75,12 ° C,,,1/19/2016 12:22 PM,M&M,DON'T use M84. I had to do unspeakable things ...
1,30 Litres,90,1.049,80,3.0,Specific Gravity,All Grain,0.35,12 ° C,,,3/2/2017 1:40 PM,Gulating Tromsø,Meskes på angitt tid og temperatur. Deretter ø...
2,26.25 Litres,60,1.038,75,3.0,Specific Gravity,All Grain,0.75,20 ° C,co2,0.2 bar,9/7/2017 10:57 AM,Brafina,10 days @20 c 4days @1 c Kegg and lager 1 wee...
3,8 Litres,60,,75,,Specific Gravity,Partial Mash,,,,,3/25/2013 8:47 AM,Ohad,
4,13 Litres,60,1.137,35,,Specific Gravity,Extract,,,,,11/18/2015 4:17 PM,,Inspired by Kraken Meadery's Loud Mouth Braggo...
5,14 Litres,60,1.043,70,,Specific Gravity,BIAB,,,,,11/1/2015 9:43 AM,mercurial,We have used new Crisp grains for this one. Al...
6,24 Litres,90,1.042,70,,Specific Gravity,BIAB,,10 ° C,,,3/25/2014 10:38 AM,Bakke Brygg,
7,28 Litres,90,1.048,72,,Specific Gravity,All Grain,,10 ° C,,,2/5/2014 9:57 AM,Bakke Brygg,
8,11 Litres,60,,75,,Specific Gravity,Partial Mash,,,,,1/17/2012 12:09 PM,twistah,test
9,29 Litres,60,1.064,68,,Specific Gravity,All Grain,,19 ° C,Sukkerlake,5 g sukker/l,6/15/2015 7:01 AM,Bakke Brygg,Kjøl ned vørter til 18 grader før pitching av ...


https://docs.scrapy.org/en/latest/intro/tutorial.html
https://docs.scrapy.org/en/latest/topics/dynamic-content.html
https://reqbin.com/

# TODO
- complete async methods
- use a work queue
  - if the queue grows dynamically it must process max *n* links at the same time
- save results on FS
- fix beer xml functions

In [5]:
def get_beerxml_url(soup) -> str:
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "/beerxml" in href:
            return href


# build BeerXml from id
beerxml_base_url = "https://www.brewersfriend.com/homebrew/recipe/downloadbeerxml/{id}"

print("Beer:")
response = requests.get(beerxml_base_url.format(id=363082), headers=make_headers())
html = response.text
soup = soupify(html)

beerxml_url = get_beerxml_url(soup)
print(soup.title.text, beerxml_url)

response = requests.get(beerxml_url, headers=make_headers())
html = response.text
dom = parseString(html)
pretty_xml_as_string = dom.toprettyxml()

print("XML:")
for line in pretty_xml_as_string.split("\n")[:25]:
    if len(line.strip()):
        print(line)

Beer:
Avg. Perfect Northeast IPA (NEIPA) - Beer Recipe | Brewer's Friend https://www.brewersfriend.com/homebrew/recipe/beerxml1.0/363082
XML:
<?xml version="1.0" ?>
<RECIPES>
	<RECIPE>
		<NAME>Avg. Perfect Northeast IPA (NEIPA)</NAME>
		<VERSION>1</VERSION>
		<TYPE>All Grain</TYPE>
		<BREWER>Kevin Quinn - (Beer Advocate Crowd-Sourced Recipe)</BREWER>
		<DISPLAY_BATCH_SIZE>5.75 gal</DISPLAY_BATCH_SIZE>
		<DISPLAY_BOIL_SIZE>7.5 gal</DISPLAY_BOIL_SIZE>
