# Scraper: Beer Recipes

Python web scraper for [Brewer's Friend](https://www.brewersfriend.com/search/)

In [1]:
import asyncio
import random
import re
import time
import xml.dom.minidom
from io import StringIO

import aiohttp
import nest_asyncio
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.structures import CaseInsensitiveDict


nest_asyncio.apply()

In [2]:
# Recipe-search endpoint; the listing requests in this notebook POST to it.
MAIN_URL = "https://www.brewersfriend.com/search/index.php"


def make_headers() -> dict[str, str]:
    """Build headers for a form-encoded request with a randomly picked User-Agent.

    Returns:
        A case-insensitive mapping with ``Content-Type`` and ``User-Agent`` set.
    """
    browser_signatures = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    )
    return CaseInsensitiveDict(
        {
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": random.choice(browser_signatures),
        }
    )


def soupify(html: str) -> BeautifulSoup:
    """Parse an HTML string into a BeautifulSoup tree with the ``html.parser`` backend."""
    parser_name = "html.parser"
    return BeautifulSoup(html, features=parser_name)


def init_max_page_number() -> int:
    """Request the first search page and read the last page number from its pager.

    Returns:
        The number parsed from the end of the pager link's text.

    Raises:
        AssertionError: If the server does not answer with HTTP 200.
        ValueError: If the pager link is missing or its text does not end in
            a number.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(MAIN_URL, headers=make_headers(), timeout=30)
    assert response.status_code == 200, f"status code: {response.status_code}"
    soup = soupify(response.text)
    last_page = soup.select_one("#navrow > td.right > ul > li:nth-child(1) > a")
    if last_page is None:
        raise ValueError("pagination link not found; the page layout may have changed")
    # Drop thousands separators, then keep the last space-separated token
    # (presumably the link reads like "... 1,234" -- confirm against the site).
    label = last_page.text.strip().replace(",", "")
    return int(label.rsplit(" ", 1)[-1])


async def get_page_html(
    page_number: int, session: aiohttp.ClientSession
) -> str | None:
    """POST the search form for one result page and return its HTML.

    Args:
        page_number: 1-based page index, at most ``MAX_PAGE_NUMBER``.
        session: aiohttp session used to send the request.

    Returns:
        The response body, or ``None`` when ``make_post_request`` gets a
        non-200 status.

    Raises:
        AssertionError: If ``page_number`` is outside ``1..MAX_PAGE_NUMBER``.
    """
    assert 1 <= page_number <= MAX_PAGE_NUMBER, "Page number out of bounds!"
    return await make_post_request(
        MAIN_URL,
        headers=make_headers(),
        data=f"units=metric&page={page_number}",
        session=session,
    )


def parse_links(html: str) -> pd.DataFrame:
    """Collect the ``recipetitle`` anchors of a page together with their numeric ids.

    Args:
        html: Raw HTML of a search-result page.

    Returns:
        A frame with a ``links`` column (the hrefs) and an ``ids`` column (the
        digits following ``view/`` in each href, as strings).
    """
    id_pattern = re.compile(r"view/(\d+)/")
    anchors = soupify(html).find_all("a", {"class": "recipetitle", "href": True})
    hrefs = [anchor["href"] for anchor in anchors]
    recipe_ids = [id_pattern.search(href).group(1) for href in hrefs]
    return pd.DataFrame({"links": hrefs, "ids": recipe_ids})


def parse_metadata(html: str) -> pd.DataFrame:
    """Extract the "Key: value" metadata tables of a page into one DataFrame.

    Only tables with exactly four rows are considered. From each of them the
    cells of rows 1 and 2 plus the first cell of rows 0 and 3 are read, and
    every cell is split at its first colon into a column name and a value.

    Args:
        html: Raw HTML containing the tables.

    Returns:
        One row per accepted table and one column per key encountered.

    Raises:
        ValueError: Propagated from ``pd.read_html`` when it finds no table.
    """
    records = []
    # pandas deprecated passing literal HTML to read_html; hand it a buffer.
    for table in pd.read_html(StringIO(html)):
        if len(table) != 4:
            continue
        cells = (
            table.loc[1].tolist()
            + table.loc[2].tolist()
            + table.iloc[[0, 3], 0].tolist()
        )
        record = {}
        for cell in cells:
            # Empty cells arrive as NaN and stray cells may lack a colon;
            # neither can be split into a key/value pair, so skip them.
            if not isinstance(cell, str) or ":" not in cell:
                continue
            key, value = cell.split(":", 1)
            record[key] = value.strip()
        if record:
            records.append(record)
    return pd.DataFrame(records)


async def make_get_request(url: str, session: aiohttp.ClientSession) -> str | None:
    """GET ``url`` through ``session`` and return the response text.

    Any status other than 200 is reported on stdout and yields ``None``.
    """
    async with session.get(url) as response:
        # The local must stay named ``status_code``: the ``=`` specifier below
        # prints the variable name as part of the message.
        status_code = response.status
        if status_code != 200:
            print(f"error: {status_code=} for {url=}")
            return None
        return await response.text()


async def make_post_request(
    url: str,
    headers: dict[str, str],
    data: str,
    session: aiohttp.ClientSession,
) -> str | None:
    """POST ``data`` to ``url`` with ``headers`` and return the response text.

    Any status other than 200 is reported on stdout and yields ``None``.
    """
    async with session.post(url, headers=headers, data=data) as response:
        # The local must stay named ``status_code``: the ``=`` specifier below
        # prints the variable name as part of the message.
        status_code = response.status
        if status_code != 200:
            print(f"error: {status_code=} for {url=}")
            return None
        return await response.text()


async def test() -> str:
    """Fetch one randomly chosen result page and print the start of its body.

    Returns:
        The HTML of the fetched page.

    Raises:
        RuntimeError: If the request produced no body (non-200 response).
    """
    # Pages are 1-based and get_page_html rejects 0. randint includes both
    # ends, so page 0 is never drawn and the last page can be.
    page_number = random.randint(1, MAX_PAGE_NUMBER)
    print("getting page number", page_number)
    async with aiohttp.ClientSession() as session:
        html = await get_page_html(page_number, session)
    if html is None:
        raise RuntimeError(f"no HTML returned for page {page_number}")
    print("Body:", html[:100], "...")
    return html


# Hard-coded page count for debugging. To use the real value, uncomment the
# next line and drop the override below (init_max_page_number sends a request).
# MAX_PAGE_NUMBER = init_max_page_number()
# MAX_PAGE_NUMBER
MAX_PAGE_NUMBER = 37  # debug
# Smoke run: fetch one random page; `html` is reused by the parsing cells below.
html = asyncio.run(test())

getting page number 5
Body: <thead>
<tr>
            <th class="thtitle" data-sort-option="title"
            data-sort-directio ...


In [3]:
async def fake(x: str, q: asyncio.Queue):
    """Stand-in for a real request: wait one second, then print ``x``.

    With probability 0.1 the placeholder item ``"page=X"`` is pushed onto
    ``q`` before printing, imitating a page that yields follow-up work.
    """
    await asyncio.sleep(1)
    spawn_extra = random.random() < 0.1
    if spawn_extra:
        q.put_nowait("page=X")
    print(x, end=" ")


async def test(q: asyncio.Queue, batch_size: int = 10):
    """Drain ``q`` by running ``fake`` on its items in concurrent batches.

    Items that ``fake`` pushes back while a batch is running are picked up by
    a later batch, so the coroutine returns only once the queue is empty and
    never runs more than ``batch_size`` items at the same time.

    NOTE(review): this redefines the ``test`` coroutine from the earlier cell;
    consider giving the two distinct names.

    Args:
        q: Queue of work items; it may grow while being processed.
        batch_size: Upper bound on the number of items processed concurrently.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    while not q.empty():
        # Take at most batch_size items, and never more than are queued.
        batch = [q.get_nowait() for _ in range(min(batch_size, q.qsize()))]
        await asyncio.gather(*(fake(item, q) for item in batch))
        print()


# Seed the work queue with one "page=N" item per page, numbered from 1.
queue = asyncio.Queue()
for i in range(MAX_PAGE_NUMBER):
    queue.put_nowait(f"page={i+1}")

# Time the whole run; the rounded elapsed seconds are the cell's displayed value.
tic = time.time()
asyncio.run(test(q=queue))
round(time.time() - tic, 3)

page=1 page=2 page=3 page=4 page=5 page=6 page=7 page=8 page=9 page=10 
page=11 page=12 page=13 page=14 page=15 page=16 page=17 page=18 page=19 page=20 
page=21 page=22 page=23 page=24 page=25 page=26 page=27 page=28 page=29 page=30 
page=31 page=32 page=33 page=34 page=35 page=36 page=37 page=X page=X page=X 


4.015

In [4]:
# Parse the page fetched earlier into a frame of recipe links and ids.
links = parse_links(html)

print("links found:", len(links), ", last 3:")
print(links[-3:])

links found: 20 , last 3:
                                                links     ids
17  /homebrew/recipe/view/476339/einstok-olgerd-wh...  476339
18  /homebrew/recipe/view/415033/shepherd-neame-sp...  415033
19  /homebrew/recipe/view/90044/bakke-brygg-americ...   90044


In [5]:
# Same page, this time extracting its metadata tables.
parse_metadata(html)

Unnamed: 0,Boil Size,Boil Time,Boil Gravity,Efficiency,Mash Thickness,Sugar Scale,Brew Method,Pitch Rate,Primary Temp,Priming Method,Priming Amount,Creation Date,Author,Notes
0,6 Litres,90,1.044,77,,Specific Gravity,All Grain,0.75,18 ° C,Sukkerlake,"6,5 g sukker/L",10/12/2013 12:17 PM,Bakke Brygg,Mengden meske- og skyllevann du bør bruke komm...
1,700 Litres,240,1.122,40,,Specific Gravity,All Grain,0.35,16 ° C,,,7/5/2017 10:06 AM,thehaze,50
2,29 Litres,90,1.046,77,,Specific Gravity,All Grain,0.75,18 ° C,Sukkerlake,"6,5 g sukker/L",2/8/2014 1:10 PM,Bakke Brygg,Mengden meske- og skyllevann du bør bruke komm...
3,28.5 Litres,60,1.043,65,3.0,Specific Gravity,All Grain,,20 ° C,,,9/25/2017 2:36 AM,Aussie Brewer Blog,
4,28.5 Litres,75,1.038,60,2.75,Specific Gravity,All Grain,0.75,17 ° C,,,3/21/2016 5:17 PM,Rodrigo Wantuk,"Manter a cerveja o mais SECA possivel, fazer c..."
5,28 Litres,60,1.059,70,,Specific Gravity,All Grain,,18 ° C,CO2,,9/17/2015 7:21 AM,tnesser,
6,54 Litres,60,1.043,68,2.5,Specific Gravity,All Grain,,12 ° C,,,2/24/2017 1:30 PM,4runner,Start gjæringen på 9-10 grader og la heve til ...
7,700 Litres,75,1.044,72,,Specific Gravity,All Grain,0.35,20 ° C,,,7/5/2017 9:58 AM,thehaze,
8,700 Litres,180,1.098,55,,Specific Gravity,All Grain,0.35,18 ° C,,,7/5/2017 9:43 AM,thehaze,
9,28 Litres,90,1.037,75,6.097,Specific Gravity,All Grain,0.35,20 ° C,,,5/27/2013 9:28 PM,Toombstone,Brewed using the 20L Braumeister.


https://docs.scrapy.org/en/latest/intro/tutorial.html
https://docs.scrapy.org/en/latest/topics/dynamic-content.html
https://reqbin.com/

# TODO
- complete async methods
- use a work queue
  - if the queue grows dynamically it must process max *n* links at the same time
- save results to the file system
- fix beer xml functions

In [None]:
def get_beerxml_url(html) -> str:
    for a in html.find_all("a", href=True):
        href = a["href"]
        if "/beerxml" in href:
            return href


# Build the BeerXML download URL from a recipe id.
# The id is taken from the `links` frame produced by parse_links so that this
# cell also runs on a fresh kernel.
recipe_id = links["ids"].iloc[0]
url = f"https://www.brewersfriend.com/homebrew/recipe/downloadbeerxml/{recipe_id}"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
# Timeout so a stalled connection cannot hang the cell.
response = requests.get(url, headers=headers, timeout=30)
soup = soupify(response.text)

print(soup.title.text, get_beerxml_url(soup))

Avg. Perfect Northeast IPA (NEIPA) - Beer Recipe | Brewer's Friend https://www.brewersfriend.com/homebrew/recipe/beerxml1.0/363082


In [None]:
url = "https://www.brewersfriend.com/homebrew/recipe/beerxml1.0/363082"
# `headers` is the dict defined in the previous cell.
response = requests.get(url, headers=headers, timeout=30)
# Fail here with the HTTP status rather than later with an XML parse error.
response.raise_for_status()

dom = xml.dom.minidom.parseString(response.text)
pretty_xml_as_string = dom.toprettyxml()

print(pretty_xml_as_string.split("\n")[:10])

['<?xml version="1.0" ?>', '<RECIPES>', '\t', ' ', '\t<RECIPE>', '\t\t', '  ', '\t\t<NAME>Avg. Perfect Northeast IPA (NEIPA)</NAME>', '\t\t', '  ']
