# Scraper: Beer Recipes

python web scaper for [Brewer's Friend](https://www.brewersfriend.com/search/)

In [2]:
from bs4 import BeautifulSoup
from requests.structures import CaseInsensitiveDict
import aiohttp
import asyncio
import csv
import json
import nest_asyncio
import pandas as pd
import re
import requests
import time


nest_asyncio.apply()

In [None]:
MAX_PAGE_NUMBER = 1
url = "https://www.brewersfriend.com/search/index.php"


def soupy(response_text: str) -> BeautifulSoup:
    return BeautifulSoup(response_text, "html.parser")


def get_last_page_number(response_text: str) -> int:
    soup = soupy(response_text)
    last_page = soup.select_one("#navrow > td.right > ul > li:nth-child(1) > a")
    last_page = last_page.text.strip()
    last_page = last_page.replace(",", "")
    last_page = last_page.rsplit(" ", 1)[-1]
    last_page = int(last_page)
    return last_page


def get_html_response(page_number: int = 1) -> requests.Response:
    assert 1 <= page_number <= MAX_PAGE_NUMBER, "Page number out of bounds!"
    headers = CaseInsensitiveDict()
    headers["Content-Type"] = "application/x-www-form-urlencoded"
    headers[
        "User-Agent"
    ] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
    data = f"units=metric&page={page_number}"
    response = requests.post(url, headers=headers, data=data)
    assert response.status_code == 200, "Response not OK!"
    return response


def parse_links(response_text: str) -> pd.DataFrame:
    links = []
    soup = soupy(response_text)
    for link in soup.find_all("a", {"class": "recipetitle", "href": True}):
        links.append(link["href"])
    ids = [re.search(r"view/(\d+)/", link).group(1) for link in links]
    return pd.DataFrame({"links": links, "ids": ids})


def parse_metadata(response_text: str) -> pd.DataFrame:
    result = []
    for df in pd.read_html(response_text):
        # ugly af but it works
        if len(df) != 4:
            continue
        data = df.loc[1].tolist() + df.loc[2].tolist() + df.iloc[[0, 3], 0].tolist()
        curr = dict()
        for entry in data:
            k, v = entry.split(":", 1)
            curr[k] = v.strip()
        result.append(curr)
    return pd.DataFrame(result)


response = get_html_response()
MAX_PAGE_NUMBER = get_last_page_number(response.text)
MAX_PAGE_NUMBER

5668

# TODO
- chiamate parallele asincrone

In [None]:
response = get_html_response(100)

In [None]:
links = parse_links(response.text)

print("links found:", len(links), ", last 3:")
print(links[-3:])

links found: 20 , last 3:
                                                links     ids
17  /homebrew/recipe/view/256343/black-apricot-fud...  256343
18  /homebrew/recipe/view/103985/imperial-coffee-s...  103985
19       /homebrew/recipe/view/491642/blueberry-wheat  491642


In [None]:
parse_metadata(response.text)

Unnamed: 0,Boil Size,Boil Time,Boil Gravity,Efficiency,Mash Thickness,Sugar Scale,Brew Method,Pitch Rate,Primary Temp,Priming Method,Priming Amount,Creation Date,Author,Notes
0,40 Litres,75,1.035,50.0,,Specific Gravity,BIAB,,22 ° C,,,5/17/2018 11:49 AM,Nick Mc,"Setup done in garage, under cover, no wind, am..."
1,27 Litres,60,1.031,72.0,4.0,Specific Gravity,All Grain,0.35,19 ° C,,,11/10/2018 10:44 AM,BAAS,
2,31.5 Litres,60,1.048,70.0,3.0,Specific Gravity,All Grain,,22 ° C,,,3/30/2016 9:41 AM,Rompetom,
3,25 Litres,60,1.049,72.0,2.5,Specific Gravity,All Grain,,21 ° C,,,12/10/2013 1:31 AM,walrusdunne,Won IPA category 3rd place in UK Homebrew Comp...
4,1125 Litres,80,1.044,80.0,3.0,Specific Gravity,All Grain,1.75,11 ° C,,,8/20/2015 8:21 PM,4 Mile Brews,
5,29 Litres,70,1.048,74.0,3.0,Specific Gravity,All Grain,0.75,20 ° C,,,7/22/2019 11:03 AM,pughj,
6,83 Litres,60,1.04,75.0,4.35,Specific Gravity,All Grain,,10 ° C,,,9/7/2016 1:55 PM,Strasak,Final Gravity: 1016 SRM (real): 22 storage: 1-...
7,28 Litres,60,1.058,70.0,,Specific Gravity,All Grain,,18 ° C,,,10/7/2017 12:58 PM,Bakke Brygg,
8,28.5 Litres,60,1.035,70.0,2.5,Specific Gravity,All Grain,0.75,18 ° C,,,1/22/2018 5:09 AM,,http://beersmithrecipes.com/viewrecipe/1328260...
9,25 Litres,60,1.05,75.0,,Specific Gravity,BIAB,0.75,18 ° C,co2,,12/31/2015 2:34 AM,AnteK,"BIAB method for Braumeister 20l, efficiency 75..."


https://docs.scrapy.org/en/latest/intro/tutorial.html
https://docs.scrapy.org/en/latest/topics/dynamic-content.html
https://reqbin.com/

**manca aggiustare le funzioni per scaricare il beerxml**

In [None]:
def get_beerxml_url(html) -> str:    
    for a in html.find_all("a", href=True):
        href = a["href"]
        if "/beerxml" in href:
            return href


# build BeerXml from id
url = "https://www.brewersfriend.com/homebrew/recipe/downloadbeerxml/{id}".format(id=ids[0])
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = soupy(response.text)

print(soup.title.text, get_beerxml_url(soup))

Avg. Perfect Northeast IPA (NEIPA) - Beer Recipe | Brewer's Friend https://www.brewersfriend.com/homebrew/recipe/beerxml1.0/363082


In [None]:
url = "https://www.brewersfriend.com/homebrew/recipe/beerxml1.0/363082"
response = requests.get(url, headers=headers)

import xml.dom.minidom

dom = xml.dom.minidom.parseString(response.text)
pretty_xml_as_string = dom.toprettyxml()

print(pretty_xml_as_string.split("\n")[:10])

['<?xml version="1.0" ?>', '<RECIPES>', '\t', ' ', '\t<RECIPE>', '\t\t', '  ', '\t\t<NAME>Avg. Perfect Northeast IPA (NEIPA)</NAME>', '\t\t', '  ']
