In [1]:
# The purpose of the following project is to get some insight on beer prices. 
# This notebook contains a parser whose main objecetive is to parse the data from "liquorama", an online brewery
# store from California. This scripot will extract information about their beer listings and obtain data regarding characteristics
# such as price, brand, reviews ETCETERA. It will store the resulting objects into a JSON dictionary that will be transformed
# in a pandas dataframe in 


#First it will import the required libraries in order to scrape the data succesfully

In [262]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import urlopen, Request

In [4]:
print(requests.get("https://www.liquorama.net/robots.txt").text)

User-agent: *
Disallow: /search.php
User-agent: *
Disallow: /*?_bc_fsnf=1*
Disallow: /*&_bc_fsnf=1*



In [None]:
#The connection to the website was succesful and allows for scraping

In [None]:
################################################

# Disallow: /?_bc_fsnf=1 — This blocks bots from following faceted search links and causing performance issues.

# Disallow: /&_bc_fsnf=1 — This blocks bots from following faceted search links and causing performance issues.

# Disallow: /search.php — This page handles searches from the search box on a store. Google has previously stated 
#                        that search results pages are not something they want in their index because it creates a poor 
#                        user experience (going from a search results page to another search results page instead of going 
#                        directly to the result).
#
#
# Source: Big Commerce (Help Center), "Understanding the Robots.txt file", 
# https://support.bigcommerce.com/s/article/Understanding-the-Robots-txt-File
################################################

In [None]:
# The purpose of this scraper is to get the general information of Liquorama's beer listing. The online store has a 21 page listing 
# of beers for different size, price, country of origin, reviews and type. 

# Firstly, we need the complete set of links in an object for further parsing.

In [302]:
only_divs = SoupStrainer("h4", attrs={'class':'card-title'})
    
def GetSoup(link):
        r = requests.get(link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text,'lxml', parse_only = only_divs)
    

In [303]:
def GetLinks(link):
    soup = GetSoup(link)
    for link in soup.find_all('a'):
        print(link.get('href'))

In [304]:
enlaces = []
for i in range(1,22):
    enlaces.append("https://www.liquorama.net/spirits/?sort=alphaasc&page=" + str(i))

In [305]:
def get_all_the_links(enlaces):
    all_links = []
    for enlace in enlaces:
        all_links.append(GetLinks(enlace))

In [306]:
get_all_the_links(enlaces)

https://www.liquorama.net/ri-1-straight-rye-whiskey-750ml.html
https://www.liquorama.net/1792-full-proof-single-barrel-select-no-2573-kentucky-straight-bourbon-whiskey-750ml.html
https://www.liquorama.net/1792-single-barrel-kentucky-straight-bourbon-whiskey-750ml.html
https://www.liquorama.net/1792-small-batch-kentucky-straight-bourbon-whiskey-750ml.html
https://www.liquorama.net/1800-anejo-tequila-750ml.html
https://www.liquorama.net/1800-anejo-tequila-750ml-etch.html
https://www.liquorama.net/1800-reposado-tequila-750ml-etch.html
https://www.liquorama.net/1800-reposado-tequila-750ml.html
https://www.liquorama.net/1800-silver-tequila-750ml.html
https://www.liquorama.net/1800-silver-tequila-750ml-etch.html
https://www.liquorama.net/2-gingers-blended-irish-whiskey-750ml.html
https://www.liquorama.net/360-vodka-double-chocolate-vodka-750ml.html
https://www.liquorama.net/360-vodka-georgia-peach-vodka-750ml.html
https://www.liquorama.net/44-north-mountain-huckleberry-flavored-vodka-750ml-e

https://www.liquorama.net/american-born-apple-pie-moonshine-750ml.html
https://www.liquorama.net/american-born-apple-whiskey-750ml.html
https://www.liquorama.net/american-born-dixie-moonshine-750ml.html
https://www.liquorama.net/american-born-peach-whiskey-750ml.html
https://www.liquorama.net/american-star-caviar-lime-flavored-vodka-375ml.html
https://www.liquorama.net/ancho-reyes-ancho-chile-liqueur-750ml.html
https://www.liquorama.net/ancho-reyes-verde-chile-liqueur-750ml.html
https://www.liquorama.net/andrew-quady-vya-extra-dry-vermouth-750ml.html
https://www.liquorama.net/andrew-quady-vya-sweet-vermouth-750ml.html
https://www.liquorama.net/andrew-quady-vya-whisper-dry-vermouth-750ml.html
https://www.liquorama.net/angels-envy-port-barrel-finished-kentucky-straight-bourbon-whiskey-750ml.html
https://www.liquorama.net/angeles-de-oro-blanco-tequila-750ml.html
https://www.liquorama.net/angeles-de-oro-reposado-tequila-750ml.html
https://www.liquorama.net/angostura-bitters-16oz.html
https

https://www.liquorama.net/bank-note-5-year-old-blended-scotch-whisky-750ml.html
https://www.liquorama.net/barberino-alma-toscana-brunello-montalcino-grappa-750ml.html
https://www.liquorama.net/barberino-alma-toscana-chianti-classico-riserva-grappa-750ml.html
https://www.liquorama.net/barberino-alma-toscana-di-morellino-di-scansano-grappa-750ml.html
https://www.liquorama.net/barberino-alma-toscana-vin-santo-grappa-750ml.html
https://www.liquorama.net/barenjager-honey-bourbon-liqueur-germany.html
https://www.liquorama.net/barenjager-honey-liqueur-germany.html
https://www.liquorama.net/barr-an-uisce-wicklow-rare-small-batch-blended-irish-whiskey-750ml.html
https://www.liquorama.net/barrows-intense-ginger-liqueur-750ml.html
https://www.liquorama.net/bartenders-hot-sex-cocktail-750ml.html
https://www.liquorama.net/basil-haydens-dark-rye-whiskey-750ml.html
https://www.liquorama.net/basil-haydens-kentucky-straight-bourbon-whiskey-750ml.html


In [None]:
#Afterwards we must build a function that allows us to convert an html from the website to a BeautifulSoup object (BSO)

In [None]:
"https://www.liquorama.net/aperol-orange-aperitif-750ml.html"

In [361]:
class Scraper:
    
    def __init__(self, url):
        self.url= url
        
    def GetSoup(self):
        only_divs = SoupStrainer("div", attrs={'class':'productView-product'})
        r = requests.get(self.url)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text,'lxml', parse_only = only_divs)
         
    
#### Parse the prices of spirits (Why did the results are duplicate?)
    def scrape_prices(self):
        return soup.find('span', {'class':'price price--withoutTax'}).text
#### Parse the list name 
    def scrape_name(self):
        soup = GetSoup(self)
        return soup.find('h1', {'class':'productView-title'}).text
            
    def scrape_dict(self):
        soupArr = []
        for product in soup.findAll('li', {'class':'product'}):
            soupObject = {
                "name":scrape_names(),
                "price":scrape_prices()
        }
        soupArr.append(soupObject)
        return soupArr
    
    def getAllLinks(self):
        product = soup.findAll('li', {'class':'product'})
        href = soup.findAll('td', {'align':'center'})
        return [td.find('a')['href'] for td in tds]
        

In [363]:
test = Scraper("https://www.liquorama.net/aperol-orange-aperitif-750ml.html")
test.scrape_prices()

'$14.99'

In [278]:
test.

In [346]:
test.GetSoup().find('h1', {'class':'productView-title'}).text

'Aperol Orange Aperitif 750ml'

In [318]:
only_divs = SoupStrainer("div", attrs={'class':'productView-product'})

In [319]:
def sacasopa(link):
    r = requests.get(link)
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text, 'lxml', parse_only = only_divs)

In [320]:
sacasopa("https://www.liquorama.net/aperol-orange-aperitif-750ml.html")

<!DOCTYPE html>
<div class="productView-product">
<h1 class="productView-title" itemprop="name">Aperol Orange Aperitif 750ml</h1>
<div class="productView-price">
<div class="price-section price-section--withoutTax " itemprop="offers" itemscope="" itemtype="http://schema.org/Offer">
<span class="price price--rrp" data-product-rrp-without-tax="">$28.99</span>
<meta content="25.99" itemprop="price"/>
<meta content="USD" itemprop="priceCurrency"/>
<span class="price price--withoutTax" data-product-price-without-tax="">$25.99</span>
</div>
<div class="price-section price-section--saving">
<span class="price">
                    (You save $3.00)
                </span>
</div>
</div>
<div class="productView-rating">
<span class="icon icon--ratingEmpty">
<svg>
<use xlink:href="#icon-star"></use>
</svg>
</span>
<span class="icon icon--ratingEmpty">
<svg>
<use xlink:href="#icon-star"></use>
</svg>
</span>
<span class="icon icon--ratingEmpty">
<svg>
<use xlink:href="#icon-star"></use>
</svg>
</s