In [1]:
# The purpose of the following project is to get some insight into spirit prices.
# This notebook contains a parser whose main objective is to parse the data from "liquorama", an online brewery
# store from California. This script will extract information about their beer listings and obtain data regarding characteristics
# such as price, brand, reviews, etc. It will store the resulting objects in a JSON-style dictionary that will be transformed
# into a pandas DataFrame and exported to CSV later in the notebook.


# First it will import the required libraries in order to scrape the data successfully.

In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import urlopen, Request
import json

In [3]:
# Download the store's robots.txt and show its raw text so the crawl rules can be read.
robots_response = requests.get("https://www.liquorama.net/robots.txt")
print(robots_response.text)

User-agent: *
Disallow: /search.php
User-agent: *
Disallow: /*?_bc_fsnf=1*
Disallow: /*&_bc_fsnf=1*



In [None]:
# The connection to the website was successful, and its robots.txt allows scraping of the pages we need.

In [None]:
################################################

# Disallow: /*?_bc_fsnf=1* — This blocks bots from following faceted search links and causing performance issues.

# Disallow: /*&_bc_fsnf=1* — This blocks bots from following faceted search links and causing performance issues.

# Disallow: /search.php — This page handles searches from the search box on a store. Google has previously stated 
#                        that search results pages are not something they want in their index because it creates a poor 
#                        user experience (going from a search results page to another search results page instead of going 
#                        directly to the result).
#
#
# Source: Big Commerce (Help Center), "Understanding the Robots.txt file", 
# https://support.bigcommerce.com/s/article/Understanding-the-Robots-txt-File
################################################

In [None]:
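# The purpose of this scraper is to get the general information of Liquorama's beer listing. The online store has a 21-page listing
# of beers by size, price, country of origin, reviews and type.
# NOTE: the cells below actually request pages 1-8 of https://uptownspirits.com/shop/liquor/, not liquorama.net,
# so the description above (and the robots.txt check) may be out of date -- TODO confirm the intended site.

# Firstly, we need the complete set of links in an object for further parsing.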

In [96]:
def GetSoup(link, timeout=30):
    """Download `link` and return its HTML parsed as a BeautifulSoup object.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout, so a stalled
        connection raises instead of hanging the notebook.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'lxml' parser.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx HTTP status.
    """
    r = requests.get(link, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as if it
    # were a real listing/product page.
    r.raise_for_status()
    # Force UTF-8 decoding of the body before reading r.text.
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text, 'lxml')
    

In [303]:
def GetLinks(link):
    """Return the product URLs found on one listing page.

    Parameters
    ----------
    link : str
        URL of a listing page; it is fetched and parsed with `GetSoup`.

    Returns
    -------
    list of str
        The `href` of every `<a class="product-images">` element, in page
        order. Anchors without an `href` are skipped so the result only
        contains values that can be requested later.
    """
    soup = GetSoup(link)
    # `find_all` is the current bs4 name for the legacy `findAll` alias.
    anchors = soup.find_all('a', attrs={"class": "product-images"})
    # Use a distinct name for each anchor so the `link` parameter is not shadowed.
    hrefs = (anchor.get('href') for anchor in anchors)
    return [href for href in hrefs if href]

In [748]:
# One listing URL per catalogue page (pages 1 through 8), each asking for 144 products.
linqs = [
    "https://uptownspirits.com/shop/liquor/page/" + str(page) + "/?product_count=144"
    for page in range(1, 9)
]

In [671]:
# Collect the product links of every listing page: one inner list per page, in page order.
products = [GetLinks(link) for link in linqs]
    

In [732]:
# Flatten the per-page lists into a single list of product links, keeping their order.
prods = []
for page_links in products:
    prods.extend(page_links)

In [None]:
# Afterwards we must build a function that converts an HTML page from the website into a BeautifulSoup object (BSO).

In [79]:
##### HOW CAN I ADD ANOTHER ELEMENT WITHOUT SACRIFICING TIME AND MEMORY?

In [None]:
##### Succeeding with OOP

In [737]:
class Scraper:
    """Parse product pages one at a time and accumulate the results.

    Usage: call `GetSoup(url)` to load a page, call each parsing method
    (`ParseSKU`, `ParseName`, `ParsePrice`, `ParseBrand`, `ParseAvailability`,
    `Region`, `ParseWeight`, `ParseAlcType`, `Clas`, `Proof`) to set the
    matching attribute, then call `oneDic()` to store the record in
    `self.dataall`.

    Every parsing method stores the string "NA" when the element it looks for
    is not present on the page.
    """

    # Common prefix of the class attribute on each row of the product
    # attributes table; the attribute-specific suffix is appended to it.
    _ATTR_ROW_CLASS = 'woocommerce-product-attributes-item woocommerce-product-attributes-item--'

    def __init__(self):
        # Maps product title -> dict of parsed fields (filled by oneDic).
        self.dataall = {}

    def GetSoup(self, url, timeout=30):
        """Fetch `url` and keep the parsed page in `self.soup`.

        `timeout` (seconds) is passed to `requests.get`. A 4xx/5xx response
        raises `requests.exceptions.HTTPError`, so an error page is never
        recorded as a product whose fields are all "NA".
        """
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        r.encoding = 'UTF-8'
        self.soup = BeautifulSoup(r.text, 'lxml')

    def _text_or_na(self, tag, css_class, label=""):
        """Return the text of the first `tag` whose class is `css_class`.

        Returns "NA" when no such element exists. When `label` is non-empty,
        every occurrence of it is removed from the text (the attribute rows
        include their own label, e.g. "Region", in their text).
        """
        el = self.soup.find(tag, attrs={'class': css_class})
        if not el:
            return "NA"
        return el.text.replace(label, "") if label else el.text

    def _attribute_row(self, suffix, label):
        """Return the text of one product-attribute table row, minus `label`."""
        return self._text_or_na('tr', self._ATTR_ROW_CLASS + suffix, label)

    def ParseSKU(self):
        """Set `self.SKU` from the `<span class="sku">` element."""
        self.SKU = self._text_or_na('span', 'sku')

    def ParseName(self):
        """Set `self.title` from the product title heading ("NA" if absent)."""
        # Guarded like the other fields: a page without the heading used to
        # raise AttributeError here.
        self.title = self._text_or_na('h1', 'product_title entry-title')

    def ParsePrice(self):
        """Set `self.price` from the first price element ("NA" if absent)."""
        # Guarded like the other fields: a page without a price used to raise
        # AttributeError here.
        self.price = self._text_or_na('span', 'woocommerce-Price-amount amount')

    def ParseBrand(self):
        """Set `self.brand` from the brand attribute row, without its label."""
        # The "Brand" label is stripped, matching how the other attribute
        # rows drop their own labels.
        self.brand = self._attribute_row('attribute_pa_brand', 'Brand')

    def ParseAvailability(self):
        """Set `self.availability` from the `avada-availability` div."""
        self.availability = self._text_or_na('div', 'avada-availability')

    def Region(self):
        """Set `self.region` from the region attribute row, without its label."""
        self.region = self._attribute_row('attribute_pa_region', 'Region')

    def ParseWeight(self):
        """Set `self.weight` from the weight attribute row, without its label."""
        self.weight = self._attribute_row('weight', 'Weight')

    def ParseAlcType(self):
        """Set `self.alctype` from the alcohol-type row, without its label."""
        self.alctype = self._attribute_row('attribute_pa_alcohol-type', 'Alcohol Type')

    def Clas(self):
        """Set `self.clas` from the class attribute row, without its label."""
        self.clas = self._attribute_row('attribute_pa_class', 'Class')

    def Proof(self):
        """Set `self.proof` from the proof attribute row, without its label."""
        self.proof = self._attribute_row('attribute_pa_proof', 'Proof')

    def oneDic(self):
        '''
        Build the dictionary of data for the current page and store it.

        Reads the attributes set by the parsing methods, so all of them must
        have been called for the current page first. The record is stored in
        `self.dataall` under the product title; a later product with the same
        title replaces the earlier record.
        '''
        self.dic = {
            'SKU': self.SKU,
            'Name': self.title,
            'Price': self.price,
            'Brand': self.brand,
            'Availability': self.availability,
            'Region': self.region,
            'Weight': self.weight,
            'Type': self.alctype,
            'Class': self.clas,
            'Proof': self.proof,
        }

        # Add the record to the collection of all scraped products.
        self.dataall[self.title] = self.dic

In [738]:
# Single Scraper instance; every product parsed with it is accumulated in its `dataall` dict.
ws = Scraper()

In [741]:
# For each product link: load the page, run every field parser in order, then
# store the record. "oneDic" stays last because it reads what the parsers set.
for link in prods:
    ws.GetSoup(link)
    for step_name in (
        "ParseSKU",
        "ParseName",
        "ParsePrice",
        "ParseBrand",
        "ParseAvailability",
        "Region",
        "ParseWeight",
        "ParseAlcType",
        "Clas",
        "Proof",
        "oneDic",
    ):
        getattr(ws, step_name)()

In [743]:
# Build the table from the dict of records, then flip it so each product is a row
# (indexed by the dict key) and each field is a column.
df = pd.DataFrame(ws.dataall).T

In [752]:
# Write the scraped table to "dataframe.csv" (path is relative to the working directory).
# NOTE(review): per the pandas docs, to_csv returns None when given a path, so `export`
# presumably holds None rather than the CSV text -- confirm before relying on it.
export = df.to_csv("dataframe.csv")

In [202]:
##### Part 3: standalone per-field parsing functions ####

In [654]:
def ParseName(link):
    """Return the product title from the page at `link`, or "NA" if it is missing.

    The page is fetched with `GetSoup`; the title is the text of the first
    `<h1 class="product_title entry-title">` element. A page without that
    heading previously raised AttributeError; it now yields "NA", like the
    other field parsers in this notebook.
    """
    soup = GetSoup(link)
    el = soup.find('h1', attrs={'class':'product_title entry-title'})
    return el.text if el else "NA"

In [679]:
def ParsePrice(link):
    """Return the text of the first price element on the page at `link`, or "NA"."""
    price_tag = GetSoup(link).find('span', attrs={'class':'woocommerce-Price-amount amount'})
    if not price_tag:
        return "NA"
    return price_tag.text

In [680]:
def ParseSKU(link):
    """Return the text of the `<span class="sku">` element on the page at `link`, or "NA"."""
    page = GetSoup(link)
    sku_tag = page.find('span', attrs={'class':'sku'})
    if sku_tag:
        return sku_tag.text
    return "NA"

In [681]:
def ParseWeight(link):
    """Return the weight attribute from the page at `link`, or "NA" if it is missing.

    The page is fetched with `GetSoup`; the value is the text of the weight
    row of the product-attributes table with the "Weight" label removed.
    A page without that row previously raised AttributeError; it now yields
    "NA", matching the other attribute parsers.
    """
    soup = GetSoup(link)
    el = soup.find('tr', attrs={'class':'woocommerce-product-attributes-item woocommerce-product-attributes-item--weight'})
    ele = el.text if el else "NA"
    return ele.replace('Weight', '')

In [682]:
def ParseBrand(link):
    """Return the brand row text (label "Brand" removed) from the page at `link`, or "NA"."""
    brand_row = GetSoup(link).find(
        'tr',
        attrs={'class':'woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_brand'},
    )
    if not brand_row:
        return "NA"
    return brand_row.text.replace("Brand", "")

In [683]:
def ParseAlcType(link):
    """Return the alcohol-type row text (label "Alcohol Type" removed) from `link`, or "NA"."""
    row_class = 'woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_alcohol-type'
    type_row = GetSoup(link).find('tr', attrs={'class': row_class})
    if type_row:
        return type_row.text.replace('Alcohol Type', "")
    return "NA"

In [745]:
def Class(link):
    """Return the class row text (label "Class" removed) from the page at `link`, or "NA"."""
    page = GetSoup(link)
    class_row = page.find(
        'tr',
        attrs={'class':"woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_class"},
    )
    if not class_row:
        return "NA"
    raw_text = class_row.text
    return raw_text.replace("Class","")

In [685]:
def Region(link):
    """Return the region row text (label "Region" removed) from the page at `link`, or "NA"."""
    row_class = 'woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_region'
    region_row = GetSoup(link).find('tr', attrs={'class': row_class})
    if region_row:
        return region_row.text.replace("Region","")
    return "NA"


In [747]:
def Proof(link):
    """Return the proof row text (label "Proof" removed) from the page at `link`, or "NA"."""
    page = GetSoup(link)
    proof_row = page.find(
        'tr',
        attrs={'class':'woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_proof'},
    )
    if not proof_row:
        return "NA"
    return proof_row.text.replace('Proof','')

In [746]:
def Availability(link):
    """Return the text of the `avada-availability` div on the page at `link`, or "NA"."""
    stock_tag = GetSoup(link).find('div', attrs={'class':'avada-availability'})
    if stock_tag:
        return stock_tag.text
    return "NA"

In [677]:
def Star(link):
    """Return the result of looking up `<span class="yotpo-stars">` on the page at `link`.

    The lookup result itself is returned (not its text).
    """
    page = GetSoup(link)
    stars_tag = page.find('span', attrs={'class':'yotpo-stars'})
    return stars_tag
    