### Import

In [None]:
import sqlite3
import requests
import re
from bs4 import BeautifulSoup, Comment
import json
import pandas as pd
import time
import folium
import matplotlib.pyplot as plt
import urllib.request
from selenium import webdriver
import random

In [None]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

### Web Scraping Beer Advocate

#### Selenium activation

In [None]:
#testing that Selenium works: open a throwaway browser, print the text of every
#element with class "headline" on nasa.gov, and always shut the browser down
from selenium import webdriver
driver = webdriver.Chrome()
try:
    driver.get('https://www.nasa.gov')
    headlines = driver.find_elements_by_class_name("headline")
    for headline in headlines:
        print(headline.text.strip())
finally:
    #quit() ends the whole WebDriver session (close() only closes the window),
    #and the finally makes sure it runs even if the page load or lookup raises
    driver.quit()

In [None]:
#establishing the webdriver for chrome
driver = webdriver.Chrome()

In [None]:
#putting the driver on the website
driver.get('https://www.beeradvocate.com/beer/styles/9/')

In [None]:
#clicking into a beer
driver.find_element_by_xpath('//*[@id="ba-content"]/table/tbody/tr[4]/td[1]/a').click()

In [None]:
#pulling out the comments
comment = driver.find_element_by_xpath('//*[@id="rating_fullview_content_2"]').text
comment

In [None]:
#this function will let us go from page to page
def next_page():
    """Click the second-to-last link in the second <span> of the first <tbody>.

    Works on the module-level ``driver`` and returns nothing.
    NOTE(review): presumably that link is the pager's "next" link -- confirm
    against the live page markup.
    """
    pager_spans = driver.find_element_by_tag_name('tbody').find_elements_by_tag_name('span')
    pager_links = pager_spans[1].find_elements_by_tag_name('a')
    pager_links[-2].click()

In [None]:
driver.close()

#### BeautifulSoup activation

In [None]:
#establish our url and fetch the style listing page
url = 'https://www.beeradvocate.com/beer/styles/9/'
#the timeout keeps the notebook from hanging on a stalled connection, and
#raise_for_status() surfaces an HTTP error here instead of letting the later
#cells try to parse an error page
response = requests.get(url, timeout=30)
response.raise_for_status()

In [None]:
#parse the response body with the 'lxml' parser, then grab the first <table> in the document
bs = BeautifulSoup(response.content, 'lxml')
table = bs.find('table')

In [None]:
#print the text of every <td> in the table, one list per <tr>
table_rows = table.find_all('tr')

for tr in table_rows:
    print([cell.text for cell in tr.find_all('td')])

In [None]:
#here we put it into a df
#read_html is given a file-like object because recent pandas versions
#deprecate passing a literal HTML string directly
from io import StringIO
tables = pd.read_html(StringIO(str(table)))
#read_html returns a list of frames; keep the first one and drop every row
#that has fewer than 4 non-missing values
df = tables[0].dropna(axis=0, thresh=4)
df

In [None]:
#This allows us to create an accurate range for the function to iterate through
#the browser used earlier has been closed, so open a fresh one on the listing page
driver = webdriver.Chrome()
driver.get('https://www.beeradvocate.com/beer/styles/9/')
body = driver.find_element_by_tag_name('tbody')
x = body.find_element_by_tag_name('b').text
#keep only the digits that follow the first '(' in the bold text and read them as one number
#NOTE(review): presumably this is the total beer count and 50 is the page size -- confirm;
#the floor division means a trailing partial page is not counted
pages = int(''.join(ch for ch in x.split('(')[1] if ch.isnumeric())) // 50

#### Establish table with beer info

In [None]:
#creating a variable to put into the next function
#each entry is a (beer style label, listing url, number of pages) tuple,
#which is how the scraping loop unpacks it
long_list = [('Fruit and Field', 'https://www.beeradvocate.com/beer/styles/9/', pages)]

In [None]:
#walk through every listing page in long_list and collect one row per beer into `data`
#a short random pause after each page click helps avoid being blocked
data = []
for beer_style, url, pages in long_list:
    driver.get(url)
    for i in tqdm(range(pages)):
        print(i)
        listing = driver.find_element_by_tag_name('table')
        #skip the first three <tr> rows (presumably header rows -- confirm against the page)
        for row in listing.find_elements_by_tag_name('tr')[3:]:
            cells = row.find_elements_by_tag_name('td')
            cell_texts = [cell.text for cell in cells]
            #the link inside the first cell is stored as the row's last value
            profile_href = cells[0].find_element_by_tag_name('a').get_attribute('href')
            data.append([beer_style] + cell_texts + [profile_href])

        #move on to the following page: second-to-last link in the second <span> of the first <tbody>
        pager = driver.find_element_by_tag_name('tbody')
        pager_links = pager.find_elements_by_tag_name('span')[1].find_elements_by_tag_name('a')
        pager_links[-2].click()
        pause_options = [tenths / 10 for tenths in range(8, 14)]
        time.sleep(random.choice(pause_options))

In [None]:
#turning the data into a dataframe
beer_df = pd.DataFrame(data)

In [None]:
#creating a checkpoint csv in case something happens
beer_df.to_csv(r'C:\Users\GMoneyMan\Documents\Flatiron\capstone\csv_data\beer_df.csv', index = False)

In [None]:
beer_df.isna().sum()

In [None]:
len(beer_df)

#### Collecting url for each beer profile

In [None]:
#this cell builds a list of the beer profile links found on each listing page,
#requesting the pages one after another with requests + BeautifulSoup
new_list_links = []

#read the start offset from the pager's 'last' link
#NOTE(review): beer_tags is not defined in the visible cells -- presumably a parsed
#listing page; confirm where it comes from
for tag in beer_tags.find_all(href=re.compile(r"/beer/styles/[0-9]+/\?sort=revsD")):
    if tag.contents[0] == 'last':
        #pull the digits after 'start=' with a regex; str.strip() removes a *set of
        #characters* from both ends, so it can also eat digits of the offset itself
        offset_match = re.search(r"start=([0-9]+)", tag.get('href'))
        if offset_match:
            num = int(offset_match.group(1))
some_num = 7206 #num+50

#compiled once, outside the loop, since the pattern never changes
profile_link = re.compile(r"/beer/profile/[0-9]+/[0-9]+")
for i in tqdm(range(0, some_num, 50)):
    print(f'i is {i}')
    url = f'https://www.beeradvocate.com/beer/styles/9/?sort=revsD&start={i}'
    #the timeout keeps one stalled request from hanging the whole crawl
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    tags = [tag.get('href') for tag in soup.find_all(href=profile_link)]
    #one list of links per page; flattened in a later cell
    new_list_links.append(tags)

    #short random pause (0.8-1.3 s) between requests
    time.sleep(random.choice([x/10 for x in range(8,14)]))

In [None]:
new_list_links

In [None]:
#flatten the per-page lists of links into one flat list, keeping their order
list_of_beer_profiles = []
for page_links in new_list_links:
    list_of_beer_profiles.extend(page_links)

#### Collecting the reviews for a beer

##### Method 1 to get reviews

In [None]:
tag_name = driver.find_element_by_xpath('//*[@id="ba-content"]/table/tbody/tr[4]/td[1]/a').text
tag_name

##### Method 2 to get reviews

In [None]:
ba_content = driver.find_element_by_xpath('//*[@id="ba-content"]').text
ba_content

##### Method 3 to get reviews

In [None]:
#collect the text of every element with class 'user-comment' on the current page
u_comment = driver.find_elements_by_class_name('user-comment')
review3 = [element.text for element in u_comment]

#### Collecting the url & reviews

In [None]:
#creating a test for the urls to feed into the next function
#the per-page link lists are stored in new_list_links; no list_links is defined
#in the visible cells, so take the sample from the first page's list there
first_trial = new_list_links[0][49:52]

In [None]:
#this function will take the list of urls collected previously and pull all the comments from
#the page, and append the text to a list
def get_beer_reviews(list_urls, *, timeout=30):
    """Fetch each beer profile page and collect the text of its user comments.

    Parameters:
        list_urls: iterable of site-relative paths; each one is appended to
            'https://www.beeradvocate.com' to form the page url.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl.

    Returns:
        A list with one dict per input url, with keys 'url' (the absolute url)
        and 'review' (list of the text of every element with class
        'user-comment'; empty when the page has none or the request failed).
    """
    list_beers = []
    for i, url_end in enumerate(tqdm(list_urls), start=1):
        url = f'https://www.beeradvocate.com{url_end}'
        print(i)
        try:
            page = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            #a single failed request should not throw away everything collected
            #so far; record the url with no reviews and keep going
            print(f'request failed for {url}: {err}')
            reviews = []
        else:
            soup = BeautifulSoup(page.content, 'html.parser')
            reviews = [item.get_text() for item in soup.find_all(class_='user-comment')]

        list_beers.append({'url': url, 'review': reviews})

        #short random pause (0.8-1.3 s) between requests
        time.sleep(random.choice([x/10 for x in range(8,14)]))
    return list_beers

In [None]:
#here we trial our sample of urls
demo_df = get_beer_reviews(first_trial)

In [None]:
pd.DataFrame(demo_df)

In [None]:
len(list_of_beer_profiles)

In [None]:
#here we use our whole list of urls (this takes a long time)
reviews = get_beer_reviews(list_of_beer_profiles)

In [None]:
#we convert into a dataframe
beer_reviews = pd.DataFrame(reviews)
beer_reviews

In [None]:
beer_reviews['review'][7201]

In [None]:
#this converts all the empty lists in our review column into nan values
#astype(bool) is False for an empty list, so the inverted mask selects those rows
#float('nan') is used because the notebook's import cell does not import numpy,
#so the name np is not defined here
beer_reviews.loc[~beer_reviews.review.astype(bool),'review']=float('nan')

In [None]:
beer_reviews.isna().sum()

In [None]:
beer_reviews.to_csv(r'C:\Users\GMoneyMan\Documents\Flatiron\capstone\csv_data\reviews.csv', index = False)

In [None]:
#here we merge the reviews to the beer dataframe on the url column
df = beer_df.merge(beer_reviews, on='url', how='left')
df.to_csv(r'C:\Users\GMoneyMan\Documents\Flatiron\capstone\csv_data\beer_w_reviews.csv', index = False)

In [None]:
df

In [None]:
#we renaming the columns
df.columns = ['beer_style', 'beer_name', 'brewery_name', 'abv', 'no_ratings', 'avg_rating', 'drop_me', 'url']