In [1]:
import pandas as pd
import numpy as np
from time import time 
import tqdm

from bs4 import BeautifulSoup as bs
import requests

---
# all_ipa

In [2]:
def beer_style(url):
    """Scrape every listing page of one beer style into a single DataFrame.

    The first page is fetched from ``url`` as given; the total number of
    results is parsed from the text that follows "out of " in the first
    ``<td>`` of that page. Further pages are requested with
    ``?sort=revsD&start=<offset>`` appended to ``url``.

    Parameters
    ----------
    url : str
        Style listing URL (no query string; one is appended for later pages).

    Returns
    -------
    pandas.DataFrame
        One row per beer with the columns produced by ``result_2_df``;
        rows whose ``total_rate`` or ``average_rate`` is the string "0"
        are removed.

    Raises
    ------
    AttributeError
        If the first page contains no ``<td>`` element.
    ValueError
        If "out of " is missing from that cell or the count is not an integer.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }
    page_size = 50  # step between consecutive "start" offsets

    response = requests.get(url, headers=headers)
    soup = bs(response.content, "html.parser")
    original_text = str(soup.find("td").get_text())

    # The total count sits right after "out of "; the final character of the
    # cell text is dropped before converting to int.
    position = original_text.index("out of ") + 7
    num_result = int(original_text[position:-1])
    print(f'{num_result} type: {type(num_result)}')

    list_page_df = [result_2_df(soup)]

    # Remaining pages. The stop value is the total itself so that the last
    # full page and a trailing partial page are fetched too (the previous
    # bound, 50*int(num_result/50), was exclusive and skipped them).
    # NOTE(review): the first page is requested without "sort=revsD" while
    # later pages use it — confirm both orderings agree.
    for page_num in range(page_size, num_result, page_size):
        next_page = url + "?sort=revsD&start=" + str(page_num)
        soup_next_page = bs(requests.get(next_page, headers=headers).content, "html.parser")
        list_page_df.append(result_2_df(soup_next_page))

    # ignore_index gives the combined frame a fresh 0..n-1 index
    new_result = pd.concat(list_page_df, ignore_index=True)

    # delete result without rating
    new_result = new_result[(new_result.total_rate != "0") & (new_result.average_rate != "0")]

    return new_result

In [3]:
def result_2_df(soup):
    """Turn one parsed listing page into a DataFrame.

    ``beer_info(soup)`` supplies six lists; each list becomes one column,
    labelled in the same order as the lists are returned.
    """
    column_labels = [
        "beer_name",
        "total_rate",
        "average_rate",
        "beer_url",
        "manufacturer_url",
        'latest_active_date',
    ]

    # Each list is a row at construction time, so transpose to get columns.
    field_lists = list(beer_info(soup))
    page_table = pd.DataFrame(data=field_lists).transpose()

    return page_table.set_axis(column_labels, axis=1)

In [4]:
def beer_info(soup): # 08 Jun 2023 Revised
    """Collect the per-beer fields from one parsed listing page.

    Scans every ``<td class="hr_bottom_light">`` cell and gathers:

    * span texts that are exactly 10 characters long (the date column),
    * anchors whose href contains 'profile' — even-positioned anchors in a
      cell go to the beer links, odd-positioned ones to the manufacturer links,
    * non-empty bold texts, which are then read in groups of three as
      (beer name, total ratings, average rating).

    Returns a 6-tuple of lists:
    ``(beer_name, total_rate, average_rate, beer_link, manufacturer_link, li_latest_date)``.
    Raises IndexError if the number of bold texts is not a multiple of three.
    """
    site_root = "https://www.beeradvocate.com"
    dates, beer_urls, maker_urls, bold_texts = [], [], [], []

    for cell in soup.find_all("td", class_="hr_bottom_light"):

        for span in cell.find_all("span"):
            span_text = span.get_text()
            if len(span_text) == 10:
                dates.append(span_text)

        for position, anchor in enumerate(cell.find_all("a")):
            target = anchor["href"]
            if 'profile' not in target:
                continue
            # anchors alternate beer / manufacturer within a cell
            destination = beer_urls if position % 2 == 0 else maker_urls
            destination.append(site_root + target)

        for bold in cell.find_all("b"):
            stripped = bold.get_text().strip()
            if stripped:
                bold_texts.append(stripped)

    names, totals, averages = [], [], []
    for start in range(0, len(bold_texts), 3):
        names.append(bold_texts[start])
        totals.append(bold_texts[start + 1])
        averages.append(bold_texts[start + 2])

    return names, totals, averages, beer_urls, maker_urls, dates

In [5]:
# Next Page Comment
def all_rate(url, beer_name, max_start=120, page_step=40):
    """Scrape several rating pages of one beer and combine them.

    Pages are requested with ``?view=beer&show=recent&start=<offset>`` for
    every offset in ``range(0, max_start, page_step)``; with the defaults
    that is offsets 0, 40 and 80.

    Parameters
    ----------
    url : str
        Beer profile URL; the query string is appended to it.
    beer_name : str
        Name stored in the ``beer_name`` column of every returned row.
    max_start : int, optional
        Exclusive upper bound for the ``start`` offset (default 120).
    page_step : int, optional
        Distance between consecutive offsets (default 40).

    Returns
    -------
    pandas.DataFrame
        All rows returned by ``scrape_rate`` for the requested pages, with
        two extra columns ``beer_url`` and ``beer_name``.

    Raises
    ------
    ValueError
        If the offset range is empty, since there is nothing to concatenate.
    """
    page_frames = [
        scrape_rate(url + '?view=beer&show=recent&start=' + str(page_num))
        for page_num in range(0, max_start, page_step)
    ]

    # One concat over all pages instead of re-concatenating pair by pair.
    final_rate_df = pd.concat(page_frames, ignore_index=True, sort=False)

    final_rate_df['beer_url'] = str(url)
    final_rate_df['beer_name'] = str(beer_name)

    return final_rate_df

In [6]:
def scrape_rate(one_beer_url): # 08 Jun 2023 Revised
    """Download one ratings page and return its entries as a DataFrame.

    Every ``<div class="user-comment">`` is scanned for:

    * anchor texts (ignoring empty ones and "Report"), split into dates and
      usernames — a text counts as a date when it is 12 characters long and
      contains a comma, or contains 'day at ', or contains ' minute';
    * comment bodies (divs with the fixed inline style below);
    * rating details (``div#rating_fullview_content_2``), kept as the list
      of their stripped strings.

    Entries whose rating details contain 'Rated:' are treated as ratings
    without a comment; all others as reviews with a comment. The returned
    frame lists the ratings first, then the reviews, with columns
    rate_member, location, rate_score, rate_date and comment.

    NOTE(review): score and location are read from fixed positions in the
    rating details; the saved notebook output shows rows where these fields
    look shifted (a date or a score in ``location``) — confirm the layout
    when a reviewer has no location.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }

    page = requests.get(one_beer_url, headers=request_headers)
    parsed = bs(page.content, "html.parser")

    comments, rating_fields, usernames, dates = [], [], [], []
    for block in parsed.find_all("div", class_="user-comment"):
        for anchor in block.find_all("a"):
            label = anchor.get_text()
            if label == "" or label == "Report":
                continue
            looks_like_date = (
                ("," in label and len(label) == 12)
                or 'day at ' in label
                or ' minute' in label
            )
            (dates if looks_like_date else usernames).append(label)

        comments.extend(
            div.get_text()
            for div in block.find_all("div", style="margin:20px 0px; font-size:11pt; line-height:1.4;")
        )
        rating_fields.extend(
            list(div.stripped_strings)
            for div in block.find_all("div", id="rating_fullview_content_2")
        )

    # Two buckets: plain ratings and reviews that come with a comment.
    rated = {"member": [], "location": [], "score": [], "date": []}
    reviewed = {"member": [], "location": [], "score": [], "date": []}
    for idx, fields in enumerate(rating_fields):
        if 'Rated:' in fields:
            bucket, score_pos, location_pos = rated, 1, 4
        else:
            bucket, score_pos, location_pos = reviewed, 3, 2
        bucket["score"].append(fields[score_pos])
        bucket["member"].append(usernames[idx])
        bucket["location"].append(fields[location_pos])
        bucket["date"].append(dates[idx])

    base_columns = ["rate_member", "location", "rate_score", "rate_date"]

    # Rate without Comment
    df_rate = (
        pd.DataFrame(data=[rated["member"], rated["location"], rated["score"], rated["date"]])
        .transpose()
        .set_axis(base_columns, axis=1)
    )

    # Rates with comment
    df_review = (
        pd.DataFrame(data=[reviewed["member"], reviewed["location"], reviewed["score"], reviewed["date"], comments])
        .transpose()
        .set_axis(base_columns + ["comment"], axis=1)
    )

    return pd.concat([df_rate, df_review], ignore_index=True, sort=False)

In [7]:
# Style listing page to scrape; this value is passed to beer_style() below.
# The commented-out line is an alternative style id kept for reference.
#beer_style_url = "https://www.beeradvocate.com/beer/styles/199/"
beer_style_url = "https://www.beeradvocate.com/beer/styles/140/"

# Output CSV paths: filename_list receives the selected beer list,
# filename_rate receives the scraped ratings.
filename_list = 'beer/0303_imperial_ipa_list.csv'
filename_rate = 'beer/0303_imperial_ipa_rate.csv'

In [8]:
# Scrape the style listing; keep the raw result and work on a copy.
df_one_beer_seed_brut = beer_style(beer_style_url)
df_one_beer_seed = df_one_beer_seed_brut.copy()

# Parse the listing date (month-day-year) and split it into parts.
df_one_beer_seed.latest_active_date = pd.to_datetime(df_one_beer_seed.latest_active_date, format='%m-%d-%Y')
df_one_beer_seed["year"] = df_one_beer_seed.latest_active_date.dt.year
df_one_beer_seed["month"] = df_one_beer_seed.latest_active_date.dt.month
df_one_beer_seed["day"] = df_one_beer_seed.latest_active_date.dt.day
print(f'seed: {len(df_one_beer_seed)}')

# Keep beers whose latest activity is in 2022 or later, at most 100 of them.
df_one_beer = df_one_beer_seed[df_one_beer_seed['year'] >= 2022]
df_one_beer = df_one_beer.reset_index()
if len(df_one_beer) > 100:
    # copy so the rename below writes to an independent frame, not a slice
    df_one_beer = df_one_beer.iloc[:100].copy()

print(f'seed: {len(df_one_beer)}')

# Rename duplicated Beer Name: the 2nd, 3rd, ... occurrence gets " 1", " 2", ...
dict_beer_name = {}
for i in range(len(df_one_beer)):
    beer_name = df_one_beer.beer_name.iloc[i]
    if beer_name not in dict_beer_name:
        dict_beer_name[beer_name] = 1
    else:
        new_name = beer_name + " " + str(dict_beer_name[beer_name])
        df_one_beer.loc[i, "beer_name"] = new_name
        dict_beer_name[beer_name] = dict_beer_name[beer_name] + 1

# Scrape the ratings of every selected beer. Frames are collected in a list
# and concatenated once, instead of growing a DataFrame inside the loop.
rate_frames = []
for i in range(len(df_one_beer)):
    beer_url = df_one_beer.beer_url.iloc[i]
    beer_name = df_one_beer.beer_name.iloc[i]
    df_comment_1_beer = all_rate(beer_url, beer_name)
    rate_frames.append(df_comment_1_beer)

# pd.concat rejects an empty list, so fall back to an empty frame.
all_IPA_rate = pd.concat(rate_frames, ignore_index=True, sort=False) if rate_frames else pd.DataFrame()

print(all_IPA_rate.shape)
am_ipa = all_IPA_rate.copy()

# NOTE(review): in the recorded run this step shrank 6000 rows to 2000,
# i.e. exactly one third survived, which suggests the pages fetched per
# beer may be identical — confirm that the "start" offset is honoured.
am_ipa = am_ipa.drop_duplicates(keep='first')
print(am_ipa.shape)

df_one_beer.to_csv(filename_list)
am_ipa.to_csv(filename_rate)

26890 type: <class 'int'>
seed: 23880
seed: 100
(6000, 7)
(2000, 7)


In [9]:
# Show the combined ratings frame as scraped (before duplicates were dropped).
all_IPA_rate

Unnamed: 0,rate_member,location,rate_score,rate_date,comment,beer_url,beer_name
0,beernooph,from Connecticut,4.31,"Feb 24, 2024",,https://www.beeradvocate.com/beer/profile/64/2...,90 Minute IPA
1,Michael_Reese91,"Feb 20, 2024",4.75,"Feb 20, 2024",,https://www.beeradvocate.com/beer/profile/64/2...,90 Minute IPA
2,Vintage-Vee77,"Feb 17, 2024",4.59,"Feb 17, 2024",,https://www.beeradvocate.com/beer/profile/64/2...,90 Minute IPA
3,BLVBBERGIXLL,from Arizona,4.27,"Feb 09, 2024",,https://www.beeradvocate.com/beer/profile/64/2...,90 Minute IPA
4,GeraldBrew1985,"Feb 04, 2024",4.54,"Feb 04, 2024",,https://www.beeradvocate.com/beer/profile/64/2...,90 Minute IPA
...,...,...,...,...,...,...,...
5995,zotzot,from Vermont,4.46,"Mar 08, 2023",Drinking in a snifter at BlackBack Pub - still...,https://www.beeradvocate.com/beer/profile/2306...,Hop Venom Double IPA
5996,Pivopijak,from Washington,3.67,"Feb 16, 2023","draught\nFairly sustained, off-white head. Amb...",https://www.beeradvocate.com/beer/profile/2306...,Hop Venom Double IPA
5997,ivanbrew,5,/5,"Nov 20, 2022",I tried this from a can omg amazing imperial I...,https://www.beeradvocate.com/beer/profile/2306...,Hop Venom Double IPA
5998,ZebulonXZogg,from Illinois,4,"Sep 28, 2022","Copper pour, small head but left some nice lac...",https://www.beeradvocate.com/beer/profile/2306...,Hop Venom Double IPA


In [10]:
def clean_df(df):
    """Normalise a scraped ratings frame and return it.

    The frame is modified in place and also returned. Steps:

    1. Rows whose ``rate_score`` contains "/" take their score from the
       ``location`` column (the scraped fields are shifted in those rows).
       This runs before ``location`` is rewritten; doing it afterwards
       replaced every such score with "0".
    2. ``rate_score`` is converted to float.
    3. ``location`` keeps the text after the leading "from "; any value
       without "from " (or a non-string value) becomes "0".
    4. ``rate_date`` is passed through ``turn_date`` and parsed with the
       format '%b %d, %Y'; ``year``, ``month`` and ``day`` columns are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``rate_score`` and ``rate_date`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, cleaned.
    """
    print(f'Before Clean: {df.shape}')

    # astype(str) lets the check work even if the column was read as numeric.
    shifted = df["rate_score"].astype(str).str.contains("/", regex=False)
    if shifted.any():
        # .loc avoids chained assignment, which may silently write to a copy
        df.loc[shifted, "rate_score"] = df.loc[shifted, "location"]
    df["rate_score"] = df["rate_score"].map(float)

    df["location"] = df["location"].map(
        lambda x: x[5:] if isinstance(x, str) and "from " in x else "0"
    )

    df["rate_date"] = df["rate_date"].map(turn_date)

    df["rate_date"] = pd.to_datetime(df["rate_date"], format='%b %d, %Y')
    df["year"] = df["rate_date"].dt.year
    df["month"] = df["rate_date"].dt.month
    df["day"] = df["rate_date"].dt.day

    print(f"After Cleaning: {df.shape}")

    return df


def turn_date(rate_date):
    """Replace a relative date label with a fixed 'Mon DD, YYYY' string.

    The lookup table is a hard-coded snapshot in which "Today" is
    Mar 03, 2024; it is only correct for data scraped on that day and has
    no entry for "Sunday". Labels containing " minute" are mapped to the
    "Today" date as well. Anything else, including non-string values, is
    returned unchanged.
    """
    if not isinstance(rate_date, str):
        return rate_date

    dict_day = {"Friday":"Mar 01, 2024", "Saturday":"Mar 02, 2024", "Monday":"Feb 26, 2024",
                "Tuesday":"Feb 27, 2024", "Wednesday":"Feb 28, 2024", "Thursday":'Feb 29, 2024',
                "Yesterday":"Mar 02, 2024", "Today":"Mar 03, 2024"}

    for item in dict_day:
        if item in rate_date:
            return dict_day[item]

    # The scraper also classifies " minute" labels as dates; without this
    # branch they would reach pd.to_datetime unparsed.
    if " minute" in rate_date:
        return dict_day["Today"]

    return rate_date


In [11]:
# Reload the saved ratings CSV (its first column is the index), clean it,
# and preview five rows; random_state makes the sample repeatable.
cc = pd.read_csv(filename_rate, index_col=0)
cc = clean_df(cc)
cc.sample(5, random_state=73)

Before Clean: (2000, 7)
After Cleaning: (2000, 10)


Unnamed: 0,rate_member,location,rate_score,rate_date,comment,beer_url,beer_name,year,month,day
3966,ScottyB515,Ohio,4.44,2023-07-04,,https://www.beeradvocate.com/beer/profile/341/...,Bodhi,2023,7,4
2775,bdoogy,Missouri,4.48,2023-06-11,I love hops and this is a treat. A little hard...,https://www.beeradvocate.com/beer/profile/147/...,Ruination Double IPA 2.0,2023,6,11
2175,BalancingBrooms,California,3.85,2020-02-01,,https://www.beeradvocate.com/beer/profile/199/...,Dorado,2020,2,1
5472,Red5StandingBy,Maryland,3.66,2018-07-06,,https://www.beeradvocate.com/beer/profile/1471...,Smells Like A Safety Meeting,2018,7,6
2178,brewme,Massachusetts,4.0,2021-03-20,Reminded of this beer by the How many Beers of...,https://www.beeradvocate.com/beer/profile/199/...,Dorado,2021,3,20


In [12]:
# Writes the cleaned frame back to the same path it was read from.
# NOTE(review): this overwrites the raw ratings file, so re-running the
# read-and-clean cell afterwards would feed already-cleaned data to
# clean_df — consider a separate output path.
cc.to_csv(filename_rate)