In [None]:
'''
Introduction

This Python script scrapes the beer advocate website and downloads selected information for the beers listed on the first pages 
(current 4) of each style. Each style page has about 50 links, for an estimated 200 (max) links per beer style. 

For each of the beers scraped, it saves the picture of the beer, the beer ratings, beer information, and names/company which 
produces the beer. The script then produces high level summaries of the information which it scrapes. 
'''

In [24]:
#Import Libraries
from urllib.request import Request, urlopen, urlretrieve, URLopener
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup
import re
import pandas as pd
import csv

import logging
import traceback
import os


In [2]:
#User Defined Functions:

#Get Style Links
def Beer_Style_Links():
    '''
    This function scrapes all of the 'beer style' links from Beer Advocate's style page. It then saves those 
    links to a list which is returned. 
    
    This function takes no parameters.
    '''
    try:
        #Open the site and load the BeautifulSoup object
        site= "https://www.beeradvocate.com/beer/style/"
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = Request(site,headers=hdr)
        html = urlopen(req)
        bsObj = BeautifulSoup(html.read(), "lxml")
    except Exception as e:
        #Save any errors to the log
        logger.error(str(e))
        logger.error(str(site))
    else:
        #Process the BeautifulSoup object and create a list of beer sylte links
        names = bsObj.find("div", {"id":"ba-content"}).findAll("a", href=re.compile("^(/beer/style/)((?!:).)*$"))
        links = []

        for i in range(len(names)-1):
            links.append(names[i].attrs["href"])

        return links
    
#Get Beer Links
def Beer_Links(link, depth, beer_links):
    '''
    This function takes a beer style link and saves all of the beer profile links from the page to a list. it is set
    so that it will go to the 'next page' of the beers for that style up to the 'depth' number of pages. 
    If depth = 3 then it will scrape the first three pages of the beer style. The function returns a list
    of beer links.
    
    Parameters:
        link = is the beer style link which should be scraped
        depth = is the number of pages the fucntion should scrape for beer links
        beer_links = is the list which function should append the scrapped beer links to.
    '''
    page = link
    j = 0
    
    while j < depth:
        try:
            #Open the site and load the BeautifulSoup object
            site= "https://www.beeradvocate.com" + str(page)
            hdr = {'User-Agent': 'Mozilla/5.0'}
            req = Request(site,headers=hdr)
            html = urlopen(req)
            bsObj = BeautifulSoup(html.read(), "lxml")
        except Exception as e:
            #Save any errors to the log
            logger.error(str(e))
            logger.error(str(site))
        else:
            #Process the BeautifulSoup object and create append
            # the beer links to the list
            names = bsObj.find("div", {"id":"ba-content"}).findAll("a", href=re.compile("^(/beer/profile/[0-9]+/[0-9]+)((?!:).)*$"))

            for i in range(len(names)-1):
                beer_links.append(names[i].attrs["href"])

        #get the next page link(s) from the page
        next_link = bsObj.find("div", {"id":"ba-content"}).findAll("a")

        next_page = []
        for h in range(len(next_link)-1):
            if next_link[h].getText() == 'next':
                next_page.append(next_link[h])

        #if there are multiple links with the text next, take the first one.
        if len(next_page) > 1:
            next_page = next_page[0].attrs["href"]
        page = next_page
        j += 1

#Get Beer Information
def Beer_Info(page, writer):
    '''
    This function scrapes the selected information for a specific beer profile and saves it to a .csv file. It also
    saves the image to a BeerImages folder which the user needs to create prior to running the code. 
    
    The function scrapes the:
        - beer name
        - beer stats
        - beer information
        - beer score
    sections of the beer profile.
    
    Parameters
        - page = the beer page to be scraped
        - writer = the writer object which the output should be saved to.
    '''
    try:
        #Open the site and load the BeautifulSoup object
        site= "https://www.beeradvocate.com" + page
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = Request(site,headers=hdr)
        html = urlopen(req)
        bsObj = BeautifulSoup(html.read(), "lxml")
    except Exception as e:
        #Save any errors to the log
        logger.error(str(e))
        logger.error(str(site))
    else:
        #Process the BeautifulSoup object
        try:
            #Save the image to disk
            image = bsObj.find("div", {"id":"info_box"}).find('img')['src']
            name = bsObj.find("div", {"id":"info_box"}).find('img')['alt']
            name = re.sub(r'[^a-zA-Z0-9]+','', name)

            req = Request(image,headers=hdr)
            resource = urlopen(req)
            output = open("BeerImages/" + name +".jpg","wb")
            output.write(resource.read())
            output.close() 
        except Exception as e:
            #Save any errors to the log
            logger.error(str(e))
            logger.error(str(image))

        #Scrape the selecte data from the webpage
        item = {}
        item['name'] = re.sub('\n|\t', '||', bsObj.find("h1").getText())
        item['score'] = re.sub('\n|\t', '||', bsObj.find("div", {"id":"score_box"}).getText())  
        item['stats'] = re.sub('\n|\t', '||', bsObj.find("div", {"id":"stats_box"}).getText())  
        item['info'] =  re.sub('\n|\t', '||', bsObj.find("div", {"id":"info_box"}).getText())
        dta = pd.Series(item, name='item')
        dta = dta.str.encode('utf-8') 
        
        writer.writerow(dta)

#Error Logging
def error_log():
    '''
    This function creates an error logger which is used to track the exceptions caught by the function. 
    It saves a text file in the folder which contains the source code.
    '''
    global logger
    logger = logging.getLogger("File Log")
    logger.setLevel(logging.ERROR)
    handler = logging.FileHandler("log.txt", mode='a', encoding=None, delay=False)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)        
        
#Execution Function
def BeerAdvocateScrap(limit=True):
    '''
    This function executes the BeerAdvocate.com scrapping job.
    
    Paramters:
        Limit = Should the function only scrape the first three style links to test (if True) or all styles (if False). 
                Default = True.
    '''
    
    #Create directory for beer images
    file_path = "/BeerImages"
    directory = os.path.dirname(file_path)

    try:
        os.stat(directory)
    except:
        os.mkdir(directory) 

    
    #Create error log
    error_log()
    
    #Get all style links
    style_links = Beer_Style_Links()
    print(str(len(style_links)) + " Total Style Links") #print total styles
    
    #Get beer links
    if limit:
        beer_links = []
        print("Limited to first 3 styles")
        for i in range(4):
            Beer_Links(style_links[i], 4, beer_links)
    else:
        beer_links = []
        print("Collecting all Styles")
        for i in range(len(style_links)-1):
            Beer_Links(style_links[i], 4, beer_links)

    print(str(len(beer_links)) + " Total Beer Links")

    #Create csv file to save scrapped data in.
    csvFile = open("BeerInformation.csv",'w', newline='')
    #Scrape beer links
    try:
        writer = csv.writer(csvFile)
        for i in range(len(beer_links)-1):
            if i % 50 == 0:
                print("Link Number: " + str(i))
            Beer_Info(beer_links[i], writer)    
    finally:
        #Close csv file.
        csvFile.close()



In [4]:
#Execute the Scrapping job
BeerAdvocateScrap(limit=False)

103 Total Style Links
Collecting all Styles
18833 Total Beer Links
Link Number: 0
Link Number: 50
Link Number: 100
Link Number: 150
Link Number: 200
Link Number: 250
Link Number: 300
Link Number: 350
Link Number: 400
Link Number: 450
Link Number: 500
Link Number: 550
Link Number: 600
Link Number: 650
Link Number: 700
Link Number: 750
Link Number: 800
Link Number: 850
Link Number: 900
Link Number: 950
Link Number: 1000
Link Number: 1050
Link Number: 1100
Link Number: 1150
Link Number: 1200
Link Number: 1250
Link Number: 1300
Link Number: 1350
Link Number: 1400
Link Number: 1450
Link Number: 1500
Link Number: 1550
Link Number: 1600
Link Number: 1650
Link Number: 1700
Link Number: 1750
Link Number: 1800
Link Number: 1850
Link Number: 1900
Link Number: 1950
Link Number: 2000
Link Number: 2050
Link Number: 2100
Link Number: 2150
Link Number: 2200
Link Number: 2250
Link Number: 2300
Link Number: 2350
Link Number: 2400
Link Number: 2450
Link Number: 2500
Link Number: 2550
Link Number: 2600
Li

In [None]:
'''Below is the data cleaning process along with high level data summarization'''

In [3]:
####Import the data
data = pd.read_csv('BeerInformation.csv', names=(1,2,3,4))
print(type(data))

print()
print(data.head())

print()
print(data.columns)

#Subset each column for indiviual processing
c1 = data.loc[:,1]
c2 = data.loc[:,2]
c3 = data.loc[:,3]
c4 = data.loc[:,4]

#Print top five row of each series
print()
print(c1[:5])

print()
print(c2[:5])

print()
print(c3[:5])

print()
print(c4[:5])

<class 'pandas.core.frame.DataFrame'>

                                                   1  \
0  b'||||BEER INFO||||Brewed by:||||New Belgium B...   
1  b"||||BEER INFO||||Brewed by:||||Tr\xc3\xb6egs...   
2  b'||||BEER INFO||||Brewed by:||||Green Flash B...   
3  b"||||BEER INFO||||Brewed by:||||Bell's Brewer...   
4  b'||||BEER INFO||||Brewed by:||||Tr\xc3\xb6egs...   

                                                   2  \
0        b'Fat Tire Amber Ale | New Belgium Brewing'   
1  b'Tr\xc3\xb6egs Nugget Nectar | Tr\xc3\xb6egs ...   
2  b'Green Flash Hop Head Red Ale | Green Flash B...   
3         b"Bell's Amber Ale | Bell's Brewery, Inc."   
4  b'Tr\xc3\xb6egs Hopback Amber Ale | Tr\xc3\xb6...   

                                                   3  \
0   b'||BA SCORE ||||3.6/5||Good||8,777 Ratings||||'   
1  b'||BA SCORE ||||4.32/5||Outstanding||8,569 Ra...   
2  b'||BA SCORE ||||3.97/5||Very Good||3,407 Rati...   
3  b'||BA SCORE ||||3.84/5||Very Good||3,145 Rati...   
4  b'||

In [33]:
#Clean C1: Beer Info

#Standardize deliminters
regex_pat = re.compile(r'([||]+)') 
c1_1 = c1.str.replace(regex_pat,'||') 

#Split the text  by delimiter into columns
c1_2 = [p.split('||') for p in c1_1.values]

#Cast as a dataframe
df = pd.DataFrame(c1_2)

#select only required columns
df1 = df[[3,5,6,7,9]]
print(df1.columns)

#Clean each column
regex_pat = re.compile(r'(Style:)')
df1.loc[:,5] = df1.loc[:,5].str.replace(regex_pat,'')

regex_pat = re.compile(r'([Alcohol by volume (ABV): %])')
df1.loc[:,6] = df1.loc[:,6].str.replace(regex_pat,'')

regex_pat = re.compile(r'(Availability:)')
df1.loc[:,7] = df1.loc[:,7].str.replace(regex_pat,'')

#rename columns
df1.columns = ['company', 'style', 'abv', 'availability', 'notes']
print(df1.head(2))

Int64Index([3, 5, 6, 7, 9], dtype='int64')
                         company                      style   abv  \
0            New Belgium Brewing   American Amber / Red Ale  5.20   
1  Tr\xc3\xb6egs Brewing Company   American Amber / Red Ale  7.50   

  availability                                              notes  
0   Year-round  No notes at this time.Added by kbub6f on 10-07...  
1       Spring  Squeeze those hops for all they're worth! Nugg...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [5]:
#Clean C2: Beer Name

#Standardize deliminters
regex_pat = re.compile(r'([||]+)')
c2_1 = c2.str.replace(regex_pat,'||')

#Clean data
regex_pat = re.compile(r'(b\')')
c2_1 = c2_1.str.replace(regex_pat,'')

regex_pat = re.compile(r'(")')
c2_1 = c2_1.str.replace(regex_pat,'')

#Split the text  by delimiter into columns

c2_1 = [p.split('||') for p in c2_1.values]

#Cast as a dataframe
df2 = pd.DataFrame(c2_1)

#select only required columns
df2 = df2[[0,1]]

#rename columns
df2.columns = ['beer_name', 'brewery']

print(df2.head())

                          beer_name                          brewery
0               Fat Tire Amber Ale              New Belgium Brewing'
1      Tr\xc3\xb6egs Nugget Nectar    Tr\xc3\xb6egs Brewing Company'
2     Green Flash Hop Head Red Ale          Green Flash Brewing Co.'
3                bBell's Amber Ale              Bell's Brewery, Inc.
4  Tr\xc3\xb6egs Hopback Amber Ale    Tr\xc3\xb6egs Brewing Company'


In [6]:
#Clean C3: Beer Rating

#Standardize deliminters
regex_pat = re.compile(r'([||]+)')
c3_1 = c3.str.replace(regex_pat,'||')

#Clean each column
regex_pat = re.compile(r'(b\')')
c3_1 = c3_1.str.replace(regex_pat,'')

regex_pat = re.compile(r'([/5])')
c3_1 = c3_1.str.replace(regex_pat,'')

regex_pat = re.compile(r'(Ratings)')
c3_1 = c3_1.str.replace(regex_pat,'')

regex_pat = re.compile(r'([,])')
c3_1 = c3_1.str.replace(regex_pat,'')

#Split the text  by delimiter into columns
c3_1 = [p.split('||') for p in c3_1.values]

#Cast as a dataframe
df3 = pd.DataFrame(c3_1)

#select only required columns
df3 = df3[[2,3,4]]

#rename columns
df3.columns = ['rating','rating_cat', 'number_rating']

print(df3.head())

  rating   rating_cat number_rating
0    3.6         Good         8777 
1   4.32  Outstanding          869 
2   3.97    Very Good         3407 
3   3.84    Very Good          314 
4   3.97    Very Good         3106 


In [7]:
#Clean C2: Beer Stats

#Standardize deliminters
regex_pat = re.compile(r'([||]+)')
c4_1 = c4.str.replace(regex_pat,'||')

#Clean each column
regex_pat = re.compile(r'(b\')')
c4_1 = c4_1.str.replace(regex_pat,'')

regex_pat = re.compile(r'([#,%])')
c4_1 = c4_1.str.replace(regex_pat,'')

#Split the text  by delimiter into columns
c4_1 = [p.split('||') for p in c4_1.values]

#Cast as a dataframe
df4 = pd.DataFrame(c4_1)

#select only required columns
df4 = df4[[3,5,7,9,11,15,17,19]]

#rename columns
df4.columns = ['ranking','reviews', 'ratings','pdev','bro_score','wants','gots','trade']

print(df4.head())

  ranking reviews ratings   pdev bro_score wants  gots trade
0   38156    2058    8777  14.72      3.6    244  2609     1
1     418    2551    8569   9.49        0   1427  1915    25
2    4405     964    3407  10.83        4    173   539     2
3   10953     980    3145  11.98     4.35    160   600     2
4    4417    1174    3106  10.58        0    127   588     5


In [8]:
#Merge the four dataframes and cast columns to numeric as appropriate
df = pd.concat([df1, df2, df3, df4], axis=1)

#Cast numeric columns to numeric from string
df['abv'] = pd.to_numeric(df['abv'], errors='coerce')
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df['number_rating'] = pd.to_numeric(df['number_rating'], errors='coerce')
df['ranking'] = pd.to_numeric(df['ranking'], errors='coerce')
df['reviews'] = pd.to_numeric(df['reviews'], errors='coerce')
df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce')
df['pdev'] = pd.to_numeric(df['pdev'], errors='coerce')
df['bro_score'] = pd.to_numeric(df['bro_score'], errors='coerce')
df['wants'] = pd.to_numeric(df['wants'], errors='coerce')
df['gots'] = pd.to_numeric(df['gots'], errors='coerce')
df['trade'] = pd.to_numeric(df['trade'], errors='coerce')


print(df.head(2))
print()

print(df.columns)
print()

print(df.shape)
print()

print(df.dtypes)

                         company                      style  abv availability  \
0            New Belgium Brewing   American Amber / Red Ale  5.2   Year-round   
1  Tr\xc3\xb6egs Brewing Company   American Amber / Red Ale  7.5       Spring   

                                               notes  \
0  No notes at this time.Added by kbub6f on 10-07...   
1  Squeeze those hops for all they're worth! Nugg...   

                      beer_name                          brewery  rating  \
0           Fat Tire Amber Ale              New Belgium Brewing'    3.60   
1  Tr\xc3\xb6egs Nugget Nectar    Tr\xc3\xb6egs Brewing Company'    4.32   

    rating_cat  number_rating  ranking  reviews  ratings   pdev  bro_score  \
0         Good         8777.0  38156.0     2058     8777  14.72        3.6   
1  Outstanding          869.0    418.0     2551     8569   9.49        0.0   

   wants  gots  trade  
0    244  2609      1  
1   1427  1915     25  

Index(['company', 'style', 'abv', 'availability', 

In [9]:
'''Summaries'''

'Summaries'

In [11]:
#Counts by Style
counts = df['style'].value_counts()

print("top 20 beer styles by beer count")
print(counts[:20])

print()
print("bottom 20 beer styles by beer count")
print(counts[-20:])

'''
By stype, the top beers appear to all have the same general count. This is expected as the web scrapper only took 
the first four pages of the style. The depth of the scrape would need to be set deeper to determine the beer with the 
most beer styles.

For the beers with the least styles however, just scrapping the first four pages did provide enough information. 
The beer styles with the least beers listed include :
 - Faro, Happoshu, Sahti, Black and Tan, and Eisbock
'''

top 20 beer styles by beer count
 Old Ale                             197
 Maibock / Helles Bock               196
 American Malt Liquor                196
 Flanders Red Ale                    196
 American Double / Imperial Stout    196
 Rauchbier                           196
 English Bitter                      196
 Witbier                             196
 Pumpkin Ale                         196
 Light Lager                         196
 American Double / Imperial IPA      196
 Euro Pale Lager                     196
 German Pilsener                     196
 Weizenbock                          196
 Scotch Ale / Wee Heavy              196
 Gose                                196
 Schwarzbier                         196
 Belgian Dark Ale                    196
 M\xc3\xa4rzen / Oktoberfest         196
 English Brown Ale                   196
Name: style, dtype: int64

bottom 20 beer styles by beer count
 Euro Strong Lager                                196
 Cream Ale                    

'\nBy stype, the top beers appear to all have the same general count. This is expected as the web scrapper only took \nthe first four pages of the style. The depth of the scrape would need to be set deeper to determine the beer with the \nmost beer styles.\n\nFor the beers with the least styles however, just scrapping the first four pages did provide enough information. \nThe beer styles with the least beers listed include :\n - Faro, Happoshu, Sahti, Black and Tan, and Eisbock\n'

In [13]:
#Counts by Company
counts = df['company'].value_counts()

print('Top 20 companies by beer count')
print(counts[:20])

'''
Of the beers listed on the first four pages of the styles, Bostom Beer Company has the most listed.
'''

print()
print('Bottom 20 companies by beer count')
print(counts[-20:])

'''
There are many companies with only one beer listed.
'''

Top 20 companies by beer count
Boston Beer Company (Samuel Adams)        99
Anheuser-Busch                            85
Short's Brewing Company                   85
Mikkeller ApS                             82
Goose Island Beer Co.                     72
Victory Brewing Company - Downingtown     70
New Belgium Brewing                       70
The Bruery                                69
Matt Brewing Company / Saranac Brewery    67
Sierra Nevada Brewing Co.                 64
Cigar City Brewing                        61
3 Floyds Brewing Co.                      58
Rogue Ales                                58
Pipeworks Brewing Company                 58
Ballast Point Brewing Company             55
Bell's Brewery, Inc.                      49
The Schlafly Tap Room                     49
Hill Farmstead Brewery                    49
Dogfish Head Craft Brewery                48
Revolution Brewing                        48
Name: company, dtype: int64

Bottom 20 companies by beer count
Lawson

'\nThere are many companies with only one beer listed.\n'

In [14]:
#Counts by Availability
counts = df['availability'].value_counts()

print('Beer Availability')
print(counts)

'''
Most beers scrapped are offered year round. A large number of beers are offered on a rotating basis.
'''

Beer Availability
 Year-round               8252
 Rotating                 6907
 Winter                   1207
 Fall                      855
 Summer                    565
 Spring                    536
 Limited (brewed once)     509
Name: availability, dtype: int64


'\nMost beers scrapped are offered year round. A large number of beers are offered on a rotating basis.\n'

In [18]:
#Counts by Rating Category
counts = df['rating_cat'].value_counts()

print("Beer Ratings (categorical)")
print(counts)

counts = df['rating_cat'].value_counts(normalize=True)

'''
Only 204 (1%) beers are reated as world class. 50% of beers are rated as good/very good.
'''

print()
print("Beer Ratings (categorical)")
print(counts)


Beer Ratings (categorical)
Very Good      5460
Good           4868
Okay           3198
Exceptional    2642
Poor           1224
Outstanding     872
0               226
World-Class     204
Awful           137
Name: rating_cat, dtype: int64

Beer Ratings (categorical)
Very Good      0.289947
Good           0.258510
Okay           0.169826
Exceptional    0.140301
Poor           0.064999
Outstanding    0.046307
0              0.012001
World-Class    0.010833
Awful          0.007275
Name: rating_cat, dtype: float64


In [37]:
#Summary stats by abv
print("Summary of Alcohol By Volume")
print(df['abv'].describe())

Summary of Alcohol By Volume
count    18036.000000
mean         6.507427
std          2.305773
min          0.010000
25%          5.000000
50%          6.000000
75%          7.500000
max         57.500000
Name: abv, dtype: float64


In [19]:
#Summary stats by rating
print("Summary by Rating (out of 5)")
print(df['rating'].describe())

'''
The mean rating is 3.6 and the median is 3.74 out of 5.
'''

Summary by Rating (out of 5)
count    18826.000000
mean         3.610229
std          0.608530
min          0.000000
25%          3.400000
50%          3.740000
75%          3.930000
max          4.980000
Name: rating, dtype: float64


'\nThe mean rating is 3.6 and the median is 3.74 out of 5.\n'

In [20]:
#Summary stats by number of ratings
print('Summary of the number of ratings')
print(df['number_rating'].describe())

'''
The mean number of ratings is 226 per beer while the median is only 38. The number of ratings is very scewed with the 
beer with the max number of ratings having over 16K ratings.
'''

Summary of the number of ratings
count    17587.000000
mean       226.960823
std        713.351119
min          0.000000
25%         11.000000
50%         38.000000
75%        140.000000
max      16769.000000
Name: number_rating, dtype: float64


'\nThe mean number of ratings is 226 per beer while the median is only 38. The number of ratings is very scewed with the \nbeer with the max number of ratings having over 16K ratings.\n'

In [21]:
#Summary stats by abv
print("Summary of number of Reviews")
print(df['reviews'].describe())

'''
The number of reviews is right skewed. The mean is higher than the median. Most beers get only a handful of reviews (17) or
less though a handful get a lot (over 100 or even 1000)
'''

Summary of number of Reviews
count    18831.000000
mean        89.779778
std        242.458646
min          0.000000
25%          4.000000
50%         17.000000
75%         63.000000
max       3925.000000
Name: reviews, dtype: float64


'\nThe number of reviews is right skewed. The mean is higher than the median. Most beers get only a handful of reviews (17) or\nless though a handful get a lot (over 100 or even 1000)\n'

In [20]:
#Summary stats by rating
print("Summary of Ratings")
print(df['ratings'].describe())

'''
The number of reviews is right skewed. The mean is higher than the median. Most beers get only a handful of reviews (55) or
less though a handful get a lot (over 200 or even 1000)
'''

count    18831.000000
mean       292.985715
std        829.072475
min          0.000000
25%         15.000000
50%         55.000000
75%        197.000000
max      16769.000000
Name: ratings, dtype: float64


In [22]:
#Weighted Average Rating by Style
grouped = df.groupby('style')

def wavg(group):
    d = group['rating']
    w = group['ratings']
    return (d * w).sum() / w.sum()

wa = grouped.apply(wavg).sort_values(ascending=False)

print("Top 20 Styles")
print(wa[:20])

print()
print("Bottom 20 Styles")
print(wa[-20:])

'''
The top rated styles have on average a score of 4 or more. While the lowest rated styles have a rating of 3 or less.

The top rated styles are Imperial Stout/Gueze, Imperial IPA, and Wild Ale.
'''

Top 20 Styles
style
 American Double / Imperial Stout                 4.269310
 Gueuze                                           4.237785
 American Double / Imperial IPA                   4.229711
 American Wild Ale                                4.225274
 Russian Imperial Stout                           4.210564
 Quadrupel (Quad)                                 4.176036
 Eisbock                                          4.157520
 Flanders Red Ale                                 4.150798
 Lambic - Unblended                               4.138707
 Lambic - Fruit                                   4.125005
 Weizenbock                                       4.098292
 American Strong Ale                              4.086044
 English Barleywine                               4.084130
 American IPA                                     4.061940
 Bi\xc3\xa8re de Champagne / Bi\xc3\xa8re Brut    4.050740
 Belgian Strong Dark Ale                          4.036254
 American Barleywine                

'\nThe top rated styles have on average a score of 4 or more. While the lowest rated styles have a rating of 3 or less.\n\nThe top rated styles are Imperial Stout/Gueze, Imperial IPA, and Wild Ale.\n'

In [23]:
#Weighted Average Rating by Company
grouped = df.groupby('company')

def wavg(group):
    d = group['rating']
    w = group['ratings']+.00000000000000000000000001
    return (d * w).sum() / w.sum()

wa = grouped.apply(wavg).sort_values(ascending=False)

print("Top 20 Company")
print(wa[:20])

print()
print("Bottom 20 Company")
print(wa[-20:])

'''
The top rate companies all appear to be craft breweries as non of the main brands appear in the top 20. For the 
lowest rated companies, they all have 0 ratings.
'''

Top 20 Company
company
Broomtail Craft Brewery                                       4.820000
Prairie Sun Brewery                                           4.770000
Bierkeller                                                    4.720000
Thunder Island Brewing                                        4.670000
The Alchemist Brewery and Visitors Center                     4.630484
Faustino \xe2\x80\x93 Microcervejeira, Lda.                   4.600000
Fixed Wheel Brewery                                           4.600000
Brasserie du Haut-Bu\xc3\xabch                                4.600000
Lawson's Finest Liquids                                       4.590990
Tree House Brewing Company                                    4.552759
Brouwerij Westvleteren (Sint-Sixtusabdij van Westvleteren)    4.534042
Noble Order Brewing Company                                   4.490000
Lovibonds Brewery Ltd                                         4.490000
Side Project Brewing                                  

'\nThe top rate companies all appear to be craft breweries as non of the main brands appear in the top 20. For the \nlowest rated companies, they all have 0 ratings.\n'