In [1]:
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas
from geopy.geocoders import Nominatim

## Geotagging the scraped Data

In [2]:
df = pd.read_csv('all_reviews.csv')
locations_df = pd.read_csv('locations_geotagged.csv')
beer_df = pd.read_csv('beer_deets.csv')

In [3]:
df

Unnamed: 0,beer_name,beer_rating,brewery_location,brewery_name,review_content,reviewer_name
0,St. Peter's Ruby Red Ale,3.3,"Bungay, Suffolk",St. Peter's Brewery,"Ruby color, offwhite coam. Aroma and taste: fl...",alex_leit
1,Shepherd Neame Spitfire (Bottle),2.2,"Faversham, Kent",Shepherd Neame,Disappointing show from the lads at ‘Britain’s...,shivermetimber.gray
2,Arbor Rocketman,3.8,"Bristol, Gloucestershire",Arbor Ales,"Puszka. Mętne o żółtym kolorze. Piana biała,ob...",MarcinG
3,Shepherd Neame India Pale Ale (Bottle),3.4,"Faversham, Kent",Shepherd Neame,Bottle. Amber with bronze color and white foam...,alex_leit
4,Shepherd Neame Christmas Ale,3.3,"Faversham, Kent",Shepherd Neame,Bottle. Aroma: English hops and malts with tof...,alex_leit
...,...,...,...,...,...,...
49098,Oettinger Pils,0.5,-1.9,Oettinger Bier Gruppe,"pours pale straw color thin white head ,aroma ...",rat
49099,Goddards Ducks Folly,5.0,+1.9,Goddards Brewery,Superb strong ale on draught at The Golden Bow...,TrappistScout61
49100,Narragansett Lager Beer,5.0,+2.5,Narragansett Brewing Company,Smooth and refreshing on a hot summer day afte...,cah
49101,Evil Twin Even More Coco Jesus,2.3,-1.6,Evil Twin Brewing,"Can, early 2018. Soapy coconut, lots of molass...",ElDesmadre


In [4]:
locations_df

Unnamed: 0,locations,latitude,longitude,country
0,"Bungay, Suffolk",52.456225,1.437341,United Kingdom
1,"Faversham, Kent",51.314409,0.891189,United Kingdom
2,"Bristol, Gloucestershire",51.852767,-2.255369,United Kingdom
3,"Round Rock, Texas",30.508235,-97.678893,United States of America
4,"Brooklyn, New York",40.650104,-73.949582,United States of America
...,...,...,...,...
2512,"Humble, Texas",29.998831,-95.262155,United States of America
2513,"Acton, Ontario",43.634068,-80.035112,Canada
2514,"Orillia, Ontario",44.609206,-79.417559,Canada
2515,"Middletown, Ohio",39.515058,-84.398276,United States of America


In [5]:
geolocator = Nominatim(user_agent="None")

In [6]:
locations_new = pd.DataFrame(df.brewery_location[~df.brewery_location.isin(locations_df.locations)].unique(),columns=['locations'])
print(len(locations_new), "new locations to search for! Will take", len(locations_new)/2, "seconds to run.")

599 new locations to search for! Will take 299.5 seconds to run.


In [7]:
def tryconvert(x):
        try:
            return geolocator.geocode(x).latitude
        except:
            return None
    

In [8]:
def tryconvertlong(x):
        try:
            return geolocator.geocode(x).longitude
        except:
            return None

In [9]:
locations_new["latitude"] = locations_new.locations.apply(lambda x: tryconvert(x))

In [10]:
locations_new["longitude"] = locations_new.locations.apply(lambda x: tryconvertlong(x))

In [11]:
countries = []
for row in locations_new.index:
    try:
        loc = geolocator.reverse([locations_new.iloc[row,1],locations_new.iloc[row,2]], language='en')
        countries.append(loc.raw['address']['country'])
    except:
        countries.append(None)
        
    

In [12]:
locations_new['country'] = countries

In [13]:
locations_new

Unnamed: 0,locations,latitude,longitude,country
0,"Stirling, Ontario",44.296394,-77.548552,Canada
1,"Pottenstein, Bavaria",49.772184,11.408741,Germany
2,"Havelock, Ontario",44.433223,-77.887293,Canada
3,"Wassertrüdingen, Bavaria",49.040673,10.596943,Germany
4,"Coventry , West Midlands",52.408181,-1.510477,United Kingdom
...,...,...,...,...
594,"Dentlein am Forst, Bavaria",49.147567,10.423854,Germany
595,"Sault Ste. Marie, Ontario",46.523910,-84.320068,Canada
596,"Bochum, North Rhine-Westphalia",51.481811,7.219664,Germany
597,"Barth, Mecklenburg-Vorpommern",54.369055,12.725858,Germany


In [14]:
locations_df = pd.concat([locations_df,locations_new])
locations_df.to_csv('locations_geotagged.csv',index=False)

## Merging and cleaning

In [15]:
beer_df

Unnamed: 0,return_string,flag,type,abv,rb_overall_score,rb_style_score,rb_user_rating,rb_rating_number,name_found
0,St. Peter's Ruby Red Ale\n🇬🇧 Bitter - Ordinary...,🇬🇧,Bitter - Ordinary / Best,4.3,49,95,3.25,415.0,St. Peter's Ruby Red Ale
1,Shepherd Neame Spitfire (Bottle)\n🇬🇧 Bitter - ...,🇬🇧,Bitter - Ordinary / Best,4.5,34,59,3.05,1083.0,Shepherd Neame Spitfire (Bottle)
2,Arbor Rocketman\n🇬🇧 IPA • 6.0%\n95\n97\n3.74\n...,🇬🇧,IPA,6.0,95,97,3.74,59.0,Arbor Rocketman
3,Shepherd Neame India Pale Ale (Bottle)\n🇬🇧 IPA...,🇬🇧,IPA - English,6.1,46,47,3.22,420.0,Shepherd Neame India Pale Ale (Bottle)
4,Shepherd Neame Christmas Ale\n🇬🇧 Strong Ale - ...,🇬🇧,Strong Ale - English,7.0,45,46,3.18,409.0,Shepherd Neame Christmas Ale
...,...,...,...,...,...,...,...,...,...
25526,"Three Hills 49.7374° N, 13.3736° E\n🇬🇧 Pilsene...",🇬🇧,Pilsener - Czech / Svetlý,,-,-,3.50,1.0,"Three Hills 49.7374° N, 13.3736° E"
25527,450 North Slushy XL - Brunch Blackberry\n🇺🇸 Be...,🇺🇸,Berliner Weisse - Flavored,,-,-,3.90,3.0,450 North Slushy XL - Brunch Blackberry
25528,Gallicus Broue Ha Ha Decem\n🇨🇦 IPA\n-\n-\n3.60...,🇨🇦,IPA,,-,-,3.60,2.0,Gallicus Broue Ha Ha Decem
25529,Court Avenue Extra Pale Ale\n🇺🇸 Pale Ale - Ame...,🇺🇸,Pale Ale - American / APA,,-,-,3.30,1.0,Court Avenue Extra Pale Ale


In [16]:
# deal with the price issue
price_df = beer_df[beer_df.rb_overall_score.str.contains('Available')].copy()
beer_df = beer_df[~beer_df.rb_overall_score.str.contains('Available')]
price_df['rb_overall_score'] = price_df.return_string.apply(lambda x: x.split('\n')[3])
price_df['rb_style_score'] = price_df.return_string.apply(lambda x: x.split('\n')[4])
price_df['rb_user_rating'] = price_df.return_string.apply(lambda x: x.split('\n')[5])
price_df['rb_rating_number'] = price_df.return_string.apply(lambda x: x.split('\n')[6].replace("(","").replace(")",""))
beer_df = pd.concat([beer_df,price_df])
print("Number of issue beers sorted:", len(price_df))


Number of issue beers sorted: 338


In [17]:
# set the types to float (cant do for the overall score and rating as there are two many None "-")
beer_df.rb_user_rating = beer_df.rb_user_rating.astype(float)
beer_df.rb_rating_number = beer_df.rb_rating_number.astype(float)

In [18]:
# Creating DF with all details
df_all = pd.merge(df,locations_df,how='left',left_on='brewery_location',right_on='locations')
df_all = pd.merge(df_all, beer_df, how='left', left_on='beer_name',right_on='name_found')
df_all


Unnamed: 0,beer_name,beer_rating,brewery_location,brewery_name,review_content,reviewer_name,locations,latitude,longitude,country,return_string,flag,type,abv,rb_overall_score,rb_style_score,rb_user_rating,rb_rating_number,name_found
0,St. Peter's Ruby Red Ale,3.3,"Bungay, Suffolk",St. Peter's Brewery,"Ruby color, offwhite coam. Aroma and taste: fl...",alex_leit,"Bungay, Suffolk",52.456225,1.437341,United Kingdom,St. Peter's Ruby Red Ale\n🇬🇧 Bitter - Ordinary...,🇬🇧,Bitter - Ordinary / Best,4.3,49,95,3.25,415.0,St. Peter's Ruby Red Ale
1,Shepherd Neame Spitfire (Bottle),2.2,"Faversham, Kent",Shepherd Neame,Disappointing show from the lads at ‘Britain’s...,shivermetimber.gray,"Faversham, Kent",51.314409,0.891189,United Kingdom,Shepherd Neame Spitfire (Bottle)\n🇬🇧 Bitter - ...,🇬🇧,Bitter - Ordinary / Best,4.5,34,59,3.05,1083.0,Shepherd Neame Spitfire (Bottle)
2,Arbor Rocketman,3.8,"Bristol, Gloucestershire",Arbor Ales,"Puszka. Mętne o żółtym kolorze. Piana biała,ob...",MarcinG,"Bristol, Gloucestershire",51.852767,-2.255369,United Kingdom,Arbor Rocketman\n🇬🇧 IPA • 6.0%\n95\n97\n3.74\n...,🇬🇧,IPA,6.0,95,97,3.74,59.0,Arbor Rocketman
3,Shepherd Neame India Pale Ale (Bottle),3.4,"Faversham, Kent",Shepherd Neame,Bottle. Amber with bronze color and white foam...,alex_leit,"Faversham, Kent",51.314409,0.891189,United Kingdom,Shepherd Neame India Pale Ale (Bottle)\n🇬🇧 IPA...,🇬🇧,IPA - English,6.1,46,47,3.22,420.0,Shepherd Neame India Pale Ale (Bottle)
4,Shepherd Neame Christmas Ale,3.3,"Faversham, Kent",Shepherd Neame,Bottle. Aroma: English hops and malts with tof...,alex_leit,"Faversham, Kent",51.314409,0.891189,United Kingdom,Shepherd Neame Christmas Ale\n🇬🇧 Strong Ale - ...,🇬🇧,Strong Ale - English,7.0,45,46,3.18,409.0,Shepherd Neame Christmas Ale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49608,Oettinger Pils,0.5,-1.9,Oettinger Bier Gruppe,"pours pale straw color thin white head ,aroma ...",rat,-1.9,49.024169,9.028337,Germany,Oettinger Pils\n🇩🇪 Pilsener / Pils / Pilsner •...,🇩🇪,Pilsener / Pils / Pilsner,4.7,6,13,2.37,461.0,Oettinger Pils
49609,Goddards Ducks Folly,5.0,+1.9,Goddards Brewery,Superb strong ale on draught at The Golden Bow...,TrappistScout61,+1.9,49.024169,9.028337,Germany,Goddards Ducks Folly\n🇬🇧 Bitter - Premium / St...,🇬🇧,Bitter - Premium / Strong / ESB,5.2,40,49,3.12,65.0,Goddards Ducks Folly
49610,Narragansett Lager Beer,5.0,+2.5,Narragansett Brewing Company,Smooth and refreshing on a hot summer day afte...,cah,+2.5,50.814444,14.361805,Czech Republic,Narragansett Lager Beer\n🇺🇸 Pale Lager - North...,🇺🇸,Pale Lager - North Am. / Light / Rice,5.0,8,85,2.58,438.0,Narragansett Lager Beer
49611,Evil Twin Even More Coco Jesus,2.3,-1.6,Evil Twin Brewing,"Can, early 2018. Soapy coconut, lots of molass...",ElDesmadre,-1.6,48.718182,9.989251,Germany,Evil Twin Even More Coco Jesus\n🇺🇸 Stout - Imp...,🇺🇸,Stout - Imperial Flavored,12.0,99,82,3.91,414.0,Evil Twin Even More Coco Jesus


In [19]:
# drop excess columns
df_all.drop(["locations","return_string","name_found"],axis=1,inplace=True)
# drop rows with ratings instead of location
print(len(df_all[df_all['brewery_location'].str.len()<4]),"columns without locations dropped.")
df_all = df_all[~df_all['brewery_location'].str.len()<4]
# drop all NaNs
print(len(df_all) - len(df_all.dropna()), "columns with NaNs dropped.")
df_all.dropna(inplace=True)


237 columns without locations dropped.
3942 columns with NaNs dropped.


In [20]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45671 entries, 0 to 49612
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   beer_name         45671 non-null  object 
 1   beer_rating       45671 non-null  float64
 2   brewery_location  45671 non-null  object 
 3   brewery_name      45671 non-null  object 
 4   review_content    45671 non-null  object 
 5   reviewer_name     45671 non-null  object 
 6   latitude          45671 non-null  float64
 7   longitude         45671 non-null  float64
 8   country           45671 non-null  object 
 9   flag              45671 non-null  object 
 10  type              45671 non-null  object 
 11  abv               45671 non-null  float64
 12  rb_overall_score  45671 non-null  object 
 13  rb_style_score    45671 non-null  object 
 14  rb_user_rating    45671 non-null  float64
 15  rb_rating_number  45671 non-null  float64
dtypes: float64(6), object(10)
memory usage: 

## Deal with Non-English Descriptions

## Save down cleaned data

In [22]:
df_all.to_csv("data_clean.csv",index=False)