In [233]:
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas
from geopy.geocoders import Nominatim

## Geotagging the scraped data

In [234]:
# Load the three inputs in one pass: the scraped reviews, the cache of
# already-geocoded brewery locations, and the per-beer details.
df, locations_df, beer_df = (
    pd.read_csv(csv_name)
    for csv_name in ('all_reviews.csv', 'locations_geotagged.csv', 'beer_deets.csv')
)

In [235]:
# Preview the raw review table (pandas abbreviates long frames in the output).
# NOTE(review): the last rows displayed carry numeric values in brewery_location;
# presumably a scraped rating landed in the wrong column for those reviews.
df

Unnamed: 0,beer_name,beer_rating,brewery_location,brewery_name,review_content,reviewer_name
0,St. Peter's Ruby Red Ale,3.3,"Bungay, Suffolk",St. Peter's Brewery,"Ruby color, offwhite coam. Aroma and taste: fl...",alex_leit
1,Shepherd Neame Spitfire (Bottle),2.2,"Faversham, Kent",Shepherd Neame,Disappointing show from the lads at ‘Britain’s...,shivermetimber.gray
2,Arbor Rocketman,3.8,"Bristol, Gloucestershire",Arbor Ales,"Puszka. Mętne o żółtym kolorze. Piana biała,ob...",MarcinG
3,Shepherd Neame India Pale Ale (Bottle),3.4,"Faversham, Kent",Shepherd Neame,Bottle. Amber with bronze color and white foam...,alex_leit
4,Shepherd Neame Christmas Ale,3.3,"Faversham, Kent",Shepherd Neame,Bottle. Aroma: English hops and malts with tof...,alex_leit
...,...,...,...,...,...,...
24191,Bowman Nutz,1.5,-1.5,Bowman Ales,"500ml bottle in 2015. Clear chestnut, with no ...",Beese
24192,Dogfish Head 120 Minute IPA,1.5,-2.4,Dogfish Head Brewery (Boston Beer Co.),"0,3 litre Bottle from Total Wine And More, Lag...",ChrisOfstad
24193,Lagunitas Newcastle Brown Ale (US version),1.0,-1.7,Lagunitas Brewing Company (Heineken),"Bottle from BevMo, Oceanside, Ca. Why is this ...",ChrisOfstad
24194,Samuel Smiths India Ale,1.8,-1.6,Samuel Smith,"Bought Nov 8, 2019, at 'Gastronom', Izhevsk, R...",PastorGL


In [236]:
# Preview the cache of previously geocoded locations
# (columns: locations, latitude, longitude, country).
locations_df

Unnamed: 0,locations,latitude,longitude,country
0,"Bungay, Suffolk",52.456225,1.437341,United Kingdom
1,"Faversham, Kent",51.314409,0.891189,United Kingdom
2,"Bristol, Gloucestershire",51.852767,-2.255369,United Kingdom
3,"Round Rock, Texas",30.508235,-97.678893,United States of America
4,"Brooklyn, New York",40.650104,-73.949582,United States of America
...,...,...,...,...
2180,"Hemsworth, West Yorkshire",53.613163,-1.360180,United Kingdom
2181,"O’Fallon, Missouri",38.791037,-90.703911,United States of America
2182,"Firestone, Colorado",40.112484,-104.936644,United States of America
2183,"Winter Park, Florida",28.597771,-81.351026,United States of America


In [237]:
# Nominatim geocoder client used by the forward and reverse lookups below.
# NOTE(review): user_agent is the literal string "None"; the Nominatim usage
# policy expects an identifier specific to the application - TODO replace it.
geolocator = Nominatim(user_agent="None")

In [238]:
# Collect the distinct brewery locations that are not in the geocoding cache yet.
is_cached = df.brewery_location.isin(locations_df.locations)
uncached_locations = df.brewery_location[~is_cached].unique()
locations_new = pd.DataFrame(uncached_locations, columns=['locations'])

# Rough runtime estimate printed for the user: half a second per new location.
n_new = len(locations_new)
print(n_new, "new locations to search for! Will take", n_new/2, "seconds to run.")

29 new locations to search for! Will take 14.5 seconds to run.


In [239]:
def tryconvert(x):
    """Return the latitude the geocoder reports for place name ``x``, or None.

    Parameters
    ----------
    x : the query passed unchanged to ``geolocator.geocode``.

    Returns
    -------
    The ``latitude`` attribute of the geocoding result, or ``None`` when the
    geocoder returns no match or the lookup raises an ordinary exception.
    """
    try:
        match = geolocator.geocode(x)
    except Exception:
        # Best-effort lookup: a failed request yields a missing value instead of
        # aborting the whole column. KeyboardInterrupt/SystemExit are not
        # Exception subclasses, so interrupting the kernel still works.
        return None
    # geocode() yields None when nothing matches the query.
    return None if match is None else match.latitude
    

In [240]:
def tryconvertlong(x):
    """Return the longitude the geocoder reports for place name ``x``, or None.

    Parameters
    ----------
    x : the query passed unchanged to ``geolocator.geocode``.

    Returns
    -------
    The ``longitude`` attribute of the geocoding result, or ``None`` when the
    geocoder returns no match or the lookup raises an ordinary exception.
    """
    try:
        match = geolocator.geocode(x)
    except Exception:
        # Best-effort lookup: a failed request yields a missing value instead of
        # aborting the whole column. KeyboardInterrupt/SystemExit are not
        # Exception subclasses, so interrupting the kernel still works.
        return None
    # geocode() yields None when nothing matches the query.
    return None if match is None else match.longitude

In [241]:
# Forward-geocode every new location string to its latitude (None on failure).
locations_new["latitude"] = locations_new["locations"].apply(tryconvert)

In [242]:
# Forward-geocode every new location string to its longitude (None on failure).
locations_new["longitude"] = locations_new["locations"].apply(tryconvertlong)

In [243]:
# Reverse-geocode each new location's coordinates to an English country name.
# Entries are None where no country could be determined.
countries = []
for lat, lon in zip(locations_new["latitude"], locations_new["longitude"]):
    # A location whose forward lookup failed has no coordinates to reverse.
    if pd.isna(lat) or pd.isna(lon):
        countries.append(None)
        continue
    try:
        loc = geolocator.reverse([lat, lon], language='en')
        countries.append(loc.raw['address']['country'])
    except Exception:
        # Best-effort: no match, a result without a 'country' entry, or a
        # failed request all yield None. Kernel interrupts are not swallowed.
        countries.append(None)
        
    

In [244]:
# Attach the reverse-geocoded country names as a new column.
locations_new = locations_new.assign(country=countries)

In [245]:
# Inspect the newly geocoded locations before they are added to the cache.
locations_new

Unnamed: 0,locations,latitude,longitude,country
0,"Royston, Hertfordshire",52.047274,-0.024647,United Kingdom
1,"Buttenheim, Bavaria",49.802517,11.029044,Germany
2,"San Clemente, California",33.427028,-117.612418,United States of America
3,"Aitkin, Minnesota",46.571482,-93.38476,United States of America
4,"Valemount, British Columbia",52.831512,-119.280579,Canada
5,"Phoenixville, Pennsylvania",40.130382,-75.514913,United States of America
6,"Chelan, Washington",47.84097,-120.016459,United States of America
7,"New Canaan, Connecticut",41.146763,-73.494845,United States of America
8,"Topsham, Maine",43.92758,-69.975884,United States of America
9,"Victor, Montana",46.414816,-114.1494,United States of America


In [246]:
# Add the newly geocoded places to the cache and persist it for the next run.
# Keeping one row per location string makes this cell safe to re-run (no
# repeated appends) and keeps the later left merge on 'locations' from
# multiplying review rows.
locations_df = (
    pd.concat([locations_df, locations_new], ignore_index=True)
    .drop_duplicates(subset='locations')
    .reset_index(drop=True)
)
locations_df.to_csv('locations_geotagged.csv', index=False)

## Merging and cleaning

In [247]:
# Preview the per-beer details; name_found is the key matched against beer_name
# when the frames are merged.
beer_df

Unnamed: 0,return_string,flag,type,abv,rb_overall_score,rb_style_score,rb_user_rating,rb_rating_number,name_found
0,St. Peter's Ruby Red Ale\n🇬🇧 Bitter - Ordinary...,🇬🇧,Bitter - Ordinary / Best,4.3,49,95,3.25,415.0,St. Peter's Ruby Red Ale
1,Shepherd Neame Spitfire (Bottle)\n🇬🇧 Bitter - ...,🇬🇧,Bitter - Ordinary / Best,4.5,34,59,3.05,1083.0,Shepherd Neame Spitfire (Bottle)
2,Arbor Rocketman\n🇬🇧 IPA • 6.0%\n95\n97\n3.74\n...,🇬🇧,IPA,6.0,95,97,3.74,59.0,Arbor Rocketman
3,Shepherd Neame India Pale Ale (Bottle)\n🇬🇧 IPA...,🇬🇧,IPA - English,6.1,46,47,3.22,420.0,Shepherd Neame India Pale Ale (Bottle)
4,Shepherd Neame Christmas Ale\n🇬🇧 Strong Ale - ...,🇬🇧,Strong Ale - English,7.0,45,46,3.18,409.0,Shepherd Neame Christmas Ale
...,...,...,...,...,...,...,...,...,...
13179,Coniston Olivers Light Ale\n🇬🇧 Blonde Ale / Go...,🇬🇧,Blonde Ale / Golden Ale,3.4,35,54,3.05,29.0,Coniston Olivers Light Ale
13180,"Coal Creek TAP ""Hey Victor"" Smoked Porter\n🇺🇸 ...",🇺🇸,Porter - Smoked,5.5,-,-,3.35,2.0,"Coal Creek TAP ""Hey Victor"" Smoked Porter"
13181,Bright Eye Betty White IPA\n🇨🇦 IPA - White\n-\...,🇨🇦,IPA - White,,-,-,3.40,1.0,Bright Eye Betty White IPA
13182,Sanitas View From Above IPA\n🇺🇸 IPA\n-\n-\n1.3...,🇺🇸,IPA,,-,-,1.30,1.0,Sanitas View From Above IPA


In [248]:
# Creating DF with all details
df_all = pd.merge(df,locations_df,how='left',left_on='brewery_location',right_on='locations')
df_all = pd.merge(df_all, beer_df, how='left', left_on='beer_name',right_on='name_found')
df_all


Unnamed: 0,beer_name,beer_rating,brewery_location,brewery_name,review_content,reviewer_name,locations,latitude,longitude,country,return_string,flag,type,abv,rb_overall_score,rb_style_score,rb_user_rating,rb_rating_number,name_found
0,St. Peter's Ruby Red Ale,3.3,"Bungay, Suffolk",St. Peter's Brewery,"Ruby color, offwhite coam. Aroma and taste: fl...",alex_leit,"Bungay, Suffolk",52.456225,1.437341,United Kingdom,St. Peter's Ruby Red Ale\n🇬🇧 Bitter - Ordinary...,🇬🇧,Bitter - Ordinary / Best,4.3,49,95,3.25,415.0,St. Peter's Ruby Red Ale
1,Shepherd Neame Spitfire (Bottle),2.2,"Faversham, Kent",Shepherd Neame,Disappointing show from the lads at ‘Britain’s...,shivermetimber.gray,"Faversham, Kent",51.314409,0.891189,United Kingdom,Shepherd Neame Spitfire (Bottle)\n🇬🇧 Bitter - ...,🇬🇧,Bitter - Ordinary / Best,4.5,34,59,3.05,1083.0,Shepherd Neame Spitfire (Bottle)
2,Arbor Rocketman,3.8,"Bristol, Gloucestershire",Arbor Ales,"Puszka. Mętne o żółtym kolorze. Piana biała,ob...",MarcinG,"Bristol, Gloucestershire",51.852767,-2.255369,United Kingdom,Arbor Rocketman\n🇬🇧 IPA • 6.0%\n95\n97\n3.74\n...,🇬🇧,IPA,6.0,95,97,3.74,59.0,Arbor Rocketman
3,Shepherd Neame India Pale Ale (Bottle),3.4,"Faversham, Kent",Shepherd Neame,Bottle. Amber with bronze color and white foam...,alex_leit,"Faversham, Kent",51.314409,0.891189,United Kingdom,Shepherd Neame India Pale Ale (Bottle)\n🇬🇧 IPA...,🇬🇧,IPA - English,6.1,46,47,3.22,420.0,Shepherd Neame India Pale Ale (Bottle)
4,Shepherd Neame Christmas Ale,3.3,"Faversham, Kent",Shepherd Neame,Bottle. Aroma: English hops and malts with tof...,alex_leit,"Faversham, Kent",51.314409,0.891189,United Kingdom,Shepherd Neame Christmas Ale\n🇬🇧 Strong Ale - ...,🇬🇧,Strong Ale - English,7.0,45,46,3.18,409.0,Shepherd Neame Christmas Ale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24433,Bowman Nutz,1.5,-1.5,Bowman Ales,"500ml bottle in 2015. Clear chestnut, with no ...",Beese,-1.5,52.583680,13.276598,Germany,Bowman Nutz\n🇬🇧 Bitter - Premium / Strong / ES...,🇬🇧,Bitter - Premium / Strong / ESB,5.0,36,44,3.02,19.0,Bowman Nutz
24434,Dogfish Head 120 Minute IPA,1.5,-2.4,Dogfish Head Brewery (Boston Beer Co.),"0,3 litre Bottle from Total Wine And More, Lag...",ChrisOfstad,-2.4,48.800511,9.991238,Germany,Dogfish Head 120 Minute IPA\n🇺🇸 IIPA DIPA - Im...,🇺🇸,IIPA DIPA - Imperial / Double IPA,16.5,99,98,3.93,2491.0,Dogfish Head 120 Minute IPA
24435,Lagunitas Newcastle Brown Ale (US version),1.0,-1.7,Lagunitas Brewing Company (Heineken),"Bottle from BevMo, Oceanside, Ca. Why is this ...",ChrisOfstad,-1.7,48.780313,10.039726,Germany,Lagunitas Newcastle Brown Ale (US version)\n🇺🇸...,🇺🇸,Brown Ale,4.7,15,9,1.47,10.0,Lagunitas Newcastle Brown Ale (US version)
24436,Samuel Smiths India Ale,1.8,-1.6,Samuel Smith,"Bought Nov 8, 2019, at 'Gastronom', Izhevsk, R...",PastorGL,-1.6,48.718182,9.989251,Germany,Samuel Smiths India Ale\n🇬🇧 Bitter - Premium /...,🇬🇧,Bitter - Premium / Strong / ESB,5.0,71,96,3.39,1694.0,Samuel Smiths India Ale


In [249]:
# drop excess columns
df_all.drop(["locations","return_string","name_found"],axis=1,inplace=True)
# drop rows with ratings instead of location
df_all = df_all[~df_all['brewery_location'].str.len()<4]
# drop all NaNs
print(len(df_all) - len(df_all.dropna()), "columns with NaNs dropped.")
df_all.dropna(inplace=True)


1779 columns with NaNs dropped.


In [250]:
# Check column dtypes and non-null counts of the cleaned frame.
# NOTE(review): the rb_* score columns show up as object dtype below, presumably
# because of '-' placeholders in the scraped data - confirm before numeric use.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22659 entries, 0 to 24437
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   beer_name         22659 non-null  object 
 1   beer_rating       22659 non-null  float64
 2   brewery_location  22659 non-null  object 
 3   brewery_name      22659 non-null  object 
 4   review_content    22659 non-null  object 
 5   reviewer_name     22659 non-null  object 
 6   latitude          22659 non-null  float64
 7   longitude         22659 non-null  float64
 8   country           22659 non-null  object 
 9   flag              22659 non-null  object 
 10  type              22659 non-null  object 
 11  abv               22659 non-null  float64
 12  rb_overall_score  22659 non-null  object 
 13  rb_style_score    22659 non-null  object 
 14  rb_user_rating    22659 non-null  object 
 15  rb_rating_number  22659 non-null  float64
dtypes: float64(5), object(11)
memory usage: 

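## Deal with Non-English Descriptions

**TODO:** not implemented yet. Reviews written in other languages (for example the Polish review of Arbor Rocketman) are currently kept unchanged.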

## Save the cleaned data

In [251]:
# Write the cleaned, merged reviews to disk; index=False leaves the DataFrame
# index out of the file.
df_all.to_csv("data_clean.csv",index=False)