In [153]:
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas
from geopy.geocoders import Nominatim

## Geotagging the scraped Data

In [154]:
df = pd.read_csv('all_reviews.csv')
locations_df = pd.read_csv('locations_geotagged.csv')
beer_df = pd.read_csv('beer_deets.csv')

In [155]:
df

Unnamed: 0,beer_name,beer_rating,brewery_location,brewery_name,review_content,reviewer_name
0,St. Peter's Ruby Red Ale,3.3,"Bungay, Suffolk",St. Peter's Brewery,"Ruby color, offwhite coam. Aroma and taste: fl...",alex_leit
1,Shepherd Neame Spitfire (Bottle),2.2,"Faversham, Kent",Shepherd Neame,Disappointing show from the lads at â€˜Britainâ€™s...,shivermetimber.gray
2,Arbor Rocketman,3.8,"Bristol, Gloucestershire",Arbor Ales,"Puszka. MÄ™tne o Å¼Ã³Å‚tym kolorze. Piana biaÅ‚a,ob...",MarcinG
3,Shepherd Neame India Pale Ale (Bottle),3.4,"Faversham, Kent",Shepherd Neame,Bottle. Amber with bronze color and white foam...,alex_leit
4,Shepherd Neame Christmas Ale,3.3,"Faversham, Kent",Shepherd Neame,Bottle. Aroma: English hops and malts with tof...,alex_leit
...,...,...,...,...,...,...
32278,The Duck-Rabbit Brown Ale,1.2,-2,The Duck-Rabbit Craft Brewery,"Got as a gift 6-pack. Sour, thin, over-carbona...",Intladvr1
32279,Miller Genuine Draft (MGD),4.4,+2.8,Miller Brewing Company (Molson Coors),"Great hot day refreshing lager,goes down smoot...",wootwoot83
32280,Gritty McDuffs Lions Pride Brown Ale (Best Bro...,5.0,+2.2,Gritty McDuffs,Very good brown ale. Big surprise to read the ...,jfsebastian
32281,New Holland Dragon's Milk - White,1.5,-1.8,New Holland Brewing Company,"Can from Total Wine in Sunnyvale, CA. Aroma is...",tightslice


In [156]:
locations_df

Unnamed: 0,locations,latitude,longitude,country
0,"Bungay, Suffolk",52.456225,1.437341,United Kingdom
1,"Faversham, Kent",51.314409,0.891189,United Kingdom
2,"Bristol, Gloucestershire",51.852767,-2.255369,United Kingdom
3,"Round Rock, Texas",30.508235,-97.678893,United States of America
4,"Brooklyn, New York",40.650104,-73.949582,United States of America
...,...,...,...,...
2481,"Rotherham, South Yorkshire",53.431042,-1.355187,United Kingdom
2482,"Penn Yann, New York",,,
2483,"Tortworth, Gloucestershire",51.636745,-2.431266,United Kingdom
2484,"Merzig, Saarland",49.442702,6.637490,Germany


In [157]:
geolocator = Nominatim(user_agent="None")

In [158]:
locations_new = pd.DataFrame(df.brewery_location[~df.brewery_location.isin(locations_df.locations)].unique(),columns=['locations'])
print(len(locations_new), "new locations to search for! Will take", len(locations_new)/2, "seconds to run.")

31 new locations to search for! Will take 15.5 seconds to run.


In [159]:
def tryconvert(x):
        try:
            return geolocator.geocode(x).latitude
        except:
            return None
    

In [160]:
def tryconvertlong(x):
        try:
            return geolocator.geocode(x).longitude
        except:
            return None

In [161]:
locations_new["latitude"] = locations_new.locations.apply(lambda x: tryconvert(x))

In [162]:
locations_new["longitude"] = locations_new.locations.apply(lambda x: tryconvertlong(x))

In [163]:
countries = []
for row in locations_new.index:
    try:
        loc = geolocator.reverse([locations_new.iloc[row,1],locations_new.iloc[row,2]], language='en')
        countries.append(loc.raw['address']['country'])
    except:
        countries.append(None)
        
    

In [164]:
locations_new['country'] = countries

In [165]:
locations_new

Unnamed: 0,locations,latitude,longitude,country
0,"Huntsville, Alabama",34.729847,-86.585901,United States of America
1,"Kingsport, Tennessee",36.550238,-82.559429,United States of America
2,"Hammondsport, New York",42.407849,-77.223587,United States of America
3,"Wetteren, Belgium",51.00688,3.885549,Belgium
4,"Rock Island, Illinois",41.441179,-90.576614,United States of America
5,"Gaildorf, Baden-WÃ¼rttemberg",49.00026,9.769853,Germany
6,"Little Eaton, Derbyshire",52.968932,-1.461467,United Kingdom
7,"Workington, Cumbria",54.641572,-3.545929,United Kingdom
8,"Schweppenhausen, Rhineland-Palatinate",49.925908,7.800547,Germany
9,"Boca Raton , Florida",26.358688,-80.083098,United States of America


In [166]:
locations_df = pd.concat([locations_df,locations_new])
locations_df.to_csv('locations_geotagged.csv',index=False)

## Merging and cleaning

In [167]:
beer_df

Unnamed: 0,return_string,flag,type,abv,rb_overall_score,rb_style_score,rb_user_rating,rb_rating_number,name_found
0,St. Peter's Ruby Red Ale\nðŸ‡¬ðŸ‡§ Bitter - Ordinary...,ðŸ‡¬ðŸ‡§,Bitter - Ordinary / Best,4.3,49,95,3.25,415.0,St. Peter's Ruby Red Ale
1,Shepherd Neame Spitfire (Bottle)\nðŸ‡¬ðŸ‡§ Bitter - ...,ðŸ‡¬ðŸ‡§,Bitter - Ordinary / Best,4.5,34,59,3.05,1083.0,Shepherd Neame Spitfire (Bottle)
2,Arbor Rocketman\nðŸ‡¬ðŸ‡§ IPA â€¢ 6.0%\n95\n97\n3.74\n...,ðŸ‡¬ðŸ‡§,IPA,6.0,95,97,3.74,59.0,Arbor Rocketman
3,Shepherd Neame India Pale Ale (Bottle)\nðŸ‡¬ðŸ‡§ IPA...,ðŸ‡¬ðŸ‡§,IPA - English,6.1,46,47,3.22,420.0,Shepherd Neame India Pale Ale (Bottle)
4,Shepherd Neame Christmas Ale\nðŸ‡¬ðŸ‡§ Strong Ale - ...,ðŸ‡¬ðŸ‡§,Strong Ale - English,7.0,45,46,3.18,409.0,Shepherd Neame Christmas Ale
...,...,...,...,...,...,...,...,...,...
17143,Fort Orange Galactic Explosion\nðŸ‡ºðŸ‡¸ IPA - Hazy ...,ðŸ‡ºðŸ‡¸,IPA - Hazy / NEIPA,6.3,-,-,4.00,1.0,Fort Orange Galactic Explosion
17144,Source Single Silo Series: Ultra Mosaic\nðŸ‡ºðŸ‡¸ II...,ðŸ‡ºðŸ‡¸,IIPA DIPA - Hazy / Double NEIPA,8.0,-,-,4.20,1.0,Source Single Silo Series: Ultra Mosaic
17145,Yellowhammer Coffee Imperial Stout\nðŸ‡ºðŸ‡¸ Stout -...,ðŸ‡ºðŸ‡¸,Stout - Imperial,8.0,-,-,3.10,1.0,Yellowhammer Coffee Imperial Stout
17146,Anyday ROSÃ‰\nðŸ‡ºðŸ‡¸ Apple Cider - Hopped / Malted ...,ðŸ‡ºðŸ‡¸,Apple Cider - Hopped / Malted / Graf,,-,-,2.50,1.0,Anyday ROSÃ‰


In [168]:
# deal with the price issue
price_df = beer_df[beer_df.rb_overall_score.str.contains('Available')].copy()
beer_df = beer_df[~beer_df.rb_overall_score.str.contains('Available')]
price_df['rb_overall_score'] = price_df.return_string.apply(lambda x: x.split('\n')[3])
price_df['rb_style_score'] = price_df.return_string.apply(lambda x: x.split('\n')[4])
price_df['rb_user_rating'] = price_df.return_string.apply(lambda x: x.split('\n')[5])
price_df['rb_rating_number'] = price_df.return_string.apply(lambda x: x.split('\n')[6].replace("(","").replace(")",""))
beer_df = pd.concat([beer_df,price_df])
print("Number of issue beers sorted:", len(price_df))


Number of issue beers sorted: 291


In [169]:
# set the types to float (cant do for the overall score and rating as there are two many None "-")
beer_df.rb_user_rating = beer_df.rb_user_rating.astype(float)
beer_df.rb_rating_number = beer_df.rb_rating_number.astype(float)

In [170]:
# Creating DF with all details
df_all = pd.merge(df,locations_df,how='left',left_on='brewery_location',right_on='locations')
df_all = pd.merge(df_all, beer_df, how='left', left_on='beer_name',right_on='name_found')
df_all


Unnamed: 0,beer_name,beer_rating,brewery_location,brewery_name,review_content,reviewer_name,locations,latitude,longitude,country,return_string,flag,type,abv,rb_overall_score,rb_style_score,rb_user_rating,rb_rating_number,name_found
0,St. Peter's Ruby Red Ale,3.3,"Bungay, Suffolk",St. Peter's Brewery,"Ruby color, offwhite coam. Aroma and taste: fl...",alex_leit,"Bungay, Suffolk",52.456225,1.437341,United Kingdom,St. Peter's Ruby Red Ale\nðŸ‡¬ðŸ‡§ Bitter - Ordinary...,ðŸ‡¬ðŸ‡§,Bitter - Ordinary / Best,4.3,49,95,3.25,415.0,St. Peter's Ruby Red Ale
1,Shepherd Neame Spitfire (Bottle),2.2,"Faversham, Kent",Shepherd Neame,Disappointing show from the lads at â€˜Britainâ€™s...,shivermetimber.gray,"Faversham, Kent",51.314409,0.891189,United Kingdom,Shepherd Neame Spitfire (Bottle)\nðŸ‡¬ðŸ‡§ Bitter - ...,ðŸ‡¬ðŸ‡§,Bitter - Ordinary / Best,4.5,34,59,3.05,1083.0,Shepherd Neame Spitfire (Bottle)
2,Arbor Rocketman,3.8,"Bristol, Gloucestershire",Arbor Ales,"Puszka. MÄ™tne o Å¼Ã³Å‚tym kolorze. Piana biaÅ‚a,ob...",MarcinG,"Bristol, Gloucestershire",51.852767,-2.255369,United Kingdom,Arbor Rocketman\nðŸ‡¬ðŸ‡§ IPA â€¢ 6.0%\n95\n97\n3.74\n...,ðŸ‡¬ðŸ‡§,IPA,6.0,95,97,3.74,59.0,Arbor Rocketman
3,Shepherd Neame India Pale Ale (Bottle),3.4,"Faversham, Kent",Shepherd Neame,Bottle. Amber with bronze color and white foam...,alex_leit,"Faversham, Kent",51.314409,0.891189,United Kingdom,Shepherd Neame India Pale Ale (Bottle)\nðŸ‡¬ðŸ‡§ IPA...,ðŸ‡¬ðŸ‡§,IPA - English,6.1,46,47,3.22,420.0,Shepherd Neame India Pale Ale (Bottle)
4,Shepherd Neame Christmas Ale,3.3,"Faversham, Kent",Shepherd Neame,Bottle. Aroma: English hops and malts with tof...,alex_leit,"Faversham, Kent",51.314409,0.891189,United Kingdom,Shepherd Neame Christmas Ale\nðŸ‡¬ðŸ‡§ Strong Ale - ...,ðŸ‡¬ðŸ‡§,Strong Ale - English,7.0,45,46,3.18,409.0,Shepherd Neame Christmas Ale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32590,The Duck-Rabbit Brown Ale,1.2,-2,The Duck-Rabbit Craft Brewery,"Got as a gift 6-pack. Sour, thin, over-carbona...",Intladvr1,-2,-23.604131,-69.084278,Chile,The Duck-Rabbit Brown Ale\nðŸ‡ºðŸ‡¸ Brown Ale â€¢ 5.6%...,ðŸ‡ºðŸ‡¸,Brown Ale,5.6,49,64,3.24,514.0,The Duck-Rabbit Brown Ale
32591,Miller Genuine Draft (MGD),4.4,+2.8,Miller Brewing Company (Molson Coors),"Great hot day refreshing lager,goes down smoot...",wootwoot83,+2.8,54.379170,10.982190,Germany,Miller Genuine Draft (MGD)\nðŸ‡ºðŸ‡¸ Pale Lager - No...,ðŸ‡ºðŸ‡¸,Pale Lager - North Am. / Light / Rice,4.7,0,7,1.82,2199.0,Miller Genuine Draft (MGD)
32592,Gritty McDuffs Lions Pride Brown Ale (Best Bro...,5.0,+2.2,Gritty McDuffs,Very good brown ale. Big surprise to read the ...,jfsebastian,+2.2,48.793281,9.967117,Germany,Gritty McDuffs Lions Pride Brown Ale (Best Bro...,ðŸ‡ºðŸ‡¸,Brown Ale,4.1,19,15,2.81,130.0,Gritty McDuffs Lions Pride Brown Ale (Best Bro...
32593,New Holland Dragon's Milk - White,1.5,-1.8,New Holland Brewing Company,"Can from Total Wine in Sunnyvale, CA. Aroma is...",tightslice,-1.8,48.780981,10.037026,Germany,New Holland Dragon's Milk - White\nðŸ‡ºðŸ‡¸ Stout â€¢ ...,ðŸ‡ºðŸ‡¸,Stout,6.0,65,60,3.38,77.0,New Holland Dragon's Milk - White


In [171]:
# drop excess columns
df_all.drop(["locations","return_string","name_found"],axis=1,inplace=True)
# drop rows with ratings instead of location
print(len(df_all[df_all['brewery_location'].str.len()<4]),"columns without locations dropped.")
df_all = df_all[~df_all['brewery_location'].str.len()<4]
# drop all NaNs
print(len(df_all) - len(df_all.dropna()), "columns with NaNs dropped.")
df_all.dropna(inplace=True)


221 columns without locations dropped.
2436 columns with NaNs dropped.


In [172]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30159 entries, 0 to 32594
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   beer_name         30159 non-null  object 
 1   beer_rating       30159 non-null  float64
 2   brewery_location  30159 non-null  object 
 3   brewery_name      30159 non-null  object 
 4   review_content    30159 non-null  object 
 5   reviewer_name     30159 non-null  object 
 6   latitude          30159 non-null  float64
 7   longitude         30159 non-null  float64
 8   country           30159 non-null  object 
 9   flag              30159 non-null  object 
 10  type              30159 non-null  object 
 11  abv               30159 non-null  float64
 12  rb_overall_score  30159 non-null  object 
 13  rb_style_score    30159 non-null  object 
 14  rb_user_rating    30159 non-null  float64
 15  rb_rating_number  30159 non-null  float64
dtypes: float64(6), object(10)
memory usage: 

## Deal with Non-English Descriptions

## Save down cleaned data

In [173]:
df_all.to_csv("data_clean.csv",index=False)