In [1]:
import pandas as pd
import json

In [2]:
# Read the beer review dataset into a DataFrame
df = pd.read_csv('../../data/beer_reviews.csv')

In [11]:
print(len(df))
display(df.head())

722209


Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
10,163,Amstel Brouwerij B. V.,1010963392,3.0,2.0,3.0,fodeeoz,Light Lager,2.5,2.5,Amstel Light,3.5,436
18,163,Amstel Brouwerij B. V.,1010861086,2.5,3.0,3.0,jdhilt,Light Lager,2.0,2.0,Amstel Light,3.5,436
30,163,Amstel Brouwerij B. V.,1002109880,3.0,2.0,2.0,xXTequila,Light Lager,2.0,3.0,Amstel Light,3.5,436
40,163,Amstel Brouwerij B. V.,988202869,3.0,3.0,3.0,Brent,Light Lager,2.0,2.0,Amstel Light,3.5,436
257,1075,Caldera Brewing Company,1272945129,4.0,4.0,4.0,Akfan,American IPA,4.0,4.5,Caldera IPA,6.1,10784


In [4]:
# Count total number of times each beer id appears in df
beer_counts = df['beer_beerid'].value_counts()

min_reviews = 350
print(f"Total number of beers with more than {min_reviews} reviews: {len(beer_counts)-sum(beer_counts < min_reviews)}")

Total number of beers with more than 350 reviews: 964


In [5]:
# Drop the beers with less than 10 reviews from the df
df = df[df['beer_beerid'].isin(beer_counts[beer_counts >= min_reviews].index)]
print(f"Total reviews after dropping unpopular beers: {len(df)}")

Total reviews after dropping unpopular beers: 722209


In [7]:
user_reviews = {}

# Extract all of the reviews for each user
for i, row in df.iterrows():
    user = row['review_profilename']
    beerid = row['beer_beerid']
    review = row['review_overall']
    if user and user not in user_reviews:
        user_reviews[user] = {}
    user_reviews[user][beerid] = review

# Save to json
with open('../../data/user_reviews.json', 'w') as f:
    json.dump(user_reviews, f)


In [8]:
unique_beers = df['beer_beerid'].unique()
unique_beers = sorted(unique_beers)
print(unique_beers)

# Create beer id map to index
beer_id_map = {}
for i, beer_id in enumerate(unique_beers):
    beer_id_map[int(beer_id)] = i

# Save to json for shared used
with open('../../data/beerid_to_index_map.json', 'w') as f:
    json.dump(beer_id_map, f)

[5, 6, 7, 10, 17, 19, 30, 31, 33, 34, 36, 39, 58, 59, 61, 63, 65, 73, 74, 79, 83, 85, 87, 88, 90, 92, 96, 99, 100, 101, 102, 103, 104, 108, 111, 115, 117, 129, 131, 132, 133, 134, 135, 138, 139, 140, 141, 142, 146, 147, 148, 155, 156, 159, 171, 184, 185, 186, 195, 197, 198, 199, 204, 205, 206, 213, 214, 217, 219, 221, 222, 224, 225, 226, 228, 229, 232, 234, 236, 243, 245, 246, 248, 262, 263, 270, 273, 276, 279, 280, 282, 283, 293, 296, 298, 299, 310, 311, 313, 314, 318, 321, 332, 353, 354, 355, 356, 358, 361, 363, 386, 387, 388, 402, 403, 408, 409, 410, 411, 412, 429, 431, 434, 436, 438, 448, 449, 459, 560, 567, 570, 571, 572, 573, 575, 576, 577, 580, 582, 593, 598, 599, 600, 607, 611, 615, 620, 622, 623, 626, 631, 635, 637, 639, 641, 643, 645, 646, 650, 652, 656, 658, 665, 667, 670, 672, 673, 674, 680, 689, 693, 694, 695, 700, 702, 703, 705, 706, 709, 712, 717, 718, 722, 727, 731, 751, 752, 754, 759, 760, 767, 772, 773, 776, 779, 782, 788, 793, 794, 808, 811, 813, 817, 832, 833, 836, 

In [9]:
user_data = {}
total_beers = len(unique_beers)
print(f"Total number of beers: {total_beers}")

# For each user, save their review vector to a json file
for user, reviews in user_reviews.items():
    review_vector = [0] * total_beers
    for beer_id, review in reviews.items():
        review_vector[beer_id_map[beer_id]] = review

    user_data[user] = review_vector

# Save to json
with open('../../data/user_review_vectors.json', 'w') as f:
    json.dump(user_data, f)

    

Total number of beers: 964


In [10]:
# Count unique breweries after dropping unpopular beers
unique_breweries = df['brewery_id'].unique()
print(f"Total number of breweries: {len(unique_breweries)}")


Total number of breweries: 253
