In [1]:
import os
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
datadir = "/home/karthiktiwari/Downloads/Beer Data"

In [3]:
beer_train = pd.read_csv(os.path.join(datadir,"beer_profile_and_ratings.csv"), index_col=0)
beer_train.reset_index(drop=True, inplace=True)

In [4]:
beer_train.head(5)

Unnamed: 0,Style,Brewery,Beer Name (Full),Description,ABV,Min IBU,Max IBU,Astringency,Body,Alcohol,...,Fruits,Hoppy,Spices,Malty,review_aroma,review_appearance,review_palate,review_taste,review_overall,number_of_reviews
0,Altbier,Alaskan Brewing Co.,Alaskan Brewing Co. Alaskan Amber,"Notes:Richly malty and long on the palate, wit...",5.3,25,50,13,32,9,...,33,57,8,111,3.498994,3.636821,3.556338,3.643863,3.847082,497
1,Altbier,Long Trail Brewing Co.,Long Trail Brewing Co. Double Bag,"Notes:This malty, full-bodied double alt is al...",7.2,25,50,12,57,18,...,24,35,12,84,3.798337,3.846154,3.904366,4.024948,4.034304,481
2,Altbier,Long Trail Brewing Co.,Long Trail Brewing Co. Long Trail Ale,Notes:Long Trail Ale is a full-bodied amber al...,5.0,25,50,14,37,6,...,10,54,4,62,3.409814,3.667109,3.600796,3.6313,3.830239,377
3,Altbier,Uerige Obergärige Hausbrauerei GmbH / Zum Uerige,Uerige Obergärige Hausbrauerei GmbH / Zum Ueri...,Notes:,8.5,25,50,13,55,31,...,49,40,16,119,4.148098,4.033967,4.150815,4.205163,4.005435,368
4,Altbier,Ninkasi Brewing Company,Ninkasi Brewing Company Sleigh'r Dark Doüble A...,Notes:Called 'Dark Double Alt' on the label.Se...,7.2,25,50,25,51,26,...,11,51,20,95,3.625,3.973958,3.734375,3.765625,3.817708,96


In [5]:
beer_train.columns

Index(['Style', 'Brewery', 'Beer Name (Full)', 'Description', 'ABV', 'Min IBU',
       'Max IBU', 'Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour',
       'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty', 'review_aroma',
       'review_appearance', 'review_palate', 'review_taste', 'review_overall',
       'number_of_reviews'],
      dtype='object')

In [6]:
beer_train[beer_train['ABV']>57]

Unnamed: 0,Style,Brewery,Beer Name (Full),Description,ABV,Min IBU,Max IBU,Astringency,Body,Alcohol,...,Fruits,Hoppy,Spices,Malty,review_aroma,review_appearance,review_palate,review_taste,review_overall,number_of_reviews
299,Bock - Eisbock,Schorschbräu,Schorschbräu Schorschbock 57%,Notes:,57.5,25,35,0,7,17,...,10,0,2,4,4.0,4.0,4.0,3.5,4.0,1


In [7]:
class BeerDataset(Dataset):
    def __init__(self, df, config):
        """
        Custom dataset class for reading beer attributes and review score from dataframe

        Args:
            df (Pandas DataFrame): Pandas Dataframe with Beer Data
        """
        self.df = df
        self.config = config

    def __getitem__(self, index):
        beer_attrs = []
        for attr in self.config:
            beer_attrs.append(self.df[attr].iloc[index])
        score = torch.as_tensor(np.round(self.df['review_overall'].iloc[index]).item()-1, dtype=torch.long)

        return {"attrs": torch.as_tensor(beer_attrs, dtype=torch.float32), "score": score}


    def __len__(self):
        return len(self.df)

In [8]:
config = ['ABV', 'Min IBU',
       'Max IBU', 'Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour',
       'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty']

val_size = int(0.1*len(beer_train))
beerDataset = BeerDataset(beer_train[:-val_size], config=config)
valDataset = BeerDataset(beer_train[val_size:], config=config)

In [9]:
beerDataset[1208]

{'attrs': tensor([ 4., 18., 25., 23., 25.,  6., 26., 26., 24.,  0., 17., 37.,  0., 47.]),
 'score': tensor(2)}

In [34]:
valDataset[0]['attrs'].numpy()

array([  6.,  20.,  38.,  36.,  59.,  14.,  39.,  71.,  20.,   0.,  23.,
        68.,  17., 107.], dtype=float32)

In [10]:
class BeerReviewPredictor(nn.Module):
    def __init__(self, config):
        super(BeerReviewPredictor, self).__init__()

        self.fc1 = nn.Linear(len(config), 256)
        self.fc2 = nn.Linear(256, 32)  # Embedding layer
        self.fc3 = nn.Linear(32, 5)    # Classifier layer

    def forward(self, x, return_embedding=False):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        
        if return_embedding:
            return x  # Return the embedding from fc2
        
        x = F.relu(self.fc3(x))
        return x  # Return the final output (classification)

    def get_embedding(self, x):
        """Extracts embedding by forwarding through fc1 and fc2 layers only."""
        x = F.relu(self.fc1(x))
        embedding = F.relu(self.fc2(x))
        return embedding


In [11]:
model = BeerReviewPredictor(config)


In [12]:
model.to('cuda')

BeerReviewPredictor(
  (fc1): Linear(in_features=14, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=5, bias=True)
)

In [13]:
model(torch.as_tensor([1 for i in range(len(config))], dtype=torch.float32, device='cuda').reshape(-1, len(config)))

tensor([[0.0000, 0.0000, 0.0451, 0.0723, 0.0594]], device='cuda:0',
       grad_fn=<ReluBackward0>)

In [14]:
# Dataloaders

trainset = DataLoader(beerDataset, batch_size=128, shuffle=True)
valset = DataLoader(valDataset, batch_size=128, shuffle=True)

In [15]:
for batch in trainset:
    print(batch['attrs'].shape, batch['score'].shape)
    break

torch.Size([128, 14]) torch.Size([128])


In [16]:
for row in range(len(beerDataset)):
    assert beerDataset[row]['attrs'] is not None and beerDataset[row]['score'] is not None

In [17]:
import torch.optim as optim
from tqdm import tqdm

optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)
loss_fn = nn.CrossEntropyLoss()
epochs = 25
device = 'cuda'
for epoch in range(epochs):
    total_train_loss = 0.0
    total_val_loss = 0.0

    model.train()
    for i, batch in enumerate(tqdm(trainset)):
        attrs, score = batch['attrs'].to(device), batch['score'].to(device)
        optimizer.zero_grad()
        output = model(attrs)
        loss = loss_fn(output, score)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    model.eval()
    for i, batch in enumerate(tqdm(valset)):
        attrs, score = batch['attrs'].to(device), batch['score'].to(device)
        optimizer.zero_grad()
        output = model(attrs)
        loss = loss_fn(output, score)
        total_val_loss += loss.item()

    scheduler.step()
    print(f"Train loss on epoch {epoch + 1}={total_train_loss/len(trainset)}")
    print(f"Val loss on epoch {epoch + 1}={total_val_loss / len(valset)}")

100%|██████████| 23/23 [00:00<00:00, 44.69it/s]
100%|██████████| 23/23 [00:00<00:00, 54.66it/s]


Train loss on epoch 1=2.3431308735971865
Val loss on epoch 1=1.6743327431056811


100%|██████████| 23/23 [00:00<00:00, 53.35it/s]
100%|██████████| 23/23 [00:00<00:00, 56.29it/s]


Train loss on epoch 2=1.6305992033170618
Val loss on epoch 2=1.6122573199479475


100%|██████████| 23/23 [00:00<00:00, 52.32it/s]
100%|██████████| 23/23 [00:00<00:00, 49.14it/s]


Train loss on epoch 3=1.610261336616848
Val loss on epoch 3=1.6092699403348176


100%|██████████| 23/23 [00:00<00:00, 52.69it/s]
100%|██████████| 23/23 [00:00<00:00, 53.42it/s]


Train loss on epoch 4=1.6088998836019766
Val loss on epoch 4=1.608322931372601


100%|██████████| 23/23 [00:00<00:00, 51.03it/s]
100%|██████████| 23/23 [00:00<00:00, 43.19it/s]


Train loss on epoch 5=1.6070731981940891
Val loss on epoch 5=1.6056390067805415


100%|██████████| 23/23 [00:00<00:00, 47.08it/s]
100%|██████████| 23/23 [00:00<00:00, 52.28it/s]


Train loss on epoch 6=1.5904312859410825
Val loss on epoch 6=1.5490433806958406


100%|██████████| 23/23 [00:00<00:00, 44.11it/s]
100%|██████████| 23/23 [00:00<00:00, 54.20it/s]


Train loss on epoch 7=1.3950560196586277
Val loss on epoch 7=1.1238246637841929


100%|██████████| 23/23 [00:00<00:00, 45.47it/s]
100%|██████████| 23/23 [00:00<00:00, 54.89it/s]


Train loss on epoch 8=0.9000018524086993
Val loss on epoch 8=0.7522065691325975


100%|██████████| 23/23 [00:00<00:00, 49.45it/s]
100%|██████████| 23/23 [00:00<00:00, 51.59it/s]


Train loss on epoch 9=0.6735836448876754
Val loss on epoch 9=0.6490424767784451


100%|██████████| 23/23 [00:00<00:00, 43.40it/s]
100%|██████████| 23/23 [00:00<00:00, 54.58it/s]


Train loss on epoch 10=0.6308282989522686
Val loss on epoch 10=0.6407285073529119


100%|██████████| 23/23 [00:00<00:00, 48.47it/s]
100%|██████████| 23/23 [00:00<00:00, 54.93it/s]


Train loss on epoch 11=0.6125464918820754
Val loss on epoch 11=0.6220527524533479


100%|██████████| 23/23 [00:00<00:00, 49.76it/s]
100%|██████████| 23/23 [00:00<00:00, 54.75it/s]


Train loss on epoch 12=0.6064919051916703
Val loss on epoch 12=0.618004964745563


100%|██████████| 23/23 [00:00<00:00, 51.92it/s]
100%|██████████| 23/23 [00:00<00:00, 54.45it/s]


Train loss on epoch 13=0.5942538652731024
Val loss on epoch 13=0.6092880614425825


100%|██████████| 23/23 [00:00<00:00, 45.64it/s]
100%|██████████| 23/23 [00:00<00:00, 54.98it/s]


Train loss on epoch 14=0.5966735197150189
Val loss on epoch 14=0.6102277571740358


100%|██████████| 23/23 [00:00<00:00, 49.70it/s]
100%|██████████| 23/23 [00:00<00:00, 50.28it/s]


Train loss on epoch 15=0.5961687914703203
Val loss on epoch 15=0.6041007806425509


100%|██████████| 23/23 [00:00<00:00, 51.06it/s]
100%|██████████| 23/23 [00:00<00:00, 55.57it/s]


Train loss on epoch 16=0.5874321564384128
Val loss on epoch 16=0.6030888401943705


100%|██████████| 23/23 [00:00<00:00, 51.46it/s]
100%|██████████| 23/23 [00:00<00:00, 55.49it/s]


Train loss on epoch 17=0.5869603325491366
Val loss on epoch 17=0.601349580547084


100%|██████████| 23/23 [00:00<00:00, 41.53it/s]
100%|██████████| 23/23 [00:00<00:00, 41.75it/s]


Train loss on epoch 18=0.5835525212080582
Val loss on epoch 18=0.6037734513697417


100%|██████████| 23/23 [00:00<00:00, 42.24it/s]
100%|██████████| 23/23 [00:01<00:00, 15.84it/s]


Train loss on epoch 19=0.5846742819184843
Val loss on epoch 19=0.5965959274250529


100%|██████████| 23/23 [00:01<00:00, 11.86it/s]
100%|██████████| 23/23 [00:00<00:00, 56.22it/s]


Train loss on epoch 20=0.5812904173913209
Val loss on epoch 20=0.594429832437764


100%|██████████| 23/23 [00:00<00:00, 47.44it/s]
100%|██████████| 23/23 [00:00<00:00, 54.26it/s]


Train loss on epoch 21=0.580421031817146
Val loss on epoch 21=0.5981192679508872


100%|██████████| 23/23 [00:00<00:00, 54.52it/s]
100%|██████████| 23/23 [00:00<00:00, 54.83it/s]


Train loss on epoch 22=0.5817027130852575
Val loss on epoch 22=0.5972592558549799


100%|██████████| 23/23 [00:00<00:00, 54.24it/s]
100%|██████████| 23/23 [00:00<00:00, 53.67it/s]


Train loss on epoch 23=0.5779754195524298
Val loss on epoch 23=0.5951123431972836


100%|██████████| 23/23 [00:00<00:00, 49.84it/s]
100%|██████████| 23/23 [00:00<00:00, 55.01it/s]


Train loss on epoch 24=0.5830880882947341
Val loss on epoch 24=0.5929031566433285


100%|██████████| 23/23 [00:00<00:00, 51.39it/s]
100%|██████████| 23/23 [00:00<00:00, 53.92it/s]

Train loss on epoch 25=0.5807438013346299
Val loss on epoch 25=0.5957893781040026





In [31]:
import faiss
import numpy as np

# Example embeddings and beer IDs
embedding_dim = 32
num_embeddings = 2878  # Replace with actual number of embeddings

embeddings = []
beer_ids = []
index = faiss.IndexFlatL2(embedding_dim)
# Randomly generated embeddings for demonstration
for idx in range(len(beerDataset)):
    embeddings.append(model.get_embedding(beerDataset[idx]['attrs'].to('cuda')).detach().cpu().numpy())
    beer_ids.append(idx)  # Replace with actual IDs

# Initialize a FAISS index
  # L2 distance index
index.add(np.array(embeddings))  # Add embeddings to the FAISS index

# Save beer IDs in a separate dictionary for retrieval
id_mapping = {i: beer_id for i, beer_id in enumerate(beer_ids)}

# Optional: Save FAISS index and ID mapping for later use
faiss.write_index(index, 'beer_embeddings.index')
np.save('beer_id_mapping.npy', id_mapping)


{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 82: 82,
 83: 83,
 84: 84,
 85: 85,
 86: 86,
 87: 87,
 88: 88,
 89: 89,
 90: 90,
 91: 91,
 92: 92,
 93: 93,
 94: 94,
 95: 95,
 96: 96,
 97: 97,
 98: 98,
 99: 99,
 100: 100,
 101: 101,
 102: 102,
 103: 103,
 104: 104,
 105: 105,
 106: 106,
 107: 107,
 108: 108,
 109: 109,
 110: 110,