In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset, DataLoader

In [2]:
# Lift pandas' display truncation: None means "no limit", so every row and
# every column of a frame is rendered.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

In [3]:
# Load the vocabulary, append an "unk" row at the end, and give every word the
# default label "none" in column "1".
vocab = pd.read_csv("vocab.csv")
vocab.loc[-1] = "unk"
vocab = vocab.reset_index(drop=True)
vocab["1"] = "none"

csv_file_names = ["number", "size", "none","topping","quantity","style","drink_type","container_type","volume"]

# One word list per label: the first nine come from ./labels/<name>.csv
# (first column, whitespace-stripped); the last two are hard-coded.
labels = [
    pd.read_csv(f"./labels/{file_stem}.csv").iloc[:, 0].str.strip()
    for file_stem in csv_file_names
]
labels += [pd.Series("pizza"), pd.Series(["hold","without","no","avoid","hate","ani"])]

# Position of the "none" word list inside `labels` / `csv_file_names`.
none_slot = 2

# A word stays in the "none" list only if no other list claims it.
for slot, claimed_words in enumerate(labels):
    if slot == none_slot:
        continue
    remaining = labels[none_slot]
    remaining = remaining[~remaining.isin(claimed_words)].dropna()
    labels[none_slot] = remaining.reset_index(drop=True)

# Words from the first list ("number") are removed from every other list.
number_words = labels[0]
for slot in range(1, len(labels)):
    labels[slot] = labels[slot][~labels[slot].isin(number_words)]

# Name the two hard-coded lists, then write each list's name into column "1"
# for the vocabulary words it contains (later lists overwrite earlier ones).
csv_file_names.extend(["pizza","neg"])
for label_name, label_words in zip(csv_file_names, labels):
    vocab.loc[vocab["0"].isin(label_words), "1"] = label_name
vocab.to_csv("labeled entities.csv",index=False)

In [4]:
def one_hot_encoding(vocab):
    """Fit a OneHotEncoder on `vocab` and encode `vocab` with it.

    Args:
        vocab: object exposing `.shape` and `.to_numpy()` (the callers in this
            notebook pass a pandas Series).

    Returns:
        A tuple `(vocab_length, encoder, trans_encoded)`: the number of entries
        in `vocab`, the fitted encoder, and the result of transforming `vocab`
        with that encoder.
    """
    # OneHotEncoder works on 2-D input: one sample per row, a single feature column.
    column = vocab.to_numpy().reshape(-1, 1)
    encoder = OneHotEncoder().fit(column)
    return vocab.shape[0], encoder, encoder.transform(column)

In [5]:
# Encoder for the words themselves (first column of `vocab`); its encoding is
# the model input used to obtain the embedding.
vocab_words = vocab[vocab.columns[0]]
vocab_length, vocab_encoder, encoded_vocab = one_hot_encoding(vocab_words)

# Encoder for the label names; it is fitted on the list of label names and then
# applied to the label column of `vocab` to build the training targets.
label_names = pd.Series(csv_file_names)
labels_length, label_encoder, _ = one_hot_encoding(label_names)

vocab_label_column = vocab["1"].to_numpy().reshape(-1, 1)
encoded_labels = label_encoder.transform(vocab_label_column)

In [6]:
# we can use the DataSet class from pytorch to facilitate 
# batch divisions and data preparation
class SimpleDataset(Dataset):
    """Dataset that pairs the input at each position with the label at that position.

    Both containers are stored as given and indexed with the same key.
    """

    def __init__(self, input_indices, labels):
        """Keep references to the inputs and their labels."""
        self.input_indices, self.labels = input_indices, labels

    def __len__(self):
        """Return the number of samples, taken from the inputs."""
        sample_count = len(self.input_indices)
        return sample_count

    def __getitem__(self, idx):
        """Return the `(input, label)` pair stored at `idx`."""
        sample = self.input_indices[idx]
        target = self.labels[idx]
        return sample, target

In [None]:
# nn.Embedding is a lookup table: it takes integer indices and returns the
# vector stored at each index.

# An embedding size of 100 is a common choice.
# The embedding layer is defined by two sizes:
#   - the number of entries (the vocabulary size), and
#   - the size of each vector (the embedding dimension).
embedding_dim = 100

# Use the GPU when one is present; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `toarray()` gives the dense one-hot matrix. nn.Embedding needs the position
# of the 1 in each row, not the row itself: feeding the 0/1 matrix as indices
# would make every word look up only rows 0 and 1 of the table.
# Shape (N, 1): each sample is a one-token sequence, which is the layout the
# model's mean over dim 1 expects.
input_indices = encoded_vocab.toarray().argmax(axis=1).reshape(-1, 1)

# Targets stay one-hot rows (float), i.e. one probability per label.
output_indices = encoded_labels.toarray()

test_tensor = torch.from_numpy(input_indices).type(torch.long).to(device)
label_tensor = torch.from_numpy(output_indices).type(torch.float32).to(device)

# To train nn.Embedding on our own data,
# we inherit from nn.Module and build a custom word-embedding model.
# Data flow:
# one-hot-encoded word (dimension = vocab size) -> embedding layer (dimension = embedding size) -> hidden-to-output layer (dimension = number of labels)
# A softmax over the output gives the predicted label.


class CustomWordEmbedding(nn.Module):
    """Word-embedding classifier: embedding lookup followed by a linear layer.

    Args:
        vocab_length: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        num_labels: number of output classes (defaults to 11, the value that
            was previously hard-coded).
        device: device the layers are created on. When None, CUDA is used if
            available and the CPU otherwise.
    """

    def __init__(self, vocab_length, embedding_dim, num_labels=11, device=None):
        super().__init__()
        if device is None:
            # Hard-coding "cuda" crashes on CPU-only machines.
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.embed = nn.Embedding(vocab_length, embedding_dim, device=device)
        self.hidden_to_output = nn.Linear(embedding_dim, num_labels, bias=False, device=device)

        # CrossEntropyLoss applies log-softmax itself. Feeding it the
        # log-probabilities returned by forward() is harmless, because
        # log-softmax leaves already-normalised log-probabilities unchanged.
        self.loss = nn.CrossEntropyLoss()

    def forward(self, input):
        """Return log-probabilities over the labels.

        Args:
            input: integer tensor of vocabulary indices (not one-hot rows;
                nn.Embedding treats every entry as an index). Either shape
                (batch,) with one index per sample, or (batch, seq_len), in
                which case the embeddings are averaged over seq_len.

        Returns:
            Tensor of log-probabilities whose last dimension is the number of
            labels.
        """
        hidden = self.embed(input)

        # Average over the sequence axis only when there is one; for a 1-D
        # batch of indices, dim 1 is the embedding axis and must be kept.
        if hidden.dim() > 2:
            hidden = hidden.mean(dim=1)

        output = self.hidden_to_output(hidden)

        return nn.functional.log_softmax(output, dim=-1)

# Number of epochs. One epoch is one full pass over the dataset, i.e.
# len(dataloader) batches of up to 4 samples each.
epochs =10000

dataset = SimpleDataset(test_tensor,label_tensor)

dataloader = DataLoader(dataset, batch_size=4 )

model = CustomWordEmbedding(vocab_length, embedding_dim)

# Adam over ALL trainable parameters. Passing only model.embed.parameters()
# would leave the output layer frozen at its random initial weights.
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

for epoch in range(epochs):

    model.train()

    # The loop variable is called `batch_labels` so it does not overwrite the
    # notebook-level `labels` list built in an earlier cell.
    for batch_id, (indices, batch_labels) in enumerate(dataloader):
        # Gradients accumulate across backward() calls, so they must be
        # cleared before every batch, not once per epoch.
        optimizer.zero_grad()

        outputs = model(indices)

        loss = model.loss(outputs, batch_labels)

        loss.backward()

        optimizer.step()

        # Print loss for each batch
        if (batch_id + 1) % 2 == 0:  # Print every 2 batches
            print(f"Epoch [{epoch+1}/{epochs}], Batch [{batch_id+1}/{len(dataloader)}], Loss: {loss.item()}")

    # Loss of the last batch of the epoch
    print(f"Epoch [{epoch+1}/{epochs}] completed, Loss: {loss.item()}")

Epoch [1/50], Batch [2/56], Loss: 2.2672531604766846
Epoch [1/50], Batch [4/56], Loss: 2.324179172515869
Epoch [1/50], Batch [6/56], Loss: 2.130316734313965
Epoch [1/50], Batch [8/56], Loss: 2.644019365310669
Epoch [1/50], Batch [10/56], Loss: 2.2668538093566895
Epoch [1/50], Batch [12/56], Loss: 2.2129769325256348
Epoch [1/50], Batch [14/56], Loss: 2.1485743522644043
Epoch [1/50], Batch [16/56], Loss: 2.266751766204834
Epoch [1/50], Batch [18/56], Loss: 2.1902406215667725
Epoch [1/50], Batch [20/56], Loss: 2.252068042755127
Epoch [1/50], Batch [22/56], Loss: 2.1068315505981445
Epoch [1/50], Batch [24/56], Loss: 2.5176708698272705
Epoch [1/50], Batch [26/56], Loss: 2.2976298332214355
Epoch [1/50], Batch [28/56], Loss: 2.323777675628662
Epoch [1/50], Batch [30/56], Loss: 2.1402435302734375
Epoch [1/50], Batch [32/56], Loss: 2.2074384689331055
Epoch [1/50], Batch [34/56], Loss: 2.421494483947754
Epoch [1/50], Batch [36/56], Loss: 2.214162826538086
Epoch [1/50], Batch [38/56], Loss: 2.068

In [94]:
word = "pizza"
hefney = vocab_encoder.transform([[word]])

# nn.Embedding looks vectors up by integer index, so convert the one-hot row
# into the position of its 1. Shape (1, 1): one sample holding one token.
word_index = hefney.toarray().argmax(axis=1).reshape(-1, 1)

# Put the input on whichever device the model's weights live on.
tensor_hefney = torch.from_numpy(word_index).type(torch.long).to(next(model.parameters()).device)

In [95]:
# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    output = model(tensor_hefney)

In [96]:
# Bring the prediction back to host memory.
output = output.cpu()


In [97]:
# Detach from the autograd graph first, then expose the values as a NumPy array.
detached_output = output.detach()
output = detached_output.numpy()

In [100]:
# Persist only the learned parameters (the state dict), not the module object.
trained_weights = model.state_dict()
torch.save(trained_weights, "model_weights.pth")

In [101]:
# Display the model's parameters by name (the embedding table and the
# output-layer weights) as the cell's output.
model.state_dict()

OrderedDict([('embed.weight',
              tensor([[ 1.2993e+00,  1.4934e+00,  3.5305e-01,  ..., -3.7902e-01,
                       -6.4480e-02,  5.1436e-01],
                      [ 1.0336e-01,  1.5277e+00,  4.8985e-01,  ...,  6.7988e-01,
                        1.3882e+00, -4.6104e-01],
                      [ 3.9099e-01, -1.9247e+00,  5.9736e-06,  ...,  8.3583e-01,
                        2.6491e-01, -4.0906e-01],
                      ...,
                      [-2.8443e-01,  1.0784e+00,  1.1228e-01,  ..., -1.1314e+00,
                       -1.2946e-01,  5.4189e-01],
                      [ 1.7681e-01,  9.5952e-01, -7.6338e-01,  ..., -5.5910e-01,
                       -1.8051e+00, -1.2685e+00],
                      [ 1.7878e+00, -1.2707e-01, -1.3408e+00,  ..., -4.4695e-01,
                        1.0022e+00, -4.8933e-01]], device='cuda:0')),
             ('hidden_to_output.weight',
              tensor([[ 0.0333,  0.0058,  0.0297,  ..., -0.0585,  0.0722,  0.0548],
            