In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset, DataLoader

In [2]:
# Lift pandas' display truncation so that printed frames show every row
# and every column.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [3]:
# Load the vocabulary, append a row whose every column is "unk", renumber
# the index, and start with every token labelled "none" in column "1".
vocab = pd.read_csv("vocab.csv")
vocab.loc[-1] = "unk"
vocab = vocab.reset_index(drop=True)
vocab["1"] = "none"

# One label list per entity type: the first nine come from CSV files
# (first column, whitespace-stripped); "pizza" and the negation words are
# defined inline and appended after them.
csv_file_names = ["number", "size", "none","topping","quantity","style","drink_type","container_type","volume"]
labels = [
    pd.read_csv(f"./labels/{file_stem}.csv").iloc[:, 0].str.strip()
    for file_stem in csv_file_names
]
labels.append(pd.Series("pizza"))
labels.append(pd.Series(["hold","without","no","avoid","hate","ani"]))

# Position 2 holds the "none" list: drop from it every entry that also
# appears in any other list, discarding missing values along the way.
for position, other_entries in enumerate(labels):
    if position == 2:
        continue
    still_none = labels[2][~labels[2].isin(other_entries)]
    labels[2] = still_none.dropna().reset_index(drop=True)

# Entries of the first list ("number") are removed from every other list.
for position in range(1, len(labels)):
    labels[position] = labels[position][~labels[position].isin(labels[0])]

# Write the entity name into column "1" for every token found in the
# matching list; later lists overwrite earlier ones on overlap.
csv_file_names.extend(["pizza","neg"])
for entity_name, entity_entries in zip(csv_file_names, labels):
    vocab.loc[vocab["0"].isin(entity_entries), "1"] = entity_name
vocab.to_csv("labeled entities.csv", index=False)

In [4]:
def one_hot_encoding(vocab):
    """Fit and return a OneHotEncoder on the values of ``vocab``.

    ``vocab`` must expose ``to_numpy()`` (the notebook passes pandas
    Series); its values are reshaped into a single column before fitting.
    """
    single_column = vocab.to_numpy().reshape(-1, 1)
    return OneHotEncoder().fit(single_column)

In [5]:
# Encoder fitted on the vocabulary tokens (first column of ``vocab``);
# intended for training the NN embedding later.
vocab_encoder = one_hot_encoding(vocab[vocab.columns[0]])

# Encoder fitted on the entity names; its output is used as the target.
label_encoder = one_hot_encoding(pd.Series(csv_file_names))

# One-hot encode every vocabulary row: tokens from column "0", labels from
# column "1". Selecting with a list keeps the (n, 1) shape the encoders expect.
encoded_tokens = vocab_encoder.transform(vocab[["0"]].to_numpy())
encoded_labels = label_encoder.transform(vocab[["1"]].to_numpy())


In [6]:
# The raw label lists and entity names are no longer needed once the
# encoders have been fitted.
del labels, csv_file_names

In [7]:
# Give the two vocabulary columns descriptive names.
vocab = vocab.rename(columns={"0": "tokens", "1": "labels"})

# Store each row's dense one-hot vector next to its token / label.
vocab["encoded_tokens"] = pd.Series(list(encoded_tokens.toarray()))
vocab["encoded_labels"] = pd.Series(list(encoded_labels.toarray()))

In [8]:
# Sanity check: decoding the stored one-hot vectors of the token "'d"
# should give back the token itself and its label.
apostrophe_d_rows = vocab[vocab["tokens"] == "'d"]
first_match = apostrophe_d_rows.index[0]
print(vocab_encoder.inverse_transform([apostrophe_d_rows.loc[first_match, "encoded_tokens"]]))
print(label_encoder.inverse_transform([apostrophe_d_rows.loc[first_match, "encoded_labels"]]))


[["'d"]]
[['none']]


In [9]:
def array2number(array):
    """Convert a one-hot vector to its integer position.

    Returns ``array.argmax(axis=0)`` for a NumPy array. Anything that is not
    a NumPy array (e.g. ``None``) yields the sentinel 1000, which is meant to
    act as a padding id: it is an integer, so it is easy to save to disk, and
    per the author's note the vocabulary is far smaller than 1000 entries.
    """
    if not isinstance(array, np.ndarray):
        return 1000
    return array.argmax(axis=0)

In [96]:
# Lookup tables from a token to its one-hot token / label vector.
token_to_id = dict(zip(vocab["tokens"], vocab["encoded_tokens"]))
token_to_label = dict(zip(vocab["tokens"], vocab["encoded_labels"]))

# A missing token (None) maps to an all-zero vector -- presumably the padding
# cells of a fixed-width sentence table. The widths are taken from the
# encodings themselves rather than hard-coded (previously 223 and 11), so the
# padding vectors always match the one-hot width even when the vocabulary or
# label set changes.
token_to_id[None] = np.zeros(len(vocab["encoded_tokens"].iloc[0]))
token_to_label[None] = np.zeros(len(vocab["encoded_labels"].iloc[0]))

# Reverse tables: integer position of the one-hot vector -> token / label.
codes = vocab["encoded_tokens"].map(array2number)
id_to_token = dict(zip(codes, vocab["tokens"]))

codes = vocab["encoded_labels"].map(array2number)
id_to_label = dict(zip(codes, vocab["labels"]))


def word2id(word):
    """Return the one-hot token vector for ``word``, or None if it is unknown."""
    return token_to_id.get(word, None)


def word2labels(word):
    """Return the one-hot label vector for ``word``, or None if it is unknown."""
    return token_to_label.get(word, None)


def id2token(number):
    """Return the token stored under integer id ``number``, or "unk" if absent."""
    return id_to_token.get(number, "unk")


In [53]:
# Round-trip check: token -> one-hot vector -> integer id -> token.
# "pizzas" is not in the vocabulary, so it is expected to come back as "unk".
for probe_word in ("pizza", "cheeseburg", "pizzas"):
    print(id2token(array2number(word2id(probe_word))))


pizza
cheeseburg
unk


In [12]:
# Subclassing torch's Dataset lets a DataLoader take care of batching.
class SimpleDataset(Dataset):
    """Pairs each entry of ``input_indices`` with the label at the same position."""

    def __init__(self, input_indices, labels):
        self.input_indices, self.labels = input_indices, labels

    def __len__(self):
        # The dataset size follows the inputs.
        sample_count = len(self.input_indices)
        return sample_count

    def __getitem__(self, idx):
        features = self.input_indices[idx]
        target = self.labels[idx]
        return features, target

In [13]:
class CustomWordEmbedding(nn.Module):
    """Embedding table followed by a bias-free linear layer over the labels.

    Parameters
    ----------
    vocab_length : int
        Number of rows in the embedding table.
    embedding_dim : int
        Size of each embedding vector.
    num_labels : int, default 11
        Number of output classes; the default is the value that used to be
        hard-coded.
    device : torch.device or str, optional
        Device for the parameters. Defaults to CUDA when it is available and
        to the CPU otherwise (CUDA used to be hard-coded, which fails on
        CPU-only machines).
    """

    def __init__(self, vocab_length, embedding_dim, num_labels=11, device=None):
        super().__init__()
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.embed = nn.Embedding(vocab_length, embedding_dim, device=device)
        self.hidden_to_output = nn.Linear(embedding_dim, num_labels, bias=False, device=device)

        self.loss = nn.CrossEntropyLoss()

    def forward(self, input):
        """Return per-class log-probabilities for a batch of index sequences.

        ``input`` must hold integer indices with a sequence axis at dim 1,
        i.e. shape ``(batch, tokens)``; the embeddings of the tokens in each
        row are averaged before classification. One-hot rows must NOT be
        passed here: their 0/1 entries would be read as indices 0 and 1.

        NOTE(review): ``self.loss`` (CrossEntropyLoss) applies log-softmax
        itself; applying it here as well is redundant but does not change the
        loss, because log-softmax is idempotent.
        """
        hidden = self.embed(input)

        # Average over the token axis.
        hidden_mean = hidden.mean(dim=1)

        output = self.hidden_to_output(hidden_mean)

        return nn.functional.log_softmax(output, dim=-1)

In [14]:
def train_embedding(embedding_dim, vocab, encoded_tokens, encoded_labels):
    """Train a CustomWordEmbedding to predict each token's label.

    Parameters
    ----------
    embedding_dim : int
        Size of the embedding vectors (previously ignored in favour of a
        hard-coded 100).
    vocab : pandas.DataFrame
        Vocabulary table; its row count sizes the embedding table.
    encoded_tokens, encoded_labels
        One-hot encodings exposing ``toarray()`` (e.g. the sparse matrices
        returned by ``OneHotEncoder.transform``), one row per vocabulary entry.

    Returns
    -------
    CustomWordEmbedding
        The model after all epochs have run.
    """
    # Number of full passes over the data; each pass processes
    # len(dataloader) batches of up to 4 samples.
    epochs = 100

    model = CustomWordEmbedding(vocab.shape[0], embedding_dim)
    # Put the data wherever the model's parameters live.
    device = next(model.parameters()).device

    # nn.Embedding takes integer indices, not one-hot rows: argmax recovers the
    # index of each token. The (N, 1) shape gives every sample a one-token
    # sequence, which is what the model's mean over dim 1 expects.
    input_indices = encoded_tokens.toarray().argmax(axis=1).reshape(-1, 1)
    # The targets stay one-hot; CrossEntropyLoss accepts class probabilities.
    output_indices = encoded_labels.toarray()

    tokens_tensor = torch.from_numpy(input_indices).type(torch.long).to(device)
    label_tensor = torch.from_numpy(output_indices).type(torch.float32).to(device)

    dataset = SimpleDataset(tokens_tensor, label_tensor)
    dataloader = DataLoader(dataset, batch_size=4)

    # Stochastic gradient descent over ALL parameters, so the output layer is
    # trained together with the embedding table.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

    model.train()
    for epoch in range(epochs):
        for batch_id, (indices, labels) in enumerate(dataloader):
            # Clear the gradients of the previous batch; otherwise they accumulate.
            optimizer.zero_grad()

            outputs = model(indices)
            loss = model.loss(outputs, labels)
            loss.backward()
            optimizer.step()

            # Print loss for every second batch
            if (batch_id + 1) % 2 == 0:
                print(f"Epoch [{epoch+1}/{epochs}], Batch [{batch_id+1}/{len(dataloader)}], Loss: {loss.item()}")

        # Loss of the last batch of the epoch
        print(f"Epoch [{epoch+1}/{epochs}] completed, Loss: {loss.item()}")

    # Return only after every epoch has run (the return used to sit inside the
    # epoch loop and ended training after the first epoch).
    return model

In [15]:
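# An embedding size of 100 is a common choice.
# The embedding network is defined by two sizes:
#   - input size: the number of vocabulary entries (tokens are one-hot encoded)
#   - hidden layer size: the embedding dimension
# NOTE: training does not work correctly yet, so the call below stays disabled.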
# embed = train_embedding(100,vocab,encoded_tokens,encoded_labels)

In [99]:
# Load the tokenized sentences from disk into a DataFrame.
tokenized_sentences = pd.read_csv("tokenized_sentences.csv")

In [100]:
# Goal: turn each sentence into a row of tokens that can later be converted
# into tensors.
# From the "train.SRC" column, strip brackets, commas, double quotes and every
# apostrophe that does not start a "'d" token, then split on single spaces
# into one column per token.
tokenized_sentences = (
    tokenized_sentences.reset_index(drop=True)["train.SRC"]
    .str.replace(r"([\[\],\"]|'(?!d\b))", "", regex=True)
    .str.split(" ", expand=True)
)


In [54]:
# Apply word2id to every cell of the token table (element-wise).
ids = tokenized_sentences.map(word2id)

In [101]:
# Apply word2labels to every cell of the token table (element-wise).
labels = tokenized_sentences.map(word2labels)

In [70]:
# Pull both encoded tables out of pandas as NumPy arrays.
ids_as_numpy, labels_as_numpy = ids.to_numpy(), labels.to_numpy()

In [73]:
# Check the sentence encoding: decode the fifth token of the first sentence
# and look it up in the vocabulary table.
decoded_token = vocab_encoder.inverse_transform([ids_as_numpy[0][4]])[0][0]
series = vocab[vocab["tokens"] == decoded_token]

index = series.index[0]

series1, series2 = series.loc[index,"encoded_labels"] , series.loc[index,"encoded_tokens"]

print(vocab_encoder.inverse_transform([series2]))
print(label_encoder.inverse_transform([series1]))
# The encoded rows must be as long as the tokenized row they come from.
# (The check used to compare the tokenized row with itself, which is always True.)
print(len(ids.loc[0]) == len(labels.loc[0]) == len(tokenized_sentences.loc[0]))


[['larg']]
[['size']]
True


In [84]:
# Rebuild the array from nested lists so that the per-cell vectors are stacked
# into one numeric uint8 array -- presumably (sentences, tokens, vocabulary);
# TODO confirm the resulting shape.
ids_as_numpy_fixed = np.array(ids_as_numpy.tolist(), dtype=np.uint8)


In [None]:
# Drop the intermediate tables that are no longer referenced below.
del ids, tokenized_sentences, ids_as_numpy, labels

In [89]:
# Wrap the stacked token array in a tensor and cast it to int8.
tokens_tensor = torch.from_numpy(ids_as_numpy_fixed).to(torch.int8)


In [90]:
# Persist the token tensor to "tokens.pt".
torch.save(tokens_tensor,"tokens.pt")

In [91]:
# The tensor has been saved; drop the reference to it.
del tokens_tensor

In [92]:
# Drop the stacked token array as well.
del ids_as_numpy_fixed

In [105]:
# Rebuild the array from nested lists so that the per-cell label vectors are
# stacked into one numeric uint8 array -- presumably (sentences, tokens,
# labels); TODO confirm the resulting shape.
labels_as_numpy_fixed = np.array(labels_as_numpy.tolist(), dtype=np.uint8)


In [106]:
# The stacked copy replaces the original label array; drop the original.
del labels_as_numpy

In [107]:
# Wrap the stacked label array in a tensor and cast it to int8.
label_tensor = torch.from_numpy(labels_as_numpy_fixed).to(torch.int8)

In [108]:
# Persist the label tensor to "labels.pt".
torch.save(label_tensor,"labels.pt")

In [109]:
# The tensor has been saved; drop the reference to it.
del label_tensor