In [None]:
import utils
import gc
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim.lr_scheduler as lr_scheduler
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import MWETokenizer
import importlib
import re
from word2number import w2n
import pickle
importlib.reload(utils)
import torch.nn as nn

# STEP 1 READ THE DATA
`read_dataset` takes the path to a JSON file whose records hold the sentences together with their `_.EXR`, `_.TOP` and `_.TOP_DECOUPLED` fields,
and returns each of them as a `pandas.Series`.

In [None]:
# Load the training set; per the notes above, read_dataset returns each field as a pandas.Series.
(
    sentences,
    parsed_tree,
    structured_sentence,
    decoupled_structured_sentence,
) = utils.read_dataset("./PIZZA_train.json")

In [None]:
# Neither series is referenced again in this notebook, so release them right away.
del parsed_tree, decoupled_structured_sentence
gc.collect()

# STEP 2: Parse data and extract labels
In this step we build our multi-word expressions and extract the label of every token.

In [None]:
# Alternative entry point: run this cell if PIZZA_train.json was not read above.
structured_sentence = pd.read_csv(filepath_or_buffer="TOP.csv")

In [None]:
# Run this after reading TOP.csv: keep only the first column of the frame.
# The isinstance guard makes the cell idempotent: if structured_sentence is
# already a Series (cell re-run, or the JSON path was used) it is left untouched
# instead of raising on the two-dimensional indexer.
if isinstance(structured_sentence, pd.DataFrame):
    structured_sentence = structured_sentence.iloc[:, 0]

In [None]:

# Split the structured sentences into pizza orders, drink orders and the remaining words.
pizza_orders, drink_orders, none_words = utils.extract_pizza_drinks(structured_sentence.copy())

# De-duplicate the remaining words, normalise them, and renumber the result from 0.
none_words = utils.pre_text_normalization(none_words.drop_duplicates()).reset_index(drop=True)

# Only the first value returned by tokenization is kept here.
nones, _ = utils.tokenization(none_words)


In [None]:
# Extract the per-order nodes and clean them in one go, then unpack each group
# into its six individual attributes.
pizza_nodes, drink_nodes = utils.clean_extracted_nodes(
    *utils.extract_nodes(pizza_orders, drink_orders)
)
(pizza_number, pizza_size, pizza_none, topping, quantity, style) = pizza_nodes
(drink_number, drink_size, drink_none, drink_type, container_type, volume) = drink_nodes

In [None]:
# Pool the attributes that pizzas and drinks have in common, keeping one copy of each value.
number = pd.concat([pizza_number, drink_number]).drop_duplicates()
size = pd.concat([pizza_size, drink_size]).drop_duplicates()
none = pd.concat([pizza_none, drink_none]).drop_duplicates()

In [None]:
# Volume is treated differently from the other nodes: only the measuring units
# are kept as volume vocabulary, while any word that word2number can parse is
# treated as a number and merged into `number` instead.
vocab, _ = utils.tokenization(volume)
volume_vocab = set()
numeric_words = []
for word in vocab:
    try:
        w2n.word_to_num(word)
    except ValueError:
        # not parseable as a number -> keep it as a volume word
        volume_vocab.add(word)
    else:
        numeric_words.append(word)

# Append all numeric words in one step (instead of assigning to label -1 and
# re-indexing on every iteration). `number` is left untouched when nothing
# numeric was found, exactly as before.
if numeric_words:
    number = (
        pd.concat([number, pd.Series(numeric_words, dtype=object)], ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [None]:
# Turn the set of volume words into a Series and write it next to the other label files.
volume_vocab = pd.Series([*volume_vocab])
volume_vocab.to_csv(path_or_buf="./labels/volume.csv", index=False)

In [None]:

# Label categories paired one-to-one with the CSV name their vocabulary is saved
# under. `volume` is deliberately absent: its vocabulary is written to
# ./labels/volume.csv by its own cell (previously it was listed here and
# silently dropped by zip() because there was no matching file name).
labels = [number, size, none, topping, quantity, style, drink_type, container_type]
csv_file_names = ["number", "size", "none", "topping", "quantity", "style", "drink_type", "container_type"]
# merge the nones that came from inside PIZZAORDER / DRINKORDER with the ones found outside them
none_vocab = nones
# every multi-word expression found, as a tuple of tokens (later fed to MWETokenizer)
mwe = []
for label, label_name in zip(labels, csv_file_names):
    if label_name != "none":
        _, tokens = utils.tokenization(label)
        vocab = set()
        for row in tokens.drop_duplicates().to_numpy().tolist():
            # Drop the 0 / "" filler cells (presumably padding added by
            # utils.tokenization - TODO confirm). Filtering in Python avoids
            # writing "" into a numeric column of the frame.
            token_list = [tok for tok in row if tok != 0 and tok != ""]
            if not token_list:
                # nothing but filler: would otherwise add an empty MWE and "" to the vocab
                continue
            if len(token_list) == 1:
                vocab.add(token_list[0])
            else:
                # multi-word expression: remember the tuple and store the
                # underscore-joined form as the vocabulary entry
                mwe.append(tuple(token_list))
                vocab.add(re.sub("_+$", "", "_".join(token_list)))
    else:
        vocab, _ = utils.tokenization(label)
        vocab.update(none_vocab)

    vocab = pd.Series(list(vocab))
    vocab.to_csv(f"./labels/{label_name}.csv", index=False)


In [None]:
# Persist the MWETokenizer built from the collected multi-word expressions so the
# sentence-processing step can reuse it.
tokenizer = MWETokenizer(mwe)
with open("MWE_TOKENS.pkl", "wb") as pickle_handle:
    pickle.dump(tokenizer, pickle_handle)


In [None]:
# Drop the intermediates of step 2 from the kernel namespace.
del vocab, labels, csv_file_names
del pizza_nodes, pizza_orders
del drink_nodes, drink_orders


# STEP 3 : preprocess data
##### What should we take into consideration?
1- Word Normalization  
2- Word Tokenization  
Why won't we use sentence segmentation?  
It would be useless: every order is a single sentence, and there is no clear punctuation to split on.

In [None]:
# Normalisation: work on a copy so the raw `sentences` series is passed in untouched.
sentences_copy = sentences.copy()
normalized_sentence = utils.pre_text_normalization(sentences_copy)
del sentences_copy

In [None]:
# Tokenisation of the normalised sentences; yields the vocabulary and the tokenised sentences.
vocab, tokenized_sentences = utils.tokenization(
    normalized_sentence,
    tokenizesentences=1,
)


In [None]:
# Checkpoint: write the vocabulary and the tokenised sentences to disk so later
# steps can start from the CSV files.
vocab_as_series = pd.Series([*vocab])
vocab_as_series.to_csv(path_or_buf="vocab.csv", index=False)
tokenized_sentences.to_csv(path_or_buf="tokenized_sentences.csv", index=False)

In [None]:
# Both objects were just checkpointed to CSV; remove them from the namespace.
del vocab, tokenized_sentences

# STEP 4: Encode The tokens and label them

In [None]:
# Build the labelled vocabulary table together with the token and label encoders.
# NOTE(review): the meaning of the None argument is defined in utils - confirm there.
(
    vocab,
    vocab_encoder,
    label_encoder,
) = utils.create_labeled_vocab(None)

In [None]:
# Sanity check: decode the stored token id and label id of a known token back
# through the two encoders and print what they map to.
sample_rows = vocab[vocab["tokens"] == "bbq_pulled_pork"]
sample_index = sample_rows.index[0]
sample_token_id = sample_rows.loc[sample_index, "encoded_tokens"]
sample_label_id = sample_rows.loc[sample_index, "encoded_labels"]
print(vocab_encoder.categories_[0][[sample_token_id]])
print(label_encoder.categories_[0][[sample_label_id]])


In [None]:
# Reload the tokenised sentences from the checkpoint written in step 3.
tokenized_sentences = pd.read_csv(filepath_or_buffer="tokenized_sentences.csv")

In [None]:
# Build the conversion helper from the labelled vocab table; its word2id and
# word2labels methods are applied to every token in the cells that follow.
convertor = utils.conversions(vocab)

In [None]:
# Element-wise conversion of every cell of the token table with convertor.word2id.
tokens_as_ids = tokenized_sentences.map(func=convertor.word2id)


In [None]:
# Element-wise conversion of every cell of the token table with convertor.word2labels.
labels_as_ids = tokenized_sentences.map(func=convertor.word2labels)


In [None]:
# From this point on pandas is left behind: everything downstream works on
# NumPy arrays and tensors.
tokens_ids_as_numpy, tokens_labels_as_numpy = (
    tokens_as_ids.to_numpy(),
    labels_as_ids.to_numpy(),
)

In [None]:
# The DataFrames are no longer needed now that the NumPy arrays exist.
del tokenized_sentences, tokens_as_ids, labels_as_ids
gc.collect()

In [None]:
# Sanity check of the sentence encoding, using the token at position 3 of the
# first sentence: look its id up in the vocab table and decode both stored ids.
first_sentence_ids = tokens_ids_as_numpy[0]
# the first sentence without its NaN entries (computed for inspection; not used below)
encode_test = first_sentence_ids[~np.isnan(first_sentence_ids)]

decoded_token = vocab_encoder.categories_[0][int(first_sentence_ids[3])]
series = vocab[vocab["tokens"] == decoded_token]
index = series.index[0]

series1 = series.loc[index, "encoded_labels"]
series2 = series.loc[index, "encoded_tokens"]

print(vocab_encoder.categories_[0][series2])
print(label_encoder.categories_[0][series1])


In [None]:
# Converting everything to tensors at once is too heavy on memory, so both arrays
# are cut into 10 chunks that get saved to disk and loaded one at a time while training.
N_CHUNKS = 10
tokens_batches = np.array_split(tokens_ids_as_numpy, N_CHUNKS)
labels_batches = np.array_split(tokens_labels_as_numpy, N_CHUNKS)
del N_CHUNKS
del tokens_ids_as_numpy, tokens_labels_as_numpy
gc.collect()


In [None]:
# Write every chunk to disk as a float32 tensor: token ids first, then label ids.
for i, token_chunk in enumerate(tokens_batches):
    tensor_batch = torch.from_numpy(token_chunk).to(torch.float32)
    torch.save(tensor_batch, f"./tokens_tensors/tokens_batch_{i}.pt")

for i, label_chunk in enumerate(labels_batches):
    tensor_batch = torch.from_numpy(label_chunk).to(torch.float32)
    torch.save(tensor_batch, f"./labels_tensors/labels_batch_{i}.pt")

# everything is on disk now; drop the in-memory copies
del tensor_batch, tokens_batches, labels_batches

# STEP 5: MODEL

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- model dimensions ---
# input width: one unit per row of the vocab table
input_size = vocab.shape[0]
# width of the hidden state (tunable)
hidden_size = 300
# number of output classes
num_classes = 11

# --- optimisation settings ---
# number of passes (epochs) over the training data
epochs = 10
# sentences per mini-batch
batch_size = 10
# learning rate handed to the optimizer
lr = 0.01

# num_layers of the RNN is left at its default of 1 (more layers improve the result but cost time)

In [None]:

# The model: utils.RNN built from (input_size, num_classes, hidden_size). It is
# moved to `device` here, before the optimizer is created, because the training
# loop sends every batch to `device`; leaving the model on the CPU fails with a
# device mismatch as soon as CUDA is available.
model = utils.RNN(input_size, num_classes, hidden_size).to(device)
# Cross-entropy loss; target positions equal to -1 are ignored.
loss_criterion = utils.nn.CrossEntropyLoss(ignore_index=-1)
# Plain stochastic gradient descent with base learning rate `lr`.
optimizer = torch.optim.SGD(model.parameters(), lr)
# Decaying learning rate: LambdaLR multiplies the base lr by the returned factor.
# The previous factor `epoch / 10` was 0 for the whole first epoch (no learning
# at all) and then grew, the opposite of a decay. The factor now goes linearly
# from 1.0 at epoch 0 down to 1/epochs in the last epoch.
lambdalr = lambda epoch: max(0.0, 1.0 - epoch / epochs)
scheduler = lr_scheduler.LambdaLR(optimizer, lambdalr)


In [None]:

# Training / evaluation loop.
#   * chunks 0-8 on disk are the training split, chunk 9 is held out for evaluation
#     (hold-out evaluation until the real evaluation set is parsed);
#   * evaluation runs once per epoch, over every batch of the held-out chunk.
# NOTE(review): the chunks are stored as float32 and cast to int64 here; if any
# padding is stored as NaN, that cast is not well defined - confirm that the
# convertor writes a real pad id (and -1 for labels, to match ignore_index).
for epoch in range(epochs):

    model.train()

    for i in range(9):
        tokens = torch.load(f"./tokens_tensors/tokens_batch_{i}.pt",weights_only=True).type(torch.int64)
        labels = torch.load(f"./labels_tensors/labels_batch_{i}.pt",weights_only=True).type(torch.int64)

        dataset = utils.SimpleDataset(tokens,labels)
        data = DataLoader(dataset,batch_size=batch_size,shuffle=True)

        for input_tensors, label_tensors in data:

            input_tensors = input_tensors.to(device)
            label_tensors = label_tensors.to(device)

            out_tensor = model(input_tensors)

            # flatten to (positions, classes) vs (positions,) as CrossEntropyLoss expects
            loss = loss_criterion(out_tensor.view(-1,out_tensor.shape[-1]),label_tensors.view(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # ---- evaluation after every epoch, on the held-out chunk ----
    model.eval()
    all_preds = []
    all_labels = []
    # no_grad so that PyTorch does not track these forward passes for back-propagation
    with torch.no_grad():
        eval_tokens = torch.load(f"./tokens_tensors/tokens_batch_{9}.pt",weights_only=True).type(torch.int64)
        eval_labels = torch.load(f"./labels_tensors/labels_batch_{9}.pt",weights_only=True).type(torch.int64)
        eval_data = DataLoader(utils.SimpleDataset(eval_tokens,eval_labels),batch_size=batch_size)

        for inputs, targets in eval_data:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)

            # Predicted class per position: the class scores are on the last axis
            preds = torch.argmax(outputs, dim=-1)

            # The targets already are class ids (the loss above consumes them
            # directly), so they are compared as-is. Positions the loss ignores
            # (-1) are left out of the metric as well.
            keep = targets != -1
            all_preds.extend(preds[keep].cpu().numpy())
            all_labels.extend(targets[keep].cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)

    print(f"epoch {epoch}'s Accuracy:", accuracy)

    scheduler.step()
    # loss of the last training batch of this epoch
    print(f"epoch {epoch}:, loss ={loss.item()}")