In [1]:
import utils
import gc
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch.optim.lr_scheduler as lr_scheduler
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import MWETokenizer
import importlib
import re
from word2number import w2n
import pickle
importlib.reload(utils)


<module 'utils' from 'c:\\Users\\abdo_\\Downloads\\programs\\NLP\\Pizzaria\\utils.py'>

# STEP 1 READ THE DATA
read_dataset: takes path to JSON file that has sentences, _.EXR, _.TOP, _.TOP_DECOUPLED
and returns them as pandas.Series

In [3]:
sentences, parsed_tree, structured_sentence, decoupled_structured_sentence = utils.read_dataset("./PIZZA_train.json")

In [4]:
# don't need them so free the data
del parsed_tree
del decoupled_structured_sentence
gc.collect()

169

# STEP 2: Parse data and extract labels
In this step we build our Multiword expressions, extract the labels of every token

In [None]:
# do this if u didn't read the pizza_train.json above
structured_sentence = pd.read_csv("TOP.csv")

In [None]:
# do this if u read the TOP.csv
structured_sentence = structured_sentence.iloc[:,0]

In [None]:

pizza_orders, drink_orders, none_words = utils.extract_pizza_drinks(structured_sentence.copy())

none_words = none_words.drop_duplicates()

none_words = utils.pre_text_normalization(none_words)

none_words = none_words.reset_index(drop=True)

nones, _ = utils.tokenization(none_words)


In [None]:
# none words have a problem
# words like pizza, Negations are in this set -> extract them
pizza_class = "pizza"
negation_class = ["hold", "avoid", "hate", "without", "no"]
nones.remove(pizza_class)
for word in negation_class:
    nones.remove(word)
# here we save this nones, pizza, negation_classes
pd.Series(negation_class).to_csv(f"./labels/negation.csv", index=False)

In [None]:
pizza_nodes, drink_nodes = utils.extract_nodes(pizza_orders,drink_orders)
pizza_nodes, drink_nodes = utils.clean_extracted_nodes(pizza_nodes, drink_nodes)

In [None]:
pizza_number, pizza_size, topping , quantity, style = pizza_nodes
drink_number, drink_size, drink_type, container_type, volume = drink_nodes
number = pd.concat([pizza_number,drink_number])
size = pd.concat([pizza_size,drink_size])
number.drop_duplicates(inplace=True)
size.drop_duplicates(inplace=True)


In [None]:
# clean the nodes to contain only the meaningful words
vocab, _ = utils.tokenization(number)
number_vocab = set()
for word in vocab:
    try:
        _ = w2n.word_to_num(word)
        number_vocab.add(word)
    except ValueError:
        nones.add(word)
number = pd.Series(list(number_vocab))

In [None]:
vocab, _ = utils.tokenization(volume)
volume_vocab = set()
for word in vocab:
    try:
        _ = w2n.word_to_num(word)
        number[-1] = word
        number.reset_index(drop=True,inplace=True)
        number.drop_duplicates(inplace=True)
    except ValueError:
        volume_vocab.add(word)
volume = pd.Series(list(volume_vocab))

In [None]:
# i want to remove extra from size
size_vocab, _ = utils.tokenization(size)
topping_vocab, _ = utils.tokenization(topping)
quantity_vocab, _ = utils.tokenization(quantity)
style_vocab, _ = utils.tokenization(style)


In [None]:
for style in style_vocab.copy():
    if style in topping_vocab:
        style_vocab.remove(style)
    if style in nones:
        style_vocab.remove(style)
for size in size_vocab.copy():
    if size in quantity_vocab:
        size_vocab.remove(size)
nones.update({"only", "just"})
quantity_vocab.remove("only")
quantity_vocab.remove("just")
quantity_vocab.remove("not")

In [None]:
size = pd.Series(list(size_vocab))
quantity = pd.Series(list(quantity_vocab))
style = pd.Series(list(style_vocab))
topping = pd.Series(list(topping_vocab))

In [None]:
# only for drink types i will use multiword tokenizer 
# from where will he invent new drink ?
drink_type_vocab, tokens = utils.tokenization(drink_type)
mwe =[]
vocab = set()
for col in tokens.columns:
        tokens.loc[tokens[col] == 0,col] = ""
tokens = tokens.to_numpy().tolist()
for i,token_list in enumerate(tokens):
        while "" in token_list:
                token_list.remove("")
        if len(token_list) == 1:
                vocab.add(token_list[0])
        else:
                mwe.append(tuple(token_list))
                string = "_".join(token_list)
                string = re.sub(r"_+$","",string)
                vocab.add(string)
drink_type_vocab = vocab

In [None]:
tokenizer = MWETokenizer(mwe)
with open("DRINK_MWE_TOKENS.pkl", 'wb') as file:
    pickle.dump(tokenizer,file)

In [None]:
container_vocab, _ = utils.tokenization(container_type)
container_vocab.remove("one")
container_vocab.remove("in")
nones.add("in")
container_type = pd.Series(list(container_vocab))
drink_type = pd.Series(list(drink_type_vocab))


In [None]:
volume_vocab = pd.Series(list(volume_vocab))
volume_vocab.to_csv(f"./labels/volume.csv", index=False)
nones = pd.Series(list(nones))
labels = [number, size, nones, topping, quantity, style, drink_type, container_type, volume]
csv_file_names = ["number", "size", "none","topping","quantity","style","drink_type","container_type"]
for vocab, csv in zip(labels,csv_file_names):
    vocab.to_csv(f"./labels/{csv}.csv", index=False)

In [None]:

labels = [number, size, nones, topping, quantity, style, drink_type, container_type, volume]
csv_file_names = ["number", "size", "none","topping","quantity","style","drink_type","container_type"]
# merge nones that came from inside the PIZZAORDER, DRINKORDER and what u got from outside them
none_vocab = nones
mwe =[]
for label, csv in zip(labels, csv_file_names):
    if csv != "none":
        _, tokens = utils.tokenization(label)
        tokens.drop_duplicates(inplace=True)
        vocab = set()
        for col in tokens.columns:
            tokens.loc[tokens[col] == 0,col] = ""
        tokens = tokens.to_numpy().tolist()
        for i,token_list in enumerate(tokens):
            while "" in token_list:
                token_list.remove("")
            if len(token_list) != 1:
                mwe.append(tuple(token_list))
                string = " ".join(token_list)
                string = re.sub(r"\s$","",string)
                


In [None]:
# save the state of the MWETOKENIZER to be used when processing the sentences
tokenizer = MWETokenizer(mwe)
with open("MWE_TOKENS.pkl", 'wb') as file:
    pickle.dump(tokenizer,file)


In [None]:
del vocab
del labels
del csv_file_names
del pizza_nodes
del pizza_orders
del drink_nodes
del drink_orders


# STEP 3 : preprocess data
##### What we should take into consideration? 
1- Word Normalization  
2- Word Tokenization  
Why we won't use Sentence segmentation?  
It's useless, orders are one sentence question no clear punctuation exist

In [5]:
# NORMALIZATION
normalized_sentence = utils.pre_text_normalization(sentences.copy())

In [6]:
# tokenization
vocab, tokenized_sentences = utils.tokenization(normalized_sentence,tokenizesentences=1)


In [9]:
# checkpoint: Normalization and tokenization of sentences
vocab_as_series = pd.Series(list(vocab))
vocab_as_series.to_csv("vocab.csv",index=False)
tokenized_sentences.to_csv("tokenized_sentences.csv",index=False)

In [10]:
del vocab
del tokenized_sentences

# STEP 4: Encode The tokens and label them

In [2]:
vocab, vocab_encoder, label_encoder = utils.create_labeled_vocab(None)

In [91]:
# made sure that the encoding is correct
print(vocab_encoder.categories_[0][([vocab[vocab["tokens"] == "bbq"].loc[vocab[vocab["tokens"] == "bbq"].index[0],"encoded_tokens"]])])
print(label_encoder.categories_[0][([vocab[vocab["tokens"] == "bbq"].loc[vocab[vocab["tokens"] == "bbq"].index[0],"encoded_labels"]])])


['bbq']
['topping']


In [92]:
tokenized_sentences = pd.read_csv("tokenized_sentences.csv")
tokenized_sentences.replace([0,"0"],"PAD",inplace=True)

  tokenized_sentences = pd.read_csv("tokenized_sentences.csv")


In [16]:
convertor = utils.conversions(vocab)

In [94]:
tokens_as_ids = tokenized_sentences.map(convertor.word2id)

In [95]:
labels_as_ids = tokenized_sentences.map(convertor.word2labels)


In [98]:
tokens_as_ids.replace(1,0,inplace=True)

In [100]:
# لخبطة المودل هغير كام كلمة واحطهم ب UNK
unknown_id = convertor.word2id("UNK")
words_for_perplexion = ["ounce","peppperoni","fourteen","napolitana","broccoli"]
for i, word in enumerate(words_for_perplexion):
    words_for_perplexion[i] = convertor.word2id(word)
    tokens_as_ids.replace(words_for_perplexion[i],unknown_id,inplace=True)

In [101]:
# from here on screw Pandas we only work with numpy, and tensors
tokens_ids_as_numpy = tokens_as_ids.to_numpy()
tokens_labels_as_numpy = labels_as_ids.to_numpy()

In [102]:
del tokenized_sentences
del tokens_as_ids
del labels_as_ids
gc.collect()

7

In [103]:
# check i did encode correctly:
encode_test = tokens_ids_as_numpy[0][~np.isnan(tokens_ids_as_numpy[0])] 
series = vocab[vocab["tokens"] == vocab_encoder.categories_[0][int(tokens_ids_as_numpy[0][3])] ]

index = series.index[0]

series1, series2 = series.loc[index,"encoded_labels"] , series.loc[index,"encoded_tokens"] 

print(vocab_encoder.categories_[0][series2])
print(label_encoder.categories_[0][series1])
# print(len(tokenized_sentences.loc[0]) == len(tokenized_sentences.loc[0]))


one
number


In [104]:
# the memory will be bad when we transform the numpy to tensor
# we need to split them and save them on disk so we can load batches when we train
tokens_tensor = torch.from_numpy(tokens_ids_as_numpy).type(torch.float32)
torch.save(tokens_tensor,f"./tokens_tensors.pt")
labels_tensors = torch.from_numpy(tokens_labels_as_numpy).type(torch.float32)
torch.save(labels_tensors,f"./labels_tensors.pt")

# STEP 5: MODEL

In [3]:
print(vocab.shape[0])

225


In [30]:
# determine what to run my tensors on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# my one hot encoders DETERMINSTIC ON WHAT CRITERIA I WILL TRAIN ON
input_size = vocab.shape[0]
# this is a parameter 
hidden_size = 400
# output : num of classes
num_classes = 12
# num of trials (epochs)
epochs = 20
# Batch size = ? 
batch_size = 32
# learning_rate
lr = 0.5

# num_layers in RNN default is 1 (increasing layers improve result but worsen the time)

In [31]:

# What is the model i will use
model = utils.RNN(input_size,num_classes,hidden_size)
# loss criteria here i use CEloss
loss_criterion = utils.nn.CrossEntropyLoss(ignore_index=-1)
# stochastic gradient descent
optimizer = torch.optim.SGD(model.parameters(),lr)
# they say decaying learning rate is better than fixed one so i will use learning rate scheduler
lambdalr = lambda epoch: epoch / 10
scheduler = lr_scheduler.LambdaLR(optimizer,lambdalr)


In [None]:
tokens = torch.load(f"./tokens_tensors/tokens_batch_{4}.pt",weights_only=True).type(torch.int64)
labels = torch.load(f"./tokens_tensors/tokens_batch_{4}.pt",weights_only=True).type(torch.int64)

dataset = utils.SimpleDataset(tokens,labels)
subset_indices = torch.randperm(dataset.__len__())[:40000]
subset_sampler = SubsetRandomSampler(subset_indices)
data = DataLoader(dataset,batch_size=batch_size,sampler=subset_sampler)

In [None]:

for epoch in range(epochs):
    total_loss=0
    model.train()
        
    for input_tensors, label_tensors in data:
            
        input_tensors = input_tensors.to(device)
            
        label_tensors = label_tensors.to(device)
           
        out_tensor = model(input_tensors)
            
        loss = loss_criterion(out_tensor.view(-1,out_tensor.shape[-1]),label_tensors.view(-1))
            
        optimizer.zero_grad()
        
        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),max_norm=1)
        optimizer.step()

        # Here should be the evaluation after every epoch
        # no grad so that pytorch doesn't insert it in his calculations
    all_preds = []
    all_labels = []
    model.eval()
    with torch.no_grad():
        # this is a hold-k evaluation (where we hold k from training set and evaluate based on it )
        # till i parse the evaluation
        tokens = torch.load(f"./tokens_tensors/tokens_batch_{4}.pt",weights_only=True).type(torch.int)
        labels = torch.load(f"./tokens_tensors/tokens_batch_{4}.pt",weights_only=True).type(torch.int)
        eval_dataset = utils.SimpleDataset(tokens,labels)
        eval_data = DataLoader(eval_dataset,batch_size=1024)
        for inputs, labels in data:
            inputs, labels = inputs.to(device), labels.to(device)

                # Forward pass
            outputs = model(inputs)

            # Get predictions
            preds = torch.argmax(outputs, dim=2)  # Shape: (batch_size, seq_length, features) dim = 2 : features

            # Flatten predictions and labels
            all_preds.extend(preds.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())
        accuracy = accuracy_score(all_labels, all_preds)

        print(f"epoch {epoch}'s Accuracy:", accuracy)

    scheduler.step()
    print(f"epoch {epoch}:, loss ={total_loss}")

epoch 0's Accuracy: 0.04274583333333333
epoch 0:, loss =779.0084149837494
epoch 1's Accuracy: 0.37904166666666667
epoch 1:, loss =509.5594679117203
epoch 2's Accuracy: 0.4665
epoch 2:, loss =203.79303359985352
epoch 3's Accuracy: 0.4982375
epoch 3:, loss =76.54760914295912
epoch 4's Accuracy: 0.5049958333333333
epoch 4:, loss =29.761675644665956
epoch 5's Accuracy: 0.5073208333333333
epoch 5:, loss =12.42368852160871
epoch 6's Accuracy: 0.5084125
epoch 6:, loss =5.88608543202281
epoch 7's Accuracy: 0.508675
epoch 7:, loss =3.167883210349828
epoch 8's Accuracy: 0.5088375
epoch 8:, loss =1.8957276444416493
epoch 9's Accuracy: 0.5088541666666667
epoch 9:, loss =1.2433592724846676
epoch 10's Accuracy: 0.5089583333333333
epoch 10:, loss =0.8689509643008932
epoch 11's Accuracy: 0.5089791666666666
epoch 11:, loss =0.6338998822611757
epoch 12's Accuracy: 0.5089833333333333
epoch 12:, loss =0.48225429040030576
epoch 13's Accuracy: 0.5089791666666666
epoch 13:, loss =0.37513921910431236
epoch 14

In [None]:
yarab = "i want four cheese pizzas and i want"
yarab = pd.Series(yarab)
yarab = utils.pre_text_normalization(yarab)
_, yarab = utils.tokenization(yarab)
yarab = yarab.map(convertor.word2id)
yarab = yarab.to_numpy()
plz = torch.from_numpy(yarab).to(device)
out_plz = model(plz)
out_plz = out_plz.argmax(dim =2)
for plz2 in out_plz[0]:
    print(convertor.id2label(plz2.item()))

none
none
number
topping
pizza
none
number
number
pizza
