In [2]:
import utils
import gc
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch.optim.lr_scheduler as lr_scheduler
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import MWETokenizer
from nltk import pos_tag
import importlib
import re
from word2number import w2n
import pickle
importlib.reload(utils)
import nltk

# STEP 1 READ THE DATA
read_dataset: takes path to JSON file that has sentences, _.EXR, _.TOP, _.TOP_DECOUPLED
and returns them as pandas.Series

In [14]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
sentences, parsed_tree, structured_sentence, decoupled_structured_sentence = utils.read_dataset("./PIZZA_train.json")

In [4]:
# don't need them so free the data
del parsed_tree
del decoupled_structured_sentence
del structured_sentence
gc.collect()

0

# STEP 2: Parse data and extract labels
In this step we build our Multiword expressions, extract the labels of every token

In [3]:
# do this if u didn't read the pizza_train.json above
structured_sentence = pd.read_csv("TOP.csv")

In [4]:
# do this if u read the TOP.csv
structured_sentence = structured_sentence.iloc[:,0]

In [5]:

pizza_orders, drink_orders, none_words = utils.extract_pizza_drinks(structured_sentence.copy())

none_words = none_words.drop_duplicates()

none_words = utils.pre_text_normalization(none_words)

none_words = none_words.reset_index(drop=True)

nones, _ = utils.tokenization(none_words)


In [14]:
# none words have a problem
# words like pizza, Negations are in this set -> extract them
pizza_class = "pizza"
negation_class = ["hold", "avoid", "hate", "without", "no","not"]
if pizza_class in nones:
    nones.remove(pizza_class)
for word in negation_class:
    if word in nones:
        nones.remove(word)
# here we save this nones, pizza, negation_classes
pd.Series(negation_class).to_csv(f"./labels/negation.csv", index=False)

In [7]:
pizza_nodes, drink_nodes = utils.extract_nodes(pizza_orders,drink_orders)
pizza_nodes, drink_nodes = utils.clean_extracted_nodes(pizza_nodes, drink_nodes)

In [8]:
pizza_number, pizza_size, topping , quantity, style = pizza_nodes
drink_number, drink_size, drink_type, container_type, volume = drink_nodes
number = pd.concat([pizza_number,drink_number])
size = pd.concat([pizza_size,drink_size])
number.drop_duplicates(inplace=True)
size.drop_duplicates(inplace=True)


In [9]:
# clean the nodes to contain only the meaningful words
vocab, _ = utils.tokenization(number)
number_vocab = set()
for word in vocab:
    try:
        _ = w2n.word_to_num(word)
        number_vocab.add(word)
    except ValueError:
        nones.add(word)
number = pd.Series(list(number_vocab))

In [12]:
vocab, _ = utils.tokenization(volume)
volume_vocab = set()
for word in vocab:
    try:
        _ = w2n.word_to_num(word)
        number[-1] = word
        number.reset_index(drop=True,inplace=True)
        number.drop_duplicates(inplace=True)
    except ValueError:
        volume_vocab.add(word)
volume = pd.Series(list(volume_vocab))

In [13]:
# i want to remove extra from size
size_vocab, _ = utils.tokenization(size)
topping_vocab, _ = utils.tokenization(topping)
quantity_vocab, _ = utils.tokenization(quantity)
style_vocab, _ = utils.tokenization(style)


In [15]:
for style in style_vocab.copy():
    if style in topping_vocab:
        style_vocab.remove(style)
    if style in nones:
        style_vocab.remove(style)
for size in size_vocab.copy():
    if size in quantity_vocab:
        size_vocab.remove(size)
nones.update({"only", "just","a","of"})
quantity_vocab.remove("only")
quantity_vocab.remove("just")
quantity_vocab.remove("a")
quantity_vocab.remove("of")
quantity_vocab.remove("not")

In [16]:
size = pd.Series(list(size_vocab))
quantity = pd.Series(list(quantity_vocab))
style = pd.Series(list(style_vocab))
topping = pd.Series(list(topping_vocab))

In [17]:
# only for drink types i will use multiword tokenizer 
# from where will he invent new drink ?
drink_type_vocab, tokens = utils.tokenization(drink_type)
mwe =[]
vocab = set()
for col in tokens.columns:
        tokens.loc[tokens[col] == 0,col] = ""
tokens = tokens.to_numpy().tolist()
for i,token_list in enumerate(tokens):
        while "" in token_list:
                token_list.remove("")
        if len(token_list) == 1:
                vocab.add(token_list[0])
        else:
                mwe.append(tuple(token_list))
                string = "_".join(token_list)
                string = re.sub(r"_+$","",string)
                vocab.add(string)
drink_type_vocab = vocab

In [18]:
tokenizer = MWETokenizer(mwe)
with open("DRINK_MWE_TOKENS.pkl", 'wb') as file:
    pickle.dump(tokenizer,file)

In [20]:
container_vocab, _ = utils.tokenization(container_type)
container_vocab.remove("a")
container_vocab.remove("in")
nones.add("in")
container_type = pd.Series(list(container_vocab))
drink_type = pd.Series(list(drink_type_vocab))


In [21]:
volume_vocab = pd.Series(list(volume_vocab))
volume_vocab.to_csv(f"./labels/volume.csv", index=False)
nones = pd.Series(list(nones))
labels = [number, size, nones, topping, quantity, style, drink_type, container_type, volume]
csv_file_names = ["number", "size", "none","topping","quantity","style","drink_type","container_type"]
for vocab, csv in zip(labels,csv_file_names):
    vocab.to_csv(f"./labels/{csv}.csv", index=False)

In [22]:

labels = [number, size, nones, topping, quantity, style, drink_type, container_type, volume]
csv_file_names = ["number", "size", "none","topping","quantity","style","drink_type","container_type"]
# merge nones that came from inside the PIZZAORDER, DRINKORDER and what u got from outside them
none_vocab = nones
mwe =[]
for label, csv in zip(labels, csv_file_names):
    if csv != "none":
        _, tokens = utils.tokenization(label)
        tokens.drop_duplicates(inplace=True)
        vocab = set()
        for col in tokens.columns:
            tokens.loc[tokens[col] == 0,col] = ""
        tokens = tokens.to_numpy().tolist()
        for i,token_list in enumerate(tokens):
            while "" in token_list:
                token_list.remove("")
            if len(token_list) != 1:
                mwe.append(tuple(token_list))
                string = " ".join(token_list)
                string = re.sub(r"\s$","",string)
                


In [23]:
# save the state of the MWETOKENIZER to be used when processing the sentences
tokenizer = MWETokenizer(mwe)
with open("MWE_TOKENS.pkl", 'wb') as file:
    pickle.dump(tokenizer,file)


In [24]:
del vocab
del labels
del csv_file_names
del pizza_nodes
del pizza_orders
del drink_nodes
del drink_orders


# STEP 3 : preprocess data
##### What we should take into consideration? 
1- Word Normalization  
2- Word Tokenization  
Why we won't use Sentence segmentation?  
It's useless, orders are one sentence question no clear punctuation exist

In [5]:
# NORMALIZATION
normalized_sentence = utils.pre_text_normalization(sentences.copy())

In [6]:
normalized_sentence.drop_duplicates(inplace=True)
normalized_sentence.shape

(1293560,)

In [7]:
# tokenization
vocab, _ = utils.tokenization(normalized_sentence,tokenizesentences=1)


In [8]:
# sampling 20 sentence for every word in vocab, I hope this makes acceptable result
# the sampling will take 2 - 4 minutes (225 word * sample cost)
sampled_normalized_sentences = pd.Series()
for word in vocab:
    x = word
    sampled = normalized_sentence[normalized_sentence.str.contains(x)].to_numpy()
    np.random.shuffle(sampled)
    random_sample = sampled[:20]
    series = pd.Series(random_sample)
    sampled_normalized_sentences = pd.concat([sampled_normalized_sentences,series],axis=0)

sampled_normalized_sentences.drop_duplicates(inplace=True)
sampled_normalized_sentences.shape

(4214,)

In [9]:
_, tokenized_sentences = utils.tokenization(sampled_normalized_sentences, tokenizesentences=1)

In [10]:
# checkpoint: Normalization and tokenization of sentences
vocab_as_series = pd.Series(list(vocab))
vocab_as_series.to_csv("vocab.csv",index=False)
tokenized_sentences.to_csv("tokenized_sentences.csv",index=False)

In [37]:
# to run pos_tag you need to download the nltk_data first -> nltk.download()
sentence = tokenized_sentences.iloc[1]
sentence.replace(["0",0],"PAD",inplace=True)
pos_tag(["i","want","one","large","pizza"])

[('i', 'NN'), ('want', 'VBP'), ('one', 'CD'), ('large', 'JJ'), ('pizza', 'NN')]

In [38]:
del vocab
del tokenized_sentences

# STEP 4: Encode The tokens and label them

In [3]:
vocab, vocab_encoder, label_encoder = utils.create_labeled_vocab(None)

In [4]:
# made sure that the encoding is correct
print(vocab_encoder.categories_[0][([vocab[vocab["tokens"] == "bbq"].loc[vocab[vocab["tokens"] == "bbq"].index[0],"encoded_tokens"]])])
print(label_encoder.categories_[0][([vocab[vocab["tokens"] == "bbq"].loc[vocab[vocab["tokens"] == "bbq"].index[0],"encoded_labels"]])])


['bbq']
['topping']


In [6]:
tokenized_sentences = pd.read_csv("tokenized_sentences.csv")
tokenized_sentences.replace([0,"0"],"PAD",inplace=True)

In [7]:
convertor = utils.conversions(vocab)

In [8]:
tokens_as_ids = tokenized_sentences.map(convertor.word2id)

In [9]:
labels_as_ids = tokenized_sentences.map(convertor.word2labels)


In [10]:
tokens_as_ids.replace(1,0,inplace=True)

In [11]:
# لخبطة المودل هغير كام كلمة واحطهم ب UNK
unknown_id = convertor.word2id("UNK")
words_for_perplexion = ["ounce","peppperoni","fourteen","napolitana","broccoli"]
for i, word in enumerate(words_for_perplexion):
    words_for_perplexion[i] = convertor.word2id(word)
    tokens_as_ids.replace(words_for_perplexion[i],unknown_id,inplace=True)

In [12]:
# from here on screw Pandas we only work with numpy, and tensors
tokens_ids_as_numpy = tokens_as_ids.to_numpy()
tokens_labels_as_numpy = labels_as_ids.to_numpy()

In [13]:
del tokenized_sentences
del tokens_as_ids
del labels_as_ids
gc.collect()

1997

In [14]:
# check i did encode correctly:
encode_test = tokens_ids_as_numpy[0][~np.isnan(tokens_ids_as_numpy[0])] 
series = vocab[vocab["tokens"] == vocab_encoder.categories_[0][int(tokens_ids_as_numpy[0][3])] ]

index = series.index[0]

series1, series2 = series.loc[index,"encoded_labels"] , series.loc[index,"encoded_tokens"] 

print(vocab_encoder.categories_[0][series2])
print(label_encoder.categories_[0][series1])
# print(len(tokenized_sentences.loc[0]) == len(tokenized_sentences.loc[0]))


with
none


In [15]:
# the memory will be bad when we transform the numpy to tensor
# we need to split them and save them on disk so we can load batches when we train
tokens_tensor = torch.from_numpy(tokens_ids_as_numpy).type(torch.float32)
torch.save(tokens_tensor,f"./tokens_tensors.pt")
labels_tensors = torch.from_numpy(tokens_labels_as_numpy).type(torch.float32)
torch.save(labels_tensors,f"./labels_tensors.pt")

# STEP 5: MODEL

In [16]:
print(vocab.shape[0])

226


In [17]:
# determine what to run my tensors on
device = torch.device("cuda" if  torch.cuda.is_available() else "cpu")
# my one hot encoders DETERMINSTIC ON WHAT CRITERIA I WILL TRAIN ON
input_size = vocab.shape[0]
# this is a parameter 
hidden_size = 512
# output : num of classes
num_classes = 12
# num of trials (epochs)
epochs = 20
# Batch size = ? 
batch_size = 8
# learning_rate
lr = 0.5

# num_layers in RNN default is 1 (increasing layers improve result but worsen the time)

In [39]:

# What is the model i will use
model = utils.RNN(input_size,num_classes,hidden_size)
# loss criteria here i use CEloss
loss_criterion = utils.nn.CrossEntropyLoss(ignore_index=-1)
# stochastic gradient descent
optimizer = torch.optim.SGD(model.parameters(),lr, momentum=0.9)
# they say decaying learning rate is better than fixed one so i will use learning rate scheduler
lambdalr = lambda epoch: epoch / 10
scheduler = lr_scheduler.LambdaLR(optimizer,lambdalr)


In [40]:

tokens = torch.load(f"./tokens_tensors.pt",weights_only=True).type(torch.int64)
labels = torch.load(f"./labels_tensors.pt",weights_only=True).type(torch.int64)

dataset = utils.SimpleDataset(tokens,labels)
        
        
data = DataLoader(dataset,batch_size=batch_size,shuffle = True)

In [41]:

for epoch in range(epochs):
    total_loss=0
    model.train()

    for input_tensors, label_tensors in data:
            
        input_tensors = input_tensors.to(device)
            
        label_tensors = label_tensors.to(device)
           
        out_tensor = model(input_tensors)
        
        loss = loss_criterion(out_tensor.view(-1,out_tensor.shape[-1]),label_tensors.view(-1))
            
        optimizer.zero_grad()
        
        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),max_norm=1)
        optimizer.step()

        # Here should be the evaluation after every epoch
        # no grad so that pytorch doesn't insert it in his calculations
    all_preds = []
    all_labels = []
    model.eval()
    with torch.no_grad():
        # this is a hold-k evaluation (where we hold k from training set and evaluate based on it )
        # till i parse the evaluation
        tokens = torch.load(f"./tokens_tensors/tokens_batch_{0}.pt",weights_only=True).type(torch.int)
        labels = torch.load(f"./labels_tensors/labels_batch_{0}.pt",weights_only=True).type(torch.int)
        eval_dataset = utils.SimpleDataset(tokens,labels)
        eval_data = DataLoader(eval_dataset,batch_size=1024)
        for inputs, labels in eval_data:
            inputs, labels = inputs.to(device), labels.to(device)

                # Forward pass
            outputs = model(inputs)

            # Get predictions
            preds = torch.argmax(outputs, dim=-1)  # Shape: (batch_size, seq_length, features) dim = 2 == -1 : features

            # Flatten predictions and labels
            all_preds.extend(preds.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())
        
        accuracy = accuracy_score(all_labels, all_preds)

        print(f"epoch {epoch}'s Accuracy:", accuracy)

    scheduler.step()
    print(f"epoch {epoch}:, loss ={total_loss}")

epoch 0's Accuracy: 0.09057413476086765
epoch 0:, loss =5307.789555549622
epoch 1's Accuracy: 0.12119454225949372
epoch 1:, loss =163.5378072372987
epoch 2's Accuracy: 0.12000668308602523
epoch 2:, loss =3.20824131160407
epoch 3's Accuracy: 0.1176818511809047
epoch 3:, loss =1.2558851849371422
epoch 4's Accuracy: 0.11607333075508695
epoch 4:, loss =1.6797350928343349
epoch 5's Accuracy: 0.1033498544647764
epoch 5:, loss =2.146426518692806
epoch 6's Accuracy: 0.11548677970241608
epoch 6:, loss =0.9004268102767128
epoch 7's Accuracy: 0.1113156180667223
epoch 7:, loss =3.05394951676044


KeyboardInterrupt: 

In [32]:
yarab = "one pepsi in a bottle and also one UNK sauce pizza"
yarab = pd.Series(yarab)
yarab = utils.pre_text_normalization(yarab)
_, yarab = utils.tokenization(yarab)
print( "word after tokenization:\n", list(yarab.iloc[0]))
yarab = yarab.map(convertor.word2id)
yarab = yarab.to_numpy()
plz = torch.from_numpy(yarab).to(device)
out_plz = model(plz)
out_plz = out_plz.argmax(dim =2)
for plz2 in out_plz[0]:
    print(convertor.id2label(plz2.item()),end=" ")

word after tokenization:
 ['one', 'pepsi', 'in', 'a', 'bottle', 'and', 'also', 'one', 'unk', 'sauce', 'pizza']
number drink_type none none container_type none none number volume style pizza 