In [16]:
import pandas as pd
import numpy as np
from num2words import num2words
from nltk.stem import SnowballStemmer
from nltk.tokenize import TreebankWordTokenizer
import re
import torch
import torch.nn as nn
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset, DataLoader
import torch.optim.lr_scheduler as lr_scheduler

In [17]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
def read_dataset(path: str):
    """Load a JSON-lines dataset and split it into one Series per column.

    Meant to be shared by the training / evaluation / test files.

    Args:
        path: Location of a JSON-lines file (one JSON object per line).

    Returns:
        A list of ``pd.Series``, one per column of the file, in column order.
        Per the original note the training file is expected to yield four
        columns (sentence, EXR, TOP, TOP_DECOUPLED).
    """
    frame = pd.read_json(path, lines=True)
    # Columns stay as pandas Series for now (no conversion to numpy).
    return [frame[column_name] for column_name in frame.columns.tolist()]

In [4]:
def pre_text_normalization(sentences : pd.Series, flag = -1):
    """Normalize order text (or label text) before tokenization.

    Takes a Series of strings and returns a Series of strings. The steps run
    in a fixed order because later rules rely on earlier ones (for example the
    "zeroe" fix cleans up after the plural-stripping rule).

    Args:
        sentences: Series of raw strings.
        flag: Leave at the default ``-1`` to lower-case the text first. Any
            other value skips lower-casing; this is used when the function is
            applied to label strings whose case must not be touched.

    Returns:
        The normalized Series.
    """
    # Lower-case, unless the caller is normalizing labels (flag != -1).
    if flag == -1:
        sentences = sentences.str.lower()

    # SIZES
    # No single size format is enforced; things like "party - size" and
    # "party size" are unified simply by turning '-' into a space ...
    sentences = sentences.str.replace("-", " ", regex=False)
    # ... and collapsing every run of whitespace that this leaves behind.
    # (`\s{2}` only collapsed exactly two characters, so the three spaces
    # produced by "party - size" still left a double space.)
    sentences = sentences.str.replace(r"\s{2,}", " ", regex=True)

    # Sometimes pizza is called "pie": we only sell pizza.
    # Anchored on word boundaries so words that merely contain "pie"
    # (e.g. "pieces") are left alone; the plural "s" is kept so the
    # plural-stripping rule below still sees it.
    sentences = sentences.str.replace(r"\bpie(s?)\b", r"pizza\1", regex=True)

    # "gluten - free" is left as is for now (may become gluten_free later).

    # QUANTITIES
    # Phrases meaning "less topping" (not much, not many, a little bit of,
    # just a tiny bit of, just a bit, only a little, a bit, a little) and
    # phrases meaning "more" (a lot of, lots of) are deliberately left
    # untouched for now.

    # A quantity like "a"/"an" pizza becomes "one", except when it starts one
    # of the quantity phrases above; "only one" / "just one" -> "one".
    sentences = sentences.str.replace(r"\ban?(?!\s+(bit|tiny|lot|little))\b","one",regex=True)
    sentences = sentences.str.replace(r"\b(?:only|just)\sone\b","one",regex=True)

    # Quantities appear both as digits and as words ("3 pies", "three pies"):
    # spell digits out. num2words hyphenates compounds ("twenty-one"), so its
    # output gets the same hyphen -> space treatment as the rest of the text.
    sentences = sentences.str.replace(
        r"\b([0-9]+)\b",
        lambda match: num2words(int(match.group(1))).replace("-", " "),
        regex=True,
    )

    # NEGATION
    # Negation shows up as: without, hold the, with no(t), no, avoid, hate.
    # These are not rewritten to a single form for now, so that the model can
    # try to learn the context of negation.

    # TOPPINGS
    # BBQ usually appears paired (bbq_chicken, bbq_sauce, bbq_pulled_pork, ...)
    # but merging it here would be an oversimplification; left to the
    # sequence model. To be decided later.

    # DRINKS
    # People say both "pepsi" and "pepsis": strip a trailing plural "s".
    sentences = sentences.str.replace(r"\b(\w\w+)e?s\b",r"\1",regex=True)
    # "san pellegrino" is sometimes written as just "pellegrino".
    sentences = sentences.str.replace(r"\bsan\s(pellegrino)\b",r"\1",regex=True)
    # "zeroes" comes out of the plural rule as "zeroe"; finish the job.
    sentences = sentences.str.replace(r"\b(zero)e\b",r"\1",regex=True)
    # "iced" -> "ice".
    sentences = sentences.str.replace(r"\b(ice)d\b",r"\1",regex=True)
    # DOCTOR PEPPER: "dr" -> "doctor", "peper" -> "pepper".
    sentences = sentences.str.replace(r"\bdr\b",r"doctor",regex=True)
    sentences = sentences.str.replace(r"\bpeper\b",r"pepper",regex=True)

    return sentences

In [5]:
def snow_ball_stemmer(vocab):
    """Stem words with NLTK's English Snowball stemmer.

    Args:
        vocab: Either a ``set`` of words, or an object with ``.apply`` whose
            elements are iterables of words (presumably a Series of token
            lists - confirm against the caller).

    Returns:
        For a set: a new set holding the stem of every word (stems that
        coincide collapse into one entry). Otherwise: the result of
        ``vocab.apply`` where each element is replaced by the list of its
        stemmed words.
    """
    stem = SnowballStemmer("english").stem
    if not isinstance(vocab, set):
        return vocab.apply(lambda tokens: [stem(token) for token in tokens])
    return {stem(word) for word in vocab}

In [3]:
def tokenization(sentences: pd.Series):
    """Tokenize sentences with the Penn Treebank tokenizer.

    Args:
        sentences: Series of sentence strings. Missing values are treated as
            empty sentences.

    Returns:
        A tuple ``(vocab, padded_tokenized_sentences)``:
          * ``vocab`` - set of the unique tokens found when all sentences are
            joined with spaces and tokenized as one string.
          * ``padded_tokenized_sentences`` - DataFrame with one row per
            sentence and one column per token position; rows shorter than the
            longest sentence are right-padded with the integer ``0``.
    """
    # Blank out missing sentences *before* any string work: both ' '.join(...)
    # and tokenizer.tokenize(...) fail on NaN, so filling afterwards is too late.
    sentences = sentences.fillna("")

    # used penn treebank tokenizer
    tokenizer = TreebankWordTokenizer()

    # Merge the whole series into one string so the vocab is extracted in a
    # single tokenizer pass, then keep the unique tokens.
    vocab = set(tokenizer.tokenize(' '.join(sentences)))

    tokenized = sentences.apply(tokenizer.tokenize)

    # Right-pad every token list with 0 so all rows have the same width.
    max_length = int(tokenized.map(len).max()) if len(tokenized) else 0
    padded_rows = [tokens + [0] * (max_length - len(tokens)) for tokens in tokenized]
    padded_tokenized_sentences = pd.DataFrame(padded_rows)

    # negation check regex : \b(?<=not?)(.*?)(?=(\.|,|$|and))\b (for the future maybe ?)
    return vocab, padded_tokenized_sentences

In [7]:
def extract_pizza_drinks(parsed_tree: pd.Series): # the tree is a SERIES of format that is like this (ORDER (DRINK,))....
    """Split parsed order trees into pizza-order bodies, drink-order bodies and loose words.

    Args:
        parsed_tree: Series of bracketed parse strings such as
            ``(ORDER i want (PIZZAORDER (NUMBER one ) ... ) )``.

    Returns:
        A tuple ``(pizza_orders, drink_orders, none_words)`` of Series, each
        re-indexed from 0 (rows are NOT aligned with ``parsed_tree``):
          * ``pizza_orders`` - the text inside each matched ``(PIZZAORDER ...)``
            node, e.g. ``(NUMBER one )(SIZE large ) pizza``.
          * ``drink_orders`` - the same for ``(DRINKORDER ...)`` nodes.
          * ``none_words`` - stripped text found right after ``(ORDER `` or
            after a ``)``, i.e. words outside any labelled node
            (``i``, ``'d``, ``want`` ...). Empty strings are not removed.
    """
    def _wrap_loose_words(orders: pd.Series) -> pd.Series:
        # Text sitting between two bracketed nodes (after a ')' and before the
        # next '(') is wrapped as "(NONE text)" so it can be labelled later,
        # e.g. "pizza" or "can".
        # NOTE(review): text after the LAST ')' has no following '(' and is
        # therefore left unwrapped - confirm that is intended.
        wrapped = orders.replace(r"(?<=\))(.*?)(?=\()",r"(NONE \1)",regex=True)
        # The pattern above also wraps the bare whitespace between two nodes,
        # producing "(NONE  )"; drop those empty wrappers again.
        return wrapped.replace(r"\(NONE\s*\)", "",regex=True)

    # (ORDER i want to eat (PIZZAORDER)) -> this regex extracts "i want to eat"
    extracted_words_before_parsing = r"(?:(?:\(ORDER\s+)|(?:\)))([^()]+)(?=[\s(]+)"
    none_words = parsed_tree.str.extractall(extracted_words_before_parsing).iloc[:,0].str.strip()
    none_words = none_words.dropna().reset_index(drop=True)

    # Two capture groups: the body of a PIZZAORDER node and the body of a
    # DRINKORDER node. The wrappers around them are non-capturing (and
    # optional) so only the two bodies end up as columns 0 and 1 of the
    # extractall result.
    # NOTE(review): the pattern starts and ends with a literal space, so a
    # node is only found when surrounded by spaces - confirm against the data.
    pizza_drink_order_patterns = r" (?:\(PIZZAORDER\s*((?:\(?[^\)]+\)?\s*)*)\)\s*)?(?:\(DRINKORDER\s*((?:\(?[^\)]+\)?\s*)*)\)\s*)? "

    # DataFrame of two columns -> 0: pizza orders, 1: drink orders
    extracted_PIZZA_DRINK = parsed_tree.str.extractall(pizza_drink_order_patterns)

    # Drop the rows where that kind of order is absent, then wrap loose words.
    pizza_orders = _wrap_loose_words(extracted_PIZZA_DRINK[0].dropna().reset_index(drop=True))
    drink_orders = _wrap_loose_words(extracted_PIZZA_DRINK[1].dropna().reset_index(drop=True))

    return pizza_orders, drink_orders, none_words


In [8]:
def extract_nodes(pizza_orders:pd.Series,drink_orders:pd.Series):
    """Pull the text stored under each label out of order strings.

    Both inputs are Series of strings shaped like ``(TOPPING ...)(STYLE ...)``.

    Returns:
        ``(pizza_nodes, drink_nodes)``. Each is a list with one single-column
        DataFrame per label, in the order of the label lists below, or an
        empty list when ``np.any`` of the corresponding input is falsy (for
        example an empty Series). Every DataFrame has one row per input row.
        NOTE(review): ``str.extract`` keeps only the FIRST occurrence of a
        label in each row, so a second ``(TOPPING ...)`` in the same order is
        not returned - confirm this is acceptable.
    """
    def _first_match_per_label(orders, label_names):
        # One DataFrame per label: the text between "(LABEL" and the next ")".
        return [
            orders.str.extract(r"(?<=\("+label+r")(.*?)(?=\))")
            for label in label_names
        ]

    pizza_nodes = []
    if np.any(pizza_orders) :
        pizza_nodes = _first_match_per_label(
            pizza_orders, ["NUMBER","SIZE","NONE","TOPPING","QUANTITY","STYLE"]
        )

    drink_nodes = []
    if np.any(drink_orders) :
        drink_nodes = _first_match_per_label(
            drink_orders, ["NUMBER","SIZE","NONE", "DRINKTYPE","CONTAINERTYPE","VOLUME"]
        )

    return pizza_nodes, drink_nodes

In [9]:
def _clean_node_values(frames):
    """Merge the first column of each frame into one normalized, de-duplicated Series."""
    columns = [frame.iloc[:, 0].dropna() for frame in frames]
    if not columns:
        # Nothing was extracted for this label at all.
        return pd.Series([], dtype=object)

    values = pd.concat(columns, axis=0, ignore_index=True)
    if values.empty:
        return values.reset_index(drop=True)

    # Dropping duplicates first only makes the normalization pass cheaper.
    values = values.drop_duplicates(keep='first')
    values = pre_text_normalization(values)
    # Strip BEFORE the final de-duplication: values that differ only by
    # surrounding whitespace would otherwise survive as duplicates.
    values = values.str.strip()
    values = values.drop_duplicates(keep='first')
    return values.reset_index(drop=True)


def clean_extracted_nodes(pizza_nodes: list[pd.DataFrame], drink_nodes: list[pd.DataFrame]):
    """Turn the raw per-label extraction results into clean vocabularies.

    The extracted nodes contain many NaNs and repeated values; this drops the
    NaNs, normalizes the text with ``pre_text_normalization``, strips it and
    removes duplicates, so the result can be used for labelling.

    Args:
        pizza_nodes: Per-label single-column DataFrames for pizza orders, in
            the order NUMBER, SIZE, NONE, TOPPING, QUANTITY, STYLE (as built by
            ``extract_nodes``). May be an empty list.
        drink_nodes: Same for drink orders, in the order NUMBER, SIZE, NONE,
            DRINKTYPE, CONTAINERTYPE, VOLUME. May be an empty list.

    Returns:
        ``(new_pizza_nodes, new_drink_nodes)``:
          * ``new_pizza_nodes`` - six Series. The first three (NUMBER, SIZE,
            NONE) merge the pizza and drink values because those labels exist
            for both; the last three are pizza-only.
          * ``new_drink_nodes`` - three Series for the drink-only labels
            (positions 3..5 of ``drink_nodes``).
        A label with no data yields an empty Series.
    """
    label_count = 6   # labels per order type
    shared_count = 3  # NUMBER, SIZE, NONE exist for pizzas and drinks alike

    def _frames_at(nodes, position):
        # Tolerate the empty list extract_nodes returns when there were no
        # orders of that type.
        return [nodes[position]] if position < len(nodes) else []

    new_pizza_nodes = []
    for position in range(label_count):
        frames = _frames_at(pizza_nodes, position)
        if position < shared_count:
            frames = frames + _frames_at(drink_nodes, position)
        new_pizza_nodes.append(_clean_node_values(frames))

    # The shared labels were handled above; only drink-specific ones remain.
    new_drink_nodes = [
        _clean_node_values(_frames_at(drink_nodes, position))
        for position in range(shared_count, label_count)
    ]

    return new_pizza_nodes, new_drink_nodes

In [10]:
def one_hot_encoding(vocab):
    """Fit a scikit-learn ``OneHotEncoder`` on a one-dimensional collection.

    Args:
        vocab: Object exposing ``to_numpy()`` (e.g. a Series of tokens or of
            label names).

    Returns:
        The fitted encoder (not the encoded data).
    """
    # The encoder expects a 2-D input: one sample per row, a single feature.
    single_feature_column = vocab.to_numpy().reshape(-1,1)
    return OneHotEncoder().fit(single_feature_column)

In [None]:
def create_labeled_vocab(vocab: pd.DataFrame = None):
    """Attach an entity label and integer encodings to every vocabulary token.

    Args:
        vocab: DataFrame whose token column is named ``"0"``. When ``None``
            the vocabulary is read from ``vocab.csv``. The caller's frame is
            not modified.

    Returns:
        ``(vocab, vocab_encoder, label_encoder)`` where ``vocab`` has the
        columns ``tokens``, ``labels``, ``encoded_tokens``,
        ``encoded_labels_array`` (one-hot label vector) and ``encoded_labels``
        (label id), and the two encoders are the fitted one-hot encoders for
        tokens and for label names.

    Side effects:
        Reads ``./labels/<name>.csv`` for every CSV-backed label and writes the
        resulting table to ``labeled entities.csv``.
    """
    if vocab is None:
        vocab = pd.read_csv("vocab.csv")
    else:
        # The steps below append a row, add a column and rename columns;
        # work on a copy so the caller's frame is left untouched.
        vocab = vocab.copy()

    # add unknown for future when testing eval set
    vocab.loc[-1] = "unk"
    vocab = vocab.reset_index(drop=True)

    # Every token starts as "none" and is re-labelled below when it belongs
    # to one of the label vocabularies.
    vocab["1"] = "none"

    csv_file_names = ["number", "size", "none","topping","quantity","style","drink_type","container_type","volume"]
    labels = [
        pd.read_csv(f"./labels/{csv}.csv").iloc[:,0].str.strip()
        for csv in csv_file_names
    ]
    # "pizza" and the negation words are not bracketed during preprocessing,
    # so they are listed by hand to keep them out of the none set.
    labels.append(pd.Series("pizza"))
    labels.append(pd.Series(["hold","without","no","avoid","hate","ani"]))
    csv_file_names.extend(["pizza","neg"])

    none_position = csv_file_names.index("none")

    # A word that belongs to any real label must not also count as "none".
    for i in range(len(labels)):
        if i != none_position:
            remaining = labels[none_position][~labels[none_position].isin(labels[i])]
            labels[none_position] = remaining.dropna().reset_index(drop=True)

    # Number words take priority over every other label.
    for i in range(1, len(labels)):
        labels[i] = labels[i][~labels[i].isin(labels[0])]

    # Later labels overwrite earlier ones when a token is in several lists.
    for i in range(len(labels)):
        vocab.loc[vocab["0"].isin(labels[i]),"1"] = csv_file_names[i]

    # Encoder for the tokens themselves.
    # NOTE(review): token ids start at 0, while tokenization pads with 0 and
    # RNN.forward maps ids <= 0 to 0 - confirm the token that receives id 0
    # cannot be confused with padding.
    vocab_encoder = one_hot_encoding(vocab[vocab.columns[0]])

    # this will be used as our target outputs
    label_encoder = one_hot_encoding(pd.Series(csv_file_names))

    encoded_tokens = vocab_encoder.transform(vocab["0"].to_numpy().reshape(-1,1))
    encoded_labels = label_encoder.transform(vocab["1"].to_numpy().reshape(-1,1))
    vocab = vocab.rename(columns={"0": "tokens","1": "labels"})

    # One row of the one-hot matrix per token: keep the position of the 1 as
    # the integer id, and for labels also the full one-hot vector.
    vocab["encoded_tokens"] = pd.Series([x.toarray().argmax(axis=1)[0] for x in encoded_tokens])
    vocab["encoded_labels_array"] = pd.Series([x.toarray()[0] for x in encoded_labels])
    vocab["encoded_labels"] = pd.Series([x.toarray().argmax(axis=1)[0] for x in encoded_labels])

    # write for future purposes instead of going through this loop again
    vocab.to_csv("labeled entities.csv",index=False)

    return vocab, vocab_encoder, label_encoder

In [None]:
class conversions():
    """Lookup tables between tokens, token ids, label ids and label names.

    Built from a labelled vocabulary table with the columns ``tokens``,
    ``encoded_tokens``, ``encoded_labels_array``, ``encoded_labels`` and
    ``labels``.
    """

    # Length of the all-zero label vector returned for unknown words.
    NUM_LABELS = 11

    def __init__(self,vocab):
        self.token_to_id = dict(zip(vocab["tokens"], vocab["encoded_tokens"]))
        self.token_to_label = dict(zip(vocab["tokens"], vocab["encoded_labels_array"]))

        self.id_to_token = dict(zip(vocab["encoded_tokens"],vocab["tokens"]))
        self.id_to_label = dict(zip(vocab["encoded_labels"],vocab["labels"]))

    def word2id(self,word):
        """Return the id of ``word``, or ``None`` when it is not in the vocabulary."""
        return self.token_to_id.get(word, None)

    def word2labels(self,word):
        """Return the label vector of ``word``; unknown words get an all-zero vector."""
        return self.token_to_label.get(word, np.zeros(self.NUM_LABELS))

    def id2token(self,number):
        """Return the token for an id (anything ``int()`` accepts), or ``"unk"``."""
        return self.id_to_token.get(int(number),"unk")

    def id2label(self,number):
        """Return the label name for a label id, or ``None`` when unknown."""
        # Normalize to a plain int like id2token does: a tensor or 0-d array
        # holding the id would otherwise never match the dictionary keys.
        try:
            key = int(number)
        except (TypeError, ValueError):
            key = number
        return self.id_to_label.get(key, None)

In [16]:
class SimpleDataset(Dataset):
    """Pairs every input sequence with its labels.

    Subclassing PyTorch's ``Dataset`` lets a ``DataLoader`` take care of
    batching and the rest of the data preparation.
    """

    def __init__(self, input_indices, labels):
        # Both containers are stored as given; they only need to support
        # len() and indexing.
        self.input_indices, self.labels = input_indices, labels

    def __len__(self):
        # The dataset size is driven by the inputs alone.
        return len(self.input_indices)

    def __getitem__(self, idx):
        sample = self.input_indices[idx]
        target = self.labels[idx]
        return sample, target

In [13]:
def to_pass_size_as_arg(size):
    """Build a one-hot encoder for vectors of a fixed length.

    The length is bound through a closure so the returned function takes only
    the index, which makes it usable wherever a one-argument callable is
    expected.

    Args:
        size: Length of the one-hot vectors (the number of classes).

    Returns:
        A function ``integer_to_one_hot(index)`` that returns a float32 tensor
        of length ``size`` that is 1 at position ``int(index)`` and 0 elsewhere.
    """
    def integer_to_one_hot(index):
        # Start from all zeros, then switch on the requested position.
        vector = torch.zeros(size, dtype=torch.float32)
        vector[int(index)] = 1
        return vector
    return integer_to_one_hot

In [None]:
class RNN(nn.Module):
    """Embedding -> single-layer LSTM -> linear tagger over token-id sequences.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        num_classes: Number of output classes per token.
        hidden_size: Size of the LSTM hidden state.
        embedding_dim: Width of the token embeddings (defaults to 100).
    """
    def __init__(self, input_size, num_classes, hidden_size, embedding_dim=100):
        super(RNN,self).__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size,embedding_dim=embedding_dim).to(device)
        self.input_size = input_size
        self.hidden_size = hidden_size
        # batch_first = True means that batch is the first dimension
        # shape : batch, seq, embedding_dim
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_size, batch_first=True).to(device)
        # linear layer : from the LSTM hidden state to the class scores
        self.fc = nn.Linear(hidden_size, num_classes).to(device)

    def forward(self, input):
        """Return per-token class probabilities.

        Args:
            input: Integer tensor of token ids, shape (batch, seq).

        Returns:
            Tensor of shape (batch, seq, num_classes); the last dimension is
            softmax-normalized.
            NOTE(review): these are probabilities, not logits - if the
            training loss applies its own softmax (e.g. nn.CrossEntropyLoss)
            this would be applied twice; confirm against the training loop.
        """
        # Ids that are not strictly positive are mapped to 0 before the lookup.
        input = torch.clamp(input, min=0)

        embed = self.embedding(input)
        # No explicit (h_0, c_0): the LSTM starts from all-zero states when
        # none are given, and it creates them on the same device as its
        # input, so forward no longer depends on the global `device`.
        out, _ = self.lstm(embed)
        # size of out = batch, seq_length, hidden_size
        out = self.fc(out)
        probabilities = torch.softmax(out, dim=2)

        return probabilities