In [1]:
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
# Show all rows
pd.set_option('display.max_rows', None)

# Show all columns
pd.set_option('display.max_columns', None)

In [3]:
# Read a dataset from the path of a JSON-lines file.
# input: JSON file -> output: list with one pandas Series per JSON column
# (4 for this dataset: sentence, EXR, TOP, TOP_DECOUPLED), each holding one entry per record.
# We use this to read the training / evaluation / test datasets.
def read_dataset(path: str):
    """Load a JSON-lines file and return its columns as a list of Series.

    Args:
        path: Location of the JSON-lines file (one JSON object per line).

    Returns:
        A list holding one ``pd.Series`` per column of the file, in the
        same order as the columns of the parsed frame.
    """
    frame = pd.read_json(path, lines=True)
    # Keep each column as its own Series; callers unpack the list by position.
    return [frame[column_name] for column_name in frame.columns.tolist()]

In [4]:
# The function takes a Series of sentence strings -> returns a Series of normalized sentence strings
def text_normalization(sentences):
    """Normalize raw order sentences into a canonical lowercase form.

    The following rewrites are applied, in this order, to every string:

    1. Lowercase everything.
    2. "<word> size(d)" / "<word>-size(d)" -> "<word>-size" and
       "<word> free" / "<word>-free" -> "<word>-free". A bare "size(d)" or
       "free" with no word in front of it is left without a leading hyphen.
    3. The whole words "pie" / "pies" -> "pizza" / "pizzas" (we only sell
       pizza). Words that merely contain "pie" (e.g. "piece") are untouched.
    4. Every run of digits -> the quantity flag "quta".
    5. "Less" phrases (little bit of, not much/many, just a bit/little,
       tiny bit) -> "light".
    6. "a lot of" -> "extra".
    7. The standalone article "a" -> "one".
    8. Negation phrases (without, hold the, avoid) -> "no".

    Args:
        sentences: ``pd.Series`` of sentence strings.

    Returns:
        A new ``pd.Series`` of normalized sentence strings; the input Series
        is not modified.
    """
    def _hyphenate(match):
        # Join "<prefix>" and the keyword with a single hyphen. The prefix
        # group may be empty (e.g. a sentence starting with "size"); in that
        # case emit the bare keyword instead of a dangling "-size".
        prefix, keyword = match.group(1), match.group(2)
        return f"{prefix}-{keyword}" if prefix else keyword

    # convert words to lower
    sentences = sentences.str.lower()

    # Sizes are written in many ways: party-sized, party size, lunch - sized ...
    # Normalize all of them to one format A-B: party-size.
    pattern_size = r'''(?xm) # multiline verbose flag to enable comments
        \b([a-z]*)[-\s]*(size)(?:d)?\b # letters, then any spaces / "-", then "size";
                                       # the optional trailing "d" is matched but not captured
        '''
    sentences = sentences.str.replace(pattern_size, _hyphenate, regex=True)

    # gluten-free / gluten free -> gluten-free
    pattern_free = r'''(?xm)
        \b([a-z]*)[-\s]*(free)\b
        '''
    sentences = sentences.str.replace(pattern_free, _hyphenate, regex=True)

    # Sometimes a pizza is called a pie: WE ONLY SELL PIZZA.
    # Word boundaries keep words such as "piece" intact, and regex=True is
    # passed explicitly because the pandas default for `regex` is
    # version-dependent.
    sentences = sentences.str.replace(r"\bpie(s?)\b", r"pizza\1", regex=True)

    # Quantities: 3 pies, 250 ml, 552 ... are all replaced by the lowercase
    # flag "quta" (short for quantity).
    pattern_digits = r"[0-9]+"
    sentences = sentences.str.replace(pattern_digits, "quta", regex=True)

    # Phrases that mean "less of a topping" are mapped to the standard
    # quantity "light": little bit of, not much, not many, just a bit,
    # just a little, tiny bit.
    less_quantities = [r"\blittle\sbit\sof\b", r"\bnot\sm(?:uch|any)\b", r"\bjust\sa\s(bit|little)\b", r"\btiny\sbit\b"]
    for phrase in less_quantities:
        sentences = sentences.str.replace(phrase, "light", regex=True)

    # Phrases that mean "much": a lot of.
    more_quantities = [r"\ba\slot\sof\b"]  # any extra that doesn't have large after
    for phrase in more_quantities:
        sentences = sentences.str.replace(phrase, "extra", regex=True)

    # A quantity like "a" pizza is converted to "one". This runs after the
    # multi-word phrases above, which themselves contain the article "a".
    sentences = sentences.str.replace(r"\ba\b", "one", regex=True)

    # Negation comes in several forms: without, hold the, with no(t), no,
    # avoid. The multi-word / complex forms are collapsed to "no", which the
    # tokenizer is meant to use to negate the toppings that follow.
    negation_words = [r"\bwithout\b", r"\bhold\sthe\b", r"\bavoid\b"]
    for phrase in negation_words:
        sentences = sentences.str.replace(phrase, "no", regex=True)

    # More normalization techniques may be added (or some removed) later.
    # TODO: BBQ toppings probably need to be paired with what follows; they
    # are always written as bbq_chicken, bbq_sauce, bbq_pulled_pork, ...
    return sentences

In [5]:
# The function takes a Series of sentence strings -> returns a set of vocabulary words and a DataFrame of tokens (one row per sentence, one token per column)
def tokenization(sentences):
    """Split normalized sentences into tokens and collect the vocabulary.

    Contractions are split before tokenizing, so "i'd" becomes the two
    tokens "i" and "'d" (the apostrophe stays on the second piece). The
    vocabulary is built from the text after that split, so it contains
    exactly the tokens that appear in the returned frame (padding aside)
    and no apostrophe-prefixed copies of ordinary words.

    Args:
        sentences: ``pd.Series`` of sentence strings, words separated by a
            single space.

    Returns:
        A tuple ``(vocab, tokens)`` where ``vocab`` is a ``set`` of unique
        tokens and ``tokens`` is a ``pd.DataFrame`` with one row per
        sentence and one token per column; shorter sentences are padded
        on the right with empty strings.
    """
    # "i'd" -> "i 'd": detach the contraction suffix but keep its apostrophe.
    apostrophe_pattern = r"\b([a-z]*)'([a-z]*)\b"
    split_sentences = sentences.str.replace(apostrophe_pattern, r"\1 '\2", regex=True)

    # Merge the whole series into one string so extracting the unique words
    # is a single split instead of one per sentence.
    vocab = set(' '.join(split_sentences).split(" "))

    # expand=True spreads the tokens over DataFrame columns; missing cells of
    # shorter sentences are filled with "".
    tokens = split_sentences.str.split(" ", expand=True).fillna("")
    # negation check regex : \b(?<=not?)(.*?)(?=(\.|,|$|and))\b (for the future maybe ?)
    return vocab, tokens

In [5]:
# Load the training set. read_dataset returns one Series per JSON column, so
# this unpacking only works when the file has exactly four columns.
sentences, parsed_tree, structured_sentence, decoupled_structured_sentence = read_dataset("./PIZZA_train.json")

In [7]:
'''
now we talk about the data preprocessing
What we should take into consideration? 
1- Word Normalization
2- Word Tokenization
Why we won't use Sentence segmentation? It's useless, orders are one sentence question no clear punctuation exist
'''
# NORMALIZATION
# text_normalization is given a copy, so the raw `sentences` Series loaded
# above is kept as-is for later comparison.
normalized_sentence = text_normalization(sentences.copy())

In [8]:
# TOKENIZATION
# Initial tokenization; a better implementation may be needed later.
# `vocab` is the set of unique words, `tokenized_sentence` is the frame of
# tokens (one row per sentence) returned by tokenization.
vocab, tokenized_sentence = tokenization(normalized_sentence.copy())


In [3]:
# NOTE(review): this rebinds `parsed_tree`, which the PIZZA_train.json loading
# cell also assigns; after this cell the name holds the contents of EXR.csv.
parsed_tree = pd.read_csv("EXR.csv")

In [38]:
# To extract every node, walk the tree string and match each "(NODE ...)" group.
# The problem is that a regex cannot tell which closing parenthesis belongs to which opening one,
# so first extract the body of ORDER from (ORDER (PIZZAORDER (...)) (DRINKORDER (...))), then DRINKORDER, then PIZZAORDER.
# This way DRINKORDER is matched as one "(DRINKORDER .*" span, and once that part is cut off PIZZAORDER is unambiguous,
# so every remaining leaf node can be matched as one innermost "( ... )" pair.
order_pattern = r"(?<=\(ORDER).*(?=\))"
drink_order_pattern = r"(\(DRINKORDER).*(?=\))"
node_pattern = r"(?<=\().*?(?=\))"


def split_order_tree(tree):
    """Split one EXR tree string into its ORDER body, pizza part, drink part and pizza nodes.

    Args:
        tree: String such as
            "(ORDER (PIZZAORDER (NUMBER 1 ) ) (DRINKORDER (NUMBER 2)))".

    Returns:
        A tuple ``(order, pizza_order, drink_order, nodes)``:
        ``order`` is the stripped text between "(ORDER" and the last ")";
        ``pizza_order`` is the text before "(DRINKORDER" (or all of ``order``
        when there is no drink order) without its opening parenthesis;
        ``drink_order`` is the matched "(DRINKORDER ..." text, or ``None``;
        ``nodes`` is the list of innermost "( ... )" contents found in
        ``pizza_order``.

    Raises:
        ValueError: if ``tree`` contains no "(ORDER ...)" node.
    """
    order_match = re.search(order_pattern, tree)
    if order_match is None:
        raise ValueError(f"no (ORDER ...) node found in: {tree!r}")
    order_body = order_match.group().strip()

    drink_match = re.search(drink_order_pattern, order_body)
    if drink_match is not None:
        pizza_part = order_body[:drink_match.start()]
        drink_part = drink_match.group()
    else:
        pizza_part = order_body
        drink_part = None

    # Drop the opening "(" of "(PIZZAORDER" in BOTH cases. If it is kept, the
    # lazy node pattern starts at that parenthesis and returns
    # "PIZZAORDER (NUMBER 1 " as the first node, swallowing the NUMBER node.
    pizza_part = pizza_part[1:]

    # finditer always returns an iterator (never None), so no None check is needed.
    nodes = [node.group() for node in re.finditer(node_pattern, pizza_part)]
    return order_body, pizza_part, drink_part, nodes


x= "(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING BBQ_PULLED_PORK ) )(DRINKORDER (NUMBER 2)))"
order, pizza_order, drink_order, nodes_list = split_order_tree(x)
print(pizza_order)
print(nodes_list)
# Nodes can be:
# for PIZZAORDER: NUMBER, SIZE, TOPPING, COMPLEX_TOPPING, NOT
# for DRINKORDER: DRINKTYPE, CONTAINERTYPE, VOLUME
# TODO: extract the DRINKORDER nodes as well (not implemented yet)


PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING BBQ_PULLED_PORK ) )
['NUMBER 1 ', 'SIZE LARGE ', 'TOPPING BBQ_PULLED_PORK ']
