In [None]:
import pandas as pd
import numpy as np
import nltk
import re

In [337]:
# Remove both display limits in one call: a value of None means
# "no limit", so frames are rendered with every row and every column.
pd.set_option('display.max_rows', None, 'display.max_columns', None)

In [None]:
def read_dataset(path: str):
    """Load a JSON-lines dataset and hand back its columns one by one.

    The same reader is meant for the training, evaluation and test files.

    Args:
        path: Location of a JSON file that stores one record per line.

    Returns:
        A list holding one ``pd.Series`` per column of the file, in the
        file's column order. For the PIZZA files this is presumably
        (sentence, EXR, TOP, TOP_DECOUPLED) -- confirm against the data.
    """
    frame = pd.read_json(path, lines=True)
    # Kept as a plain list of Series for now; whether pandas or numpy is
    # the more convenient container downstream is still undecided.
    return [frame[column_name] for column_name in frame.columns.tolist()]

In [339]:
def text_normalization(sentences):
    """Rewrite order text so that equivalent phrasings share one spelling.

    The rules run in a fixed order and several rely on earlier ones (for
    example "in a can" only collapses to "can" after "a" has been turned
    into "one" and then into "1"), so keep the order when adding rules.

    Args:
        sentences: ``pd.Series`` of strings.

    Returns:
        A new ``pd.Series`` of lower-cased, normalized strings.
    """
    def substitute_all(series, rules):
        # Apply each (pattern, replacement) regex rule in turn.
        for pattern, replacement in rules:
            series = series.str.replace(pattern, replacement, regex=True)
        return series

    sentences = sentences.str.lower()

    sentences = substitute_all(sentences, [
        # Sizes come in many shapes: "party sized", "party size",
        # "lunch - sized", ...  All become "<word>_size"; the trailing "d"
        # is matched but left out of the groups so it is dropped.
        (r"\b([a-z]*)[-\s]*(size)(?:d)?\b", r"\1_\2"),
        # "gluten free" / "gluten-free" -> "gluten_free".
        (r"\b([a-z]*)[-\s]*(free)\b", r"\1_\2"),
        # A pizza is sometimes called a pie: we only sell pizza. The
        # letter look-arounds keep words that merely contain "pie"
        # (e.g. "piece") intact; a plain substring replace would turn
        # them into "pizzace". The optional plural "s" is carried over.
        (r"(?<![a-z])pie(s?)(?![a-z])", r"pizza\1"),
        # Small quantities -> "light": not much, not many, a little,
        # a little bit of, only/just a little, a bit, just a (tiny) bit (of).
        (r"\bnot\sm(?:uch|any)\b", "light"),
        (r"\b(only\s|just\s)?a\slittle(\sbit\sof)?\b", "light"),
        (r"\b(just\s)?a\s(tiny\s)?bit(\sof)?\b", "light"),
        # Large quantities -> "extra": a lot of, lots of.
        (r"\ba\slot\sof\b", "extra"),
        (r"\blots\sof\b", "extra"),
        # The articles "a"/"an" count as "one"; "only one"/"just one" too.
        (r"\ban?\b", "one"),
        (r"\b(?:only|just)\sone\b", "one"),
    ])

    # Quantities appear as digits and as words ("3 pies", "three pies"):
    # spell every number word from one to sixteen as digits.
    word_to_num = {
        r"\bone\b": "1", r"\btwo\b": "2", r"\bthree\b": "3", r"\bfour\b": "4", r"\bfive\b": "5",
        r"\bsix\b": "6", r"\bseven\b": "7", r"\beight\b": "8", r"\bnine\b": "9", r"\bten\b": "10",
        r"\beleven\b": "11", r"\btwelve\b": "12", r"\bthirteen\b": "13", r"\bfourteen\b": "14", r"\bfifteen\b": "15",
        r"\bsixteen\b": "16"
    }
    sentences = sentences.replace(regex=list(word_to_num.keys()), value=list(word_to_num.values()))

    sentences = substitute_all(sentences, [
        # Negation: "without", "hold the" and "avoid" all become "no", so a
        # later stage only has to look for that single marker.
        (r"\bwithout\b", "no"),
        (r"\bhold\sthe\b", "no"),
        (r"\bavoid\b", "no"),
        # Crude plural stemming ("pepsis" -> "pepsi"): strip a final "s"
        # from words of three or more characters.
        # NOTE(review): the greedy \w\w+ always absorbs a trailing "e"
        # itself, so the "e?" never removes anything and "zeroes" ends up
        # as "zeroe" -- which seems to be why the "zeroe" rule below exists.
        (r"\b(\w\w+)e?s\b", r"\1"),
        # "san pellegrino" is sometimes written as just "pellegrino".
        (r"\bsan\s(pellegrino)\b", r"\1"),
        # "zeroe" -> "zero".
        (r"\b(zero)e\b", r"\1"),
        # "iced" -> "ice".
        (r"\b(ice)d\b", r"\1"),
        # Doctor Pepper: "dr" -> "doctor", "peper" -> "pepper".
        (r"\bdr\b", r"doctor"),
        (r"\bpeper\b", r"pepper"),
        # "in a can" (by now "in 1 can") and "in can" -> "can"; same for bottle.
        (r"\bin\s(1\s)?can\b", "can"),
        (r"\bin\s(1\s)?bottle\b", "bottle"),
        # Volumes such as "200-milliliter" -> "200 milliliter".
        (r"\b([0-9]+)\s*-\s*(\w+)\b", r"\1 \2"),
    ])
    # TODO: BBQ toppings always seem to come paired (bbq_chicken, bbq_sauce,
    # bbq_pulled_pork, ...). Merging them here may be an oversimplification;
    # for now the sequence model is left to decide. To be revisited.
    return sentences

In [340]:
def tokenization(sentences):
    """Build a vocabulary and split every sentence into a row of tokens.

    Contractions are split at the apostrophe, which stays attached to the
    second part: "i'd" becomes "i" and "'d".

    Args:
        sentences: ``pd.Series`` of strings (already normalized).

    Returns:
        A tuple ``(vocab, tokens)``:
        * ``vocab`` -- set of the space-separated words of all sentences,
          plus the parts of every word that contains an apostrophe (the
          unsplit word is kept in the set as well);
        * ``tokens`` -- DataFrame with one row per sentence and one token
          per column; shorter sentences are padded with "".
    """
    # Join everything into one string first so the vocabulary is collected
    # with a single split.
    vocab = set(' '.join(sentences).split(" "))

    # Add the two halves of contractions (i'd -> i, 'd). Words without an
    # apostrophe are skipped: splitting them returns the word itself, and
    # prefixing that with "'" would add a bogus "'word" entry for every
    # vocabulary item.
    contraction_parts = set()
    for word in vocab:
        if "'" not in word:
            continue
        pieces = word.split("'")
        # The last piece keeps its apostrophe: 'd
        pieces[-1] = "'" + pieces[-1]
        contraction_parts.update(pieces)
    vocab.update(contraction_parts)

    # Split contractions inside the sentences the same way, then expand
    # each sentence into one column per token.
    pattern = r"\b([a-z]*)'([a-z]*)\b"
    sentences = sentences.str.replace(pattern, r"\1 '\2", regex=True)
    sentences = sentences.str.split(" ", expand=True)
    sentences = sentences.fillna("")
    # Possible negation-scope regex for later:
    # \b(?<=not?)(.*?)(?=(\.|,|$|and))\b
    return vocab, sentences

In [341]:
def extract_pizza_drinks(parsed_tree):
    """Pull the body of the PIZZAORDER and DRINKORDER nodes out of each tree.

    Args:
        parsed_tree: ``pd.Series`` of bracketed strings such as
            ``(ORDER (PIZZAORDER (NUMBER a ) ... ) (DRINKORDER ... ) )``.

    Returns:
        A tuple ``(pizza_orders, drink_orders)`` of ``pd.Series``. Each holds
        the text found inside the respective node, with rows that have no
        such node dropped and the index renumbered from 0 -- so the two
        series are no longer aligned with ``parsed_tree`` or each other.
    """
    # Two capture groups: what follows "(PIZZAORDER" and what follows
    # "(DRINKORDER", each a run of "( ... )" child nodes. The outer groups
    # are non-capturing so that str.extract yields exactly two columns.
    # The pattern starts and ends with a literal space.
    # NOTE(review): str.extract keeps only the first match per string, and
    # [^\)]+ stops at the first ")", so additional orders in the same tree
    # and anything after a nested child node are not captured -- confirm
    # this is acceptable for the intended use.
    pizza_drink_order_patterns = r" (?:\(PIZZAORDER\s*((?:\([^\)]+\)\s*)*)\)\s*)?(?:\(DRINKORDER\s*((?:\([^\)]+\)\s*)*)\))? "
    extracted = parsed_tree.str.extract(pizza_drink_order_patterns)
    pizza_orders = extracted[0].dropna().reset_index(drop=True)
    drink_orders = extracted[1].dropna().reset_index(drop=True)
    return pizza_orders, drink_orders


In [342]:
def extract_nodes(pizza_orders,drink_orders):
    """Extract the value of each known attribute node from the order bodies.

    Args:
        pizza_orders: ``pd.Series`` of PIZZAORDER bodies.
        drink_orders: ``pd.Series`` of DRINKORDER bodies.

    Returns:
        A tuple ``(pizza_nodes, drink_nodes)`` of lists with one
        single-column DataFrame per attribute, in this order:
        NUMBER, SIZE, TOPPING, QUANTITY for pizzas and
        NUMBER, SIZE, DRINKTYPE, CONTAINERTYPE, VOLUME for drinks.
        A cell holds the text between "(<ATTRIBUTE>" and the next ")"
        (surrounding whitespace included) for the first occurrence in the
        row, or NaN when the attribute is absent.
    """
    def first_value_of(orders, attribute):
        # Lazy match of everything after "(<attribute>" up to the next ")".
        return orders.str.extract(r"(?<=\("+attribute+r")(.*?)(?=\))")

    pizza_nodes = [
        first_value_of(pizza_orders, name)
        for name in ("NUMBER","SIZE","TOPPING","QUANTITY")
    ]
    drink_nodes = [
        first_value_of(drink_orders, name)
        for name in ("NUMBER","SIZE","DRINKTYPE","CONTAINERTYPE","VOLUME")
    ]
    return pizza_nodes, drink_nodes

In [343]:
def clean_extracted_nodes(pizza_nodes, drink_nodes):
    """Turn the raw extracted node frames into de-duplicated value lists.

    The frames coming out of the extraction step are full of NaNs and
    repeated values. Here the NaNs are dropped, the text is normalized with
    ``text_normalization`` and duplicates are removed, so that the result
    can be used for labelling.

    Args:
        pizza_nodes: list of single-column DataFrames; positions 0-3 are read.
        drink_nodes: list of single-column DataFrames; positions 0-4 are read.

    Returns:
        A tuple ``(new_pizza_nodes, new_drink_nodes)``:
        * four ``pd.Series`` built from pizza positions 0-3, where positions
          0 and 1 (the ones shared with drinks) also include the values of
          the drink frames at the same positions;
        * three ``pd.Series`` built from drink positions 2-4.
        Every series holds unique, stripped, normalized strings.
    """
    def unique_normalized(values):
        # Drop exact duplicates first so normalization runs on less data.
        values = values.drop_duplicates(keep='first')
        values = text_normalization(values)
        # Strip BEFORE the final de-duplication; otherwise values that
        # differ only in surrounding whitespace survive as duplicates.
        values = values.str.strip()
        return values.drop_duplicates(keep='first').reset_index(drop=True)

    new_pizza_nodes, new_drink_nodes = [], []
    for i in range(0, 4):
        node = pizza_nodes[i].dropna().reset_index(drop=True)
        if i < 2:
            # The first two positions exist for drinks as well: pool them.
            node = pd.concat(
                [node, drink_nodes[i].dropna().reset_index(drop=True)],
                axis=0, ignore_index=True,
            )
        new_pizza_nodes.append(unique_normalized(node.iloc[:, 0]))
    for i in range(2, 5):
        node = drink_nodes[i].dropna().reset_index(drop=True)
        new_drink_nodes.append(unique_normalized(node.iloc[:, 0]))
    return new_pizza_nodes, new_drink_nodes

In [344]:
# Load the training file. read_dataset returns one Series per JSON column;
# the unpacking below requires the file to have exactly four columns.
sentences, parsed_tree, structured_sentence, decoupled_structured_sentence = read_dataset("./PIZZA_train.json")

In [345]:
'''
Data preprocessing.
What should we take into consideration?
1- Word normalization
2- Word tokenization
Why don't we use sentence segmentation? It would be useless here: each order is a single sentence, and no clear punctuation exists.
'''
# NORMALIZATION
# Normalize a copy of the raw sentences and keep the result under its own
# name, so that `sentences` itself stays as loaded.
normalized_sentence = text_normalization(sentences.copy())

In [346]:
# TOKENIZATION
# Initial tokenization; a better implementation may be needed later.
# Yields the vocabulary (a set) and the token table (one row per sentence).
vocab, tokenized_sentence = tokenization(normalized_sentence.copy())


In [350]:
# Persist the vocabulary and the token table for later stages.
# `vocab` is a set of strings, whose iteration order can differ from one
# kernel session to the next (string hashing is randomized per process),
# so it is sorted to make vocab.csv identical on every run.
vocab_as_series = pd.Series(sorted(vocab))
vocab_as_series.to_csv("vocab.csv",index=False)
tokenized_sentence.to_csv("tokenized_sentences.csv",index=False)

In [353]:
# These intermediates have been written to disk above; unbind the names so
# the kernel can release their memory.
del tokenized_sentence, vocab_as_series, normalized_sentence

In [None]:
# Extract the PIZZAORDER / DRINKORDER bodies from the fourth column of the
# training file (a copy is passed in).
pizza_orders, drink_orders = extract_pizza_drinks(decoupled_structured_sentence.copy())


In [None]:
# Extract the attribute nodes per order, then clean them. The second line
# rebinds the same two names: after it they hold the cleaned series lists,
# not the raw extraction frames.
pizza_nodes, drink_nodes = extract_nodes(pizza_orders,drink_orders)
pizza_nodes, drink_nodes = clean_extracted_nodes(pizza_nodes, drink_nodes)


In [None]:
# unfold the list
number, size, topping , quantity = pizza_nodes
drink_type, container_type, volume = drink_nodes
del pizza_nodes
del drink_nodes