In [3]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer
from nltk.tokenize import TreebankWordTokenizer
from num2words import num2words

In [4]:
# Lift pandas' display limits so every row and every column is printed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [5]:
def read_dataset(path: str):
    """Read a JSON-lines dataset and split it into its columns.

    Used for the training / evaluation / test files alike.

    Parameters
    ----------
    path : str
        Location of the JSON-lines file (one JSON object per line).

    Returns
    -------
    list of pandas.Series
        One Series per column, in the column order of the loaded frame.
    """
    frame = pd.read_json(path, lines=True)
    # Kept as a list of pandas Series for now rather than converting to numpy.
    return [frame[column_name] for column_name in frame.columns.tolist()]

In [6]:
def pre_text_normalization(sentences, flag=-1):
    """Normalize a Series of order sentences with a fixed sequence of regex rewrites.

    Parameters
    ----------
    sentences : pandas.Series of str
        Sentences to normalize. The Series is not modified; a new one is returned.
    flag : int, default -1
        The text is lower-cased only when ``flag == -1``. All rules below are
        case-sensitive and written for lower-case text.

    Returns
    -------
    pandas.Series of str
        The normalized sentences, same index as the input.

    Notes
    -----
    Deliberately left untouched for now: "gluten - free" style wording,
    "little"/"a lot of" style quantities, negation wording ("without",
    "hold the", "no", "avoid") and BBQ topping compounds - the sequence model is
    expected to learn those from context.
    """
    if flag == -1:
        sentences = sentences.str.lower()

    # SIZES: no single format is enforced ("party - size" vs "party size");
    # hyphens are simply turned into spaces ...
    sentences = sentences.str.replace(r"-", " ", regex=True)
    # ... and every run of whitespace is collapsed to one space. Matching only
    # pairs would leave a double space behind for "a - b".
    sentences = sentences.str.replace(r"\s{2,}", " ", regex=True)

    # "pie" is sometimes used for pizza (only pizza is sold). The word
    # boundaries keep words that merely contain "pie" (e.g. "piece") intact;
    # a plural "s" is carried over and removed by the plural rule below.
    sentences = sentences.str.replace(r"\bpie(s?)\b", r"pizza\1", regex=True)

    # QUANTITIES: the article "a"/"an" becomes "one" unless it introduces one
    # of the amount phrases "a bit", "a tiny", "a lot", "a little".
    sentences = sentences.str.replace(r"\ban?(?!\s+(bit|tiny|lot|little))\b", "one", regex=True)
    # "only one" / "just one" -> "one"
    sentences = sentences.str.replace(r"\b(?:only|just)\sone\b", "one", regex=True)

    # Digits are spelled out ("3" -> "three").
    # NOTE(review): the num2words output is inserted verbatim after the hyphen
    # rule has already run - confirm whether its output needs the same cleanup.
    sentences = sentences.str.replace(
        r"\b([0-9]+)\b",
        lambda match: num2words(int(match.group(1))),
        regex=True,
    )

    # DRINKS: crude plural stripping ("pepsis" -> "pepsi"); removes a trailing
    # "s" from every word of three or more characters.
    sentences = sentences.str.replace(r"\b(\w\w+)e?s\b", r"\1", regex=True)
    # "san pellegrino" is sometimes written just "pellegrino"
    sentences = sentences.str.replace(r"\bsan\s(pellegrino)\b", r"\1", regex=True)
    # "zeroes" is left as "zeroe" by the plural rule above
    sentences = sentences.str.replace(r"\b(zero)e\b", r"\1", regex=True)
    # "iced" -> "ice"
    sentences = sentences.str.replace(r"\b(ice)d\b", r"\1", regex=True)
    # Doctor Pepper: "dr" -> "doctor", "peper" -> "pepper"
    sentences = sentences.str.replace(r"\bdr\b", r"doctor", regex=True)
    sentences = sentences.str.replace(r"\bpeper\b", r"pepper", regex=True)

    return sentences

In [7]:
# Stemmer
def snow_ball_stemmer(vocab):
    """Stem words with NLTK's English Snowball stemmer.

    ``vocab`` is either a ``set`` of words, in which case a new set of stems is
    returned, or an object with ``.apply`` (a Series of token lists in this
    notebook), in which case every token list is mapped to its list of stems.
    """
    snowball = SnowballStemmer("english")
    if not isinstance(vocab, set):
        # Series branch: stem token by token, keeping one list per row.
        return vocab.apply(lambda tokens: [snowball.stem(token) for token in tokens])
    return {snowball.stem(word) for word in vocab}

In [8]:
def tokenization(sentences):
    """Tokenize sentences with the Penn Treebank tokenizer.

    Parameters
    ----------
    sentences : pandas.Series of str
        Sentences to tokenize. Missing values are treated as empty sentences.

    Returns
    -------
    vocab : set of str
        Every distinct token that occurs in ``tokens``.
    tokens : pandas.Series of list of str
        One token list per input sentence, same index as the input.

    Notes
    -----
    The vocabulary is derived from the per-sentence tokens, so the two return
    values always agree. Building it from one big joined string, before the
    apostrophe rewrite below, can yield entries that never appear in the
    returned token lists.
    """
    tokenizer = TreebankWordTokenizer()

    # Fill missing values up front so the tokenizer never sees NaN.
    sentences = sentences.fillna("")

    # Split contractions at the apostrophe ("i'd" -> "i 'd") before tokenizing.
    pattern = r"\b([a-z]*)'([a-z]*)\b"
    sentences = sentences.str.replace(pattern, r"\1 '\2", regex=True)
    sentences = sentences.apply(tokenizer.tokenize)

    # Keep the unique tokens.
    vocab = {token for token_list in sentences for token in token_list}

    # negation check regex : \b(?<=not?)(.*?)(?=(\.|,|$|and))\b (for the future maybe ?)
    return vocab, sentences

In [27]:
def _wrap_loose_words_in_none(orders):
    """Wrap text lying between a ')' and the next '(' in a ``(NONE ...)`` node, then drop blank NONE nodes."""
    orders = orders.replace(r"(?<=\))(.*?)(?=\()", r"(NONE \1)", regex=True)
    return orders.replace(r"\(NONE\s*\)", "", regex=True)


def extract_pizza_drinks(parsed_tree):
    """Pull the PIZZAORDER / DRINKORDER contents and the loose words out of parse trees.

    Parameters
    ----------
    parsed_tree : pandas.Series of str
        Bracketed trees of the form ``(ORDER ... (PIZZAORDER ...) (DRINKORDER ...) )``.

    Returns
    -------
    pizza_orders : pandas.Series of str
        Inner text of every matched ``(PIZZAORDER ...)``, re-indexed from 0.
    drink_orders : pandas.Series of str
        Inner text of every matched ``(DRINKORDER ...)``, re-indexed from 0.
    none_words : pandas.Series of str
        Stripped text that directly follows ``(ORDER`` or a ``)`` and is itself
        followed by whitespace or ``(``; entries may be empty strings.

    In both order Series, text found between a ``)`` and the following ``(`` is
    wrapped as ``(NONE text)``; NONE nodes holding only whitespace are removed.
    """
    # Words outside any labelled node: the text after "(ORDER " or after a ")".
    extracted_words_before_parsing = r"(?:(?:\(ORDER\s+)|(?:\)))([^()]+)(?=[\s(]+)"
    none_words = parsed_tree.str.extractall(extracted_words_before_parsing).iloc[:, 0].str.strip()
    none_words = none_words.dropna().reset_index(drop=True)

    # Two capture groups: the inside of an optional (PIZZAORDER ...) and the
    # inside of an optional (DRINKORDER ...). The outer groups are non-capturing
    # so that extractall only reports the two inner texts. Each inner text is a
    # repetition of: optional "(", one or more characters other than ")",
    # optional ")", optional whitespace.
    pizza_drink_order_patterns = r" (?:\(PIZZAORDER\s*((?:\(?[^\)]+\)?\s*)*)\)\s*)?(?:\(DRINKORDER\s*((?:\(?[^\)]+\)?\s*)*)\)\s*)? "
    extracted_PIZZA_DRINK = parsed_tree.str.extractall(pizza_drink_order_patterns)

    # A group that did not take part in a match is NaN in its column.
    pizza_orders = extracted_PIZZA_DRINK[0].dropna().reset_index(drop=True)
    drink_orders = extracted_PIZZA_DRINK[1].dropna().reset_index(drop=True)

    pizza_orders = _wrap_loose_words_in_none(pizza_orders)
    drink_orders = _wrap_loose_words_in_none(drink_orders)
    return pizza_orders, drink_orders, none_words


In [10]:
def extract_nodes(pizza_orders, drink_orders):
    """Extract the text of each labelled node from pizza and drink orders.

    For every attribute name, the first ``(ATTRIBUTE ...)`` occurrence in each
    order string is captured (the text between the attribute name and the next
    ``)``), giving one single-column DataFrame per attribute.

    Returns
    -------
    (pizza_nodes, drink_nodes) : tuple of lists of pandas.DataFrame
        Six frames each, in the attribute order listed below. A list is empty
        when ``np.any`` of the corresponding orders is falsy.
    """
    def collect(orders, attributes):
        # Nothing to extract when the orders hold no truthy value at all.
        if not np.any(orders):
            return []
        return [
            orders.str.extract(r"(?<=\(" + attribute + r")(.*?)(?=\))")
            for attribute in attributes
        ]

    pizza_nodes = collect(
        pizza_orders, ["NUMBER", "SIZE", "NONE", "TOPPING", "QUANTITY", "STYLE"]
    )
    drink_nodes = collect(
        drink_orders, ["NUMBER", "SIZE", "NONE", "DRINKTYPE", "CONTAINERTYPE", "VOLUME"]
    )
    return pizza_nodes, drink_nodes

In [11]:
def _normalize_unique(node):
    """Normalize a Series of node texts, strip them and keep the first occurrence of each value."""
    # Deduplicate first so that fewer strings go through the regex rewrites.
    node = node.drop_duplicates(keep='first')
    node = pre_text_normalization(node).str.strip()
    # Strip BEFORE the final deduplication: values that differ only in
    # surrounding whitespace would otherwise survive as duplicates.
    return node.drop_duplicates(keep='first').reset_index(drop=True)


def clean_extracted_nodes(pizza_nodes, drink_nodes):
    """Turn the raw frames from ``extract_nodes`` into clean, unique label lists.

    Parameters
    ----------
    pizza_nodes, drink_nodes : list of pandas.DataFrame
        Six single-column frames each, indexed by position. Positions 0-2 of
        both lists are merged (number, size and none per the original notes);
        positions 3-5 are specific to pizzas and to drinks respectively.

    Returns
    -------
    new_pizza_nodes : list of pandas.Series
        Six Series; the first three also contain the values found in drinks.
    new_drink_nodes : list of pandas.Series
        Three Series built from positions 3-5 of ``drink_nodes``.

    Every Series has its NaN rows dropped, is normalized with
    ``pre_text_normalization``, stripped, deduplicated (first occurrence wins)
    and re-indexed from 0.
    """
    new_pizza_nodes = []
    for position in range(6):
        parts = [pizza_nodes[position].iloc[:, 0].dropna()]
        if position < 3:  # shared between pizzas and drinks
            parts.append(drink_nodes[position].iloc[:, 0].dropna())
        merged = pd.concat(parts, axis=0, ignore_index=True)
        new_pizza_nodes.append(_normalize_unique(merged))

    new_drink_nodes = []
    for position in range(3, 6):
        column = drink_nodes[position].iloc[:, 0].dropna().reset_index(drop=True)
        new_drink_nodes.append(_normalize_unique(column))

    return new_pizza_nodes, new_drink_nodes

In [11]:
# Load ./PIZZA_train.json; read_dataset returns one Series per column of the
# file, unpacked here into four names (the file must have exactly four columns).
sentences, parsed_tree, structured_sentence, decoupled_structured_sentence = read_dataset("./PIZZA_train.json")

In [17]:
# Bind structured_sentence to the DataFrame read from ./TOP.csv.
# NOTE(review): this rebinds the name that the read_dataset unpacking also
# assigns - confirm that the CSV, not the JSON column, is the intended source.
structured_sentence = pd.read_csv("./TOP.csv")

In [12]:
# The bare string below is a free-text note; as an expression statement that is
# not the last line of the cell it has no effect at run time.
'''
now we talk about the data preprocessing
What we should take into consideration? 
1- Word Normalization
2- Word Tokenization
Why we won't use Sentence segmentation? It's useless, orders are one sentence question no clear punctuation exist
'''
# NORMALIZATION: run the rule-based normalizer on a copy of the raw sentences.
normalized_sentence = pre_text_normalization(sentences.copy())

In [20]:
# TOKENIZATION
# Initial tokenization; a better implementation may be needed later.
# Yields the vocabulary set and one token list per normalized sentence.
vocab, tokenized_sentence = tokenization(normalized_sentence.copy())


In [21]:
# Stem the vocabulary (set branch of snow_ball_stemmer) and every token list
# (Series branch); both names are rebound to the stemmed results.
vocab = snow_ball_stemmer(vocab)
tokenized_sentence = snow_ball_stemmer(tokenized_sentence)

In [22]:
# Write the stemmed vocabulary and the stemmed token lists to CSV files in the
# working directory, without the index column.
vocab_as_series = pd.Series(list(vocab))
vocab_as_series.to_csv("vocab.csv",index=False)
tokenized_sentence.to_csv("tokenized_sentences.csv",index=False)

In [23]:
# Unbind intermediates that have been written to disk or are not referenced
# again in the cells shown below.
del tokenized_sentence
del vocab_as_series
del normalized_sentence

In [24]:
# Unbind the dataset columns that the cells shown below do not use; only
# structured_sentence is kept.
del decoupled_structured_sentence
del parsed_tree
del sentences

In [21]:
# Keep only the "train.TOP" entry of structured_sentence.
# NOTE(review): the name is rebound to the selected value, so running this cell
# a second time indexes that result with "train.TOP" again - presumably it only
# works once per load; confirm.
structured_sentence = structured_sentence["train.TOP"]

In [40]:
# Split the trees into pizza-order texts, drink-order texts and the loose words
# found outside the labelled nodes; a copy is passed to the extractor.
pizza_orders, drink_orders, none_words = extract_pizza_drinks(structured_sentence.copy())

In [None]:
# Deduplicate and normalize the loose words, then tokenize them.
# Note: `vocab` is rebound here to the vocabulary of the loose words, replacing
# the sentence vocabulary computed earlier.
none_words = none_words.drop_duplicates()
none_words = pre_text_normalization(none_words)
none_words = none_words.reset_index(drop=True)
vocab, none_words = tokenization(none_words)

In [44]:
# Stem the loose-word vocabulary (set) and the loose-word token lists (Series).
vocab = snow_ball_stemmer(vocab)
none_words = snow_ball_stemmer(none_words)

In [47]:
# Extract the per-attribute node texts from the orders, then clean them.
# clean_extracted_nodes indexes positions 0-5 of both lists, so it needs both
# pizza and drink extraction to have produced their six frames.
pizza_nodes, drink_nodes = extract_nodes(pizza_orders,drink_orders)
pizza_nodes, drink_nodes = clean_extracted_nodes(pizza_nodes, drink_nodes)


In [48]:
# Unpack the cleaned lists into one name per label: six entries from the pizza
# list and three from the drink list, in the order clean_extracted_nodes built
# them. The list names are unbound afterwards.
number, size, none, topping , quantity, style = pizza_nodes
drink_type, container_type, volume = drink_nodes
del pizza_nodes
del drink_nodes

In [49]:
# Export, for every label kind, its stemmed vocabulary (./labels/<name>.csv)
# and its token lists (./tokens/<name>_tokens.csv).
labels = [number, size, none, topping, quantity, style, drink_type, container_type, volume]
csv_file_names = ["number", "size", "none","topping","quantity","style","drink_type","container_type","volume"]
# Vocabulary of the loose words, computed in the earlier cells; it is merged
# into the "none" label below.
none_vocab = vocab

# to_csv does not create missing directories, so make sure both targets exist.
Path("./labels").mkdir(parents=True, exist_ok=True)
Path("./tokens").mkdir(parents=True, exist_ok=True)

# NOTE(review): `vocab` is rebound inside the loop, so re-running this cell
# without re-running the cells above starts from a different `none_vocab`.
# NOTE(review): the vocabulary is stemmed but `tokens` is written unstemmed -
# confirm that this asymmetry is intended.
for label, csv in zip(labels, csv_file_names):
    vocab, tokens = tokenization(label)
    vocab = snow_ball_stemmer(vocab)
    if csv == "none":
        vocab.update(none_vocab)
    vocab = pd.Series(list(vocab))
    vocab.to_csv(f"./labels/{csv}.csv", index=False)
    tokens.to_csv(f"./tokens/{csv}_tokens.csv", index=False)
