# Preprocessing

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
import spacy
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Data Loading

In [3]:
# TODO: load the dataset from the repository instead of a local JSON file
df = pd.read_json('./datas/training_set.json')

# Sentences and their intent labels as two index-aligned plain lists
sentences = df['sentence'].tolist()
labels = df['intent'].tolist()

# Quick look at the data: one sample sentence, the first three labels and
# the position of the longest sentence (measured in characters)
print(sentences[217])
print(labels[:3])
print(max(range(len(sentences)), key=lambda i: len(sentences[i])))

:)
['irrelevant', 'irrelevant', 'purchase']
2937


## Text preprocessing

In [4]:
# French spaCy pipeline, used below for tokenization and word vectors
nlp = spacy.load('fr_core_news_md')

# Token count of the longest raw sentence. The sentence is looked up again
# here instead of hard-coding its index, so the check stays correct if the
# dataset changes.
print(len(nlp(max(sentences, key=len))))

122


In [5]:
# Positions (in the most recent `clean` input) of the sentences that ended up
# empty after cleaning; used later to drop the matching labels.
to_pop = []

# Remove some special characters and tokenize sentences
def clean(sentences):
    """Strip special characters from each sentence and tokenize it with `nlp`.

    Args:
        sentences: iterable of raw sentence strings.

    Returns:
        List of tokenized sentences (the objects returned by `nlp`), in input
        order, containing only those that still have at least one token.

    Side effects:
        Empties the module-level `to_pop` list, then appends the position of
        every sentence that was excluded because it had no token left.
    """
    # Start from an empty list on every call so that running the pipeline
    # again does not accumulate stale indices. Cleared in place so existing
    # references to `to_pop` keep pointing at the same list.
    to_pop.clear()

    # Keep only spaces, ASCII letters and digits, é è à ê î and the euro sign.
    # NOTE(review): this also removes apostrophes, hyphens and other accented
    # letters (ç, ô, û, ù, ...) -- confirm this is intended for French text.
    special_chars = re.compile(r'[^ A-Za-z0-9éèàêî€]')

    clean_text = []
    for i, s in enumerate(sentences):
        ts = nlp(special_chars.sub('', s))

        # Exclude empty sentences, remembering where they were
        if len(ts) > 0:
            clean_text.append(ts)
        else:
            to_pop.append(i)

    return clean_text

# Transform words into their vector representation
def vectorize(clean_text, vector_size=300):
    """Replace every token of every sentence by its word vector.

    Args:
        clean_text: iterable of tokenized sentences; each token must expose
            the attributes `has_vector` and `vector`.
        vector_size: length of the zero vector used for tokens that have no
            vector. Defaults to 300, the value that used to be hard-coded.

    Returns:
        A list with one entry per sentence, each entry being the list of
        that sentence's token vectors.
    """
    # One shared zero vector stands in for every token without a vector
    null_vector = np.zeros(vector_size)

    return [
        [w.vector if w.has_vector else null_vector for w in s]
        for s in clean_text
    ]

# Pad the sentences so that they all have the same length
def pad(sequences):
    """Pad `sequences` with Keras `pad_sequences`.

    Padding is appended after each sequence (`padding='post'`) and the result
    is requested as float32; every other `pad_sequences` option keeps its
    default.
    """
    padded = pad_sequences(sequences, padding='post', dtype='float32')
    return padded

# Full pipeline: raw sentences -> cleaned docs -> word vectors -> padded array
def preprocess_sentences(sentences):
    """Run `clean`, `vectorize` and `pad` in sequence on `sentences`.

    Returns the padded result as a numpy array.
    """
    docs = clean(sentences)
    vectors = vectorize(docs)
    padded = pad(vectors)
    return np.asarray(padded)
    
# Show the size of a doc object followed by its tokens
def printDoc(doc):
    """Print the number of tokens in `doc`, then each token's text on its own line."""
    report = [f'length: {len(doc)}']
    report.extend(str(token.text) for token in doc)
    print('\n'.join(report))

In [6]:
# Run the whole pipeline (clean -> vectorize -> pad) on the training sentences.
# Side effect: `clean` records in `to_pop` the indices of the sentences it
# dropped as empty; the label-filtering cell further down relies on that list.
preprocessed_sentences = preprocess_sentences(sentences)
# 3-D array; the recorded run printed (6032, 107, 300)
print(preprocessed_sentences.shape)

(6032, 107, 300)


## Intents preprocessing

In [7]:
# Every known intent, listed in the same order as the model's output
intents = [
    "find-train",
    "irrelevant",
    "find-flight",
    "find-restaurant",
    "purchase",
    "find-around-me",
    "provide-showtimes",
    "find-hotel",
]

# One-hot encode a label given by its string representation
def label2vec(label):
    """Return a vector of zeros, as long as `intents`, with a 1 at the label's position.

    `label` must be one of the strings in `intents`.
    """
    assert label in intents

    one_hot = np.zeros(len(intents))
    one_hot[intents.index(label)] = 1
    return one_hot

In [9]:
# Drop the labels of the sentences that `clean` discarded as empty, so that
# labels stay aligned with the preprocessed sentences. The kept labels are
# rebuilt from the dataframe instead of being popped in place: re-running
# this cell therefore never removes additional labels.
dropped = set(to_pop)
labels = [label for i, label in enumerate(df['intent']) if i not in dropped]

# One-hot encode every remaining label
preprocessed_labels = np.asarray([label2vec(label) for label in labels])
print(preprocessed_labels.shape)

(6032, 8)


## Save preprocessed data

In [48]:
# Experiment: remove determiners from a sentence and rebuild its text.
# The readable tag `pos_ == 'DET'` replaces the numeric part-of-speech id 90.
s = nlp("bonjour les beaux enfants 10")
parsed = [token for token in s if token.pos_ != 'DET']
print(parsed)
" ".join(token.text for token in parsed)

[bonjour, beaux, enfants, 10]


'bonjour beaux enfants 10'