In [1]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import json

# Import created data pipelines
from nlp_pipelines import *
# Keep each pipeline's display name next to its callable so the two lists
# derived below always share the same order.
_named_pipelines = [
    ("nltk_stemmer", nltk_stemmer),
    ("nltk_no_POS_lemmatizer", nltk_no_POS_lemmatizer),
    ("nltk_POS_lemmatizer", nltk_POS_lemmatizer),
]
pipelines = [pipeline_fn for _, pipeline_fn in _named_pipelines]
pipeline_names = [label for label, _ in _named_pipelines]

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Necessary downloads
# `quiet=True` suppresses the downloader's messages, so keep every call's
# return value instead of only the last one; the cell then displays one status
# per resource and a failed fetch is visible here rather than later.
# NOTE(review): assumes nltk.download returns a truthy value on success (the
# previous cell output was `True`) — confirm against the installed NLTK version.
nltk_download_status = {
    resource: nltk.download(resource, quiet=True)
    for resource in ('punkt', 'wordnet')
}
nltk_download_status

True

In [3]:
# Preprocessing intents data
# Read the intent definitions inside a context manager so the file handle is
# closed deterministically, and decode explicitly as UTF-8 instead of relying
# on the platform's default encoding.
with open('intents.json', encoding='utf-8') as intents_file:
    intents = json.load(intents_file)

results = [[] for _ in range(len(pipelines))]  # one flat list of processed tokens per pipeline, for comparing pipelines
words = []  # all possible words in intents vocabulary
classes = []  # tags
docs = []  # pairs of tokenized word patterns and corresponding classes

for intent in intents['intents']:
    for pattern in intent['patterns']:
        tokens = nltk.word_tokenize(pattern)
        # NOTE(review): `not in "?!,."` is a substring test, so it also drops
        # multi-character runs such as "?!". Left unchanged because `words` is
        # later compared position-by-position with the pipeline outputs, which
        # presumably filter punctuation the same way — confirm in nlp_pipelines.
        words.extend([token for token in tokens if token not in "?!,."])
        docs.append((tokens, intent['tag']))

        # Apply pipelines to pattern and save results for comparison
        for pipeline_result, pipeline_fn in zip(results, pipelines):
            pipeline_result.extend(pipeline_fn(pattern))

        # Record each tag once, in order of first appearance
        if intent['tag'] not in classes:
            classes.append(intent['tag'])


We now evaluate the pipelines by comparing the pipeline results to the unprocessed list of tokens.

In [4]:
# For every pipeline, list each raw token whose processed form differs from it
# (case-insensitively). Tokens are matched by position, so each pipeline's
# result list is expected to line up one-to-one with `words`.
for pipeline_idx in range(len(results)):
    processed_tokens = results[pipeline_idx]
    print(f"***{pipeline_names[pipeline_idx]}***")
    # Print which words were modified
    for token_idx in range(len(words)):
        raw_token = words[token_idx]
        processed_lower = processed_tokens[token_idx].lower()
        if raw_token.lower() == processed_lower:
            continue  # unchanged apart from letter case
        print(f"{raw_token} => {processed_lower}")
    print("~"*25)

***nltk_stemmer***
there => ther
Whats => what
this => thi
Resume => resum
this => thi
Give => giv
summary => sum
Ilia => il
Briefly => brief
tell => tel
little => littl
Give => giv
Ilia => il
experiences => expery
qualifications => qual
Tell => tel
Ilia => il
Whats => what
Ilia => il
Ilia => il
skills => skil
does => doe
Ilia => il
have => hav
does => doe
your => yo
like => lik
tools => tool
languages => langu
Tell => tel
your => yo
some => som
tools => tool
technologies => technolog
use => us
experience => expery
does => doe
Ilia => il
have => hav
Projects => project
Ilia => il
worked => work
your => yo
experience => expery
your => yo
Goodbye => goodby
~~~~~~~~~~~~~~~~~~~~~~~~~
***nltk_no_POS_lemmatizer***
Ilia => ilium
Ilia => ilium
experiences => experience
qualifications => qualification
us => u
Ilia => ilium
Ilia => ilium
Ilia => ilium
skills => skill
does => doe
Ilia => ilium
does => doe
tools => tool
languages => language
tools => tool
technologies => technology
does => doe
Ili

The `nltk_POS_lemmatizer` pipeline produces the most desirable results and will therefore be used to preprocess the chatbot model's input data from here on.