In [None]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.stem import SnowballStemmer
from tqdm import tqdm_notebook as tqdm

from typing import Dict, Tuple
import spacy
from spacy.lang.pt.examples import sentences 

# Raise pandas' display limits so wide frames and long text cells are not truncated.
for option, value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_colwidth', 500),
):
    pd.set_option(option, value)

In [None]:
# spaCy pipeline loaded from the 'pt_core_news_sm' package; used below for tokens, POS tags, lemmas and stop-word flags.
nlp = spacy.load('pt_core_news_sm')
# Parsed XML tree of the knowledge base; the path is relative to the notebook's working directory.
doc = ET.parse('Project1/KB.xml')
# Snowball stemmer configured for Portuguese.
sno = SnowballStemmer('portuguese')

## Construct the dataset
* Some questions appear more than once and map to different answers. In those cases we keep only the first question -> answer mapping.

In [None]:
# Build one (question, answer, answer_id) record per distinct question in the KB.
records = []
# question text -> (answer id, answer text); also used to detect repeated questions.
# The id is read from an XML attribute, so it is a string.
reference: Dict[str, Tuple[str, str]] = {}

for document in doc.findall('documento'):
    for faq in document.find('faq_list').findall('faq'):
        answer_el = faq.find('resposta')
        idx = answer_el.attrib['id']
        # `.text` is None for an element without text content, so fall back to ''.
        answer = (answer_el.text or '').strip()

        for question_el in faq.find('perguntas').findall('pergunta'):
            question = (question_el.text or '').strip()
            # Skip empty questions and keep only the first mapping of a repeated question.
            if not question or question in reference:
                continue
            records.append((question, answer, idx))
            reference[question] = (idx, answer)

data = pd.DataFrame(records, columns=['question', 'answer', 'answer_id'])

In [None]:
# Derive several textual views of every question; each list becomes one column of `data`.
filtered = []             # lower-cased alphabetic, non-stop-word tokens
annotated = []            # every lower-cased token followed by its POS tag
filter_annotated = []     # filtered tokens followed by their POS tag
tokenized_questions = []  # every lower-cased token
lemma_filtered = []       # lower-cased lemmas of the filtered tokens
stem_filtered = []        # Snowball stems of the filtered tokens
stem = []                 # Snowball stems of every token

# `parsed` (not `doc`) is used for the spaCy result so the XML tree bound to `doc` is not overwritten.
for i, row in tqdm(data.iterrows(), total=len(data)):
    parsed = nlp(row['question'])
    content_tokens = [token for token in parsed if token.is_alpha and not token.is_stop]

    tokenized_questions.append(' '.join(token.text.lower() for token in parsed))
    annotated.append(' '.join(f'{token.text.lower()} {token.pos_}' for token in parsed))
    filtered.append(' '.join(token.text.lower() for token in content_tokens))
    filter_annotated.append(' '.join(f'{token.text.lower()} {token.pos_}' for token in content_tokens))
    lemma_filtered.append(' '.join(token.lemma_.lower() for token in content_tokens))
    stem_filtered.append(' '.join(sno.stem(token.text.lower()) for token in content_tokens))
    stem.append(' '.join(sno.stem(token.text.lower()) for token in parsed))

# NOTE: `question` is overwritten with its tokenized form, so re-running this cell
# re-tokenizes text that has already been tokenized.
data['question'] = tokenized_questions
data['filtered'] = filtered
data['annotated'] = annotated
data['filter_annotated'] = filter_annotated
# Fixed: this column previously received `filter_annotated`, so the lemmas were never used.
data['lemma_filtered'] = lemma_filtered
data['stem_filtered'] = stem_filtered
data['stem'] = stem
data.head()

In [None]:
# A fixed seed makes the 70/30 split, and every accuracy computed from it, reproducible.
train, test = train_test_split(data, test_size=0.3, random_state=42)
# Re-bind to sorted copies instead of sorting the split outputs in place.
train = train.sort_index()
test = test.sort_index()
print(f'{len(data)} -> {len(train)}, {len(test)}')
data.head()

In [None]:
# Make answers that don't show up in the train data appear as `answer_id = 0` in test
# A set gives constant-time membership checks.
available_answer_ids = set(train['answer_id'])
unseen_mask = ~test['answer_id'].isin(available_answer_ids)
for i in test.index[unseen_mask]:
    print('No available id for row:', i)
# 0 is the "no answer available" label that query() also returns when nothing is close enough.
test.loc[unseen_mask, 'answer_id'] = 0
test.head()

## Experiments
* Word-level distance measures: `nltk.metrics.distance.*`
* Scale the distance by the length of the sentence (a few differences are acceptable in long sentences)
* Annotate the initial sentence with spaCy (POS tags, etc.)
* Remove stop words, punctuation, and every token that is not alphabetic

In [None]:
def query(q, corpus, distance, threshold=10, column='question'):
    """Return the answer_id of the corpus entry whose `column` text is closest to `q`.

    Args:
        q: Query text, in the same representation as `corpus[column]`.
        corpus: DataFrame with at least the columns `column` and 'answer_id'.
        distance: Callable `(q, candidate) -> number`; smaller means more similar.
        threshold: If the best distance is greater than or equal to this value,
            the query is treated as unanswerable.
        column: Name of the corpus column to compare against.

    Returns:
        `(answer_id, best_distance)`. `answer_id` is 0 when the best distance
        reaches `threshold`. For an empty corpus the result is `(0, inf)`.
    """
    candidates = corpus[column].tolist()
    if not candidates:
        # Nothing to compare against: report "no answer" instead of failing in argmin.
        return 0, float('inf')

    distances = [distance(q, candidate) for candidate in candidates]
    # argmin returns the first minimum, so ties go to the earliest corpus row.
    best_match = int(np.argmin(distances))
    best_distance = distances[best_match]

    if best_distance >= threshold:
        return 0, best_distance
    return corpus.iloc[best_match]['answer_id'], best_distance

In [None]:
# text -> TF-IDF vector, so every string is transformed at most once per fitted vectorizer.
cache = {}


def tfidf(a, b):
    """Return 1 minus the linear-kernel similarity of the TF-IDF vectors of `a` and `b`.

    Reads the module-level `vectorizer` (which must already be fitted) and
    memoizes the transformed vectors in the module-level `cache`.
    """
    for text in (a, b):
        if text not in cache:
            cache[text] = vectorizer.transform([text])
    similarity = linear_kernel(cache[a], cache[b]).flatten()[0]
    return 1 - similarity


def _jaccard(a, b):
    """Jaccard distance between the sets of whitespace-separated tokens of `a` and `b`."""
    # NOTE(review): presumably fails when both strings are empty (empty union) — confirm.
    return jaccard_distance(set(a.split()), set(b.split()))


def _token_edit_distance(a, b):
    """Edit distance computed over whitespace-separated tokens rather than characters."""
    return edit_distance(a.split(), b.split())


# (name, distance function, threshold) triples; each measure is tried with three thresholds.
distance_thresholds = [
    (name, measure, threshold)
    for name, measure, thresholds in (
        ('tf-idf', tfidf, (0.6, 0.7, 0.8)),
        ('jaccard', _jaccard, (0.6, 0.7, 0.8)),
        ('edit_distance', _token_edit_distance, (8, 16, 24)),
    )
    for threshold in thresholds
]

# Text representations (DataFrame columns) the distance measures are evaluated on.
columns = ['question', 'filtered', 'annotated', 'filter_annotated', 'lemma_filtered', 'stem_filtered', 'stem']

## Evaluate

In [None]:
def evaluate(column, distance, threshold, verbose=True):
    """Answer every row of the global `test` frame against `train` and print the accuracy.

    Args:
        column: Text representation (DataFrame column) to compare on.
        distance: Distance callable forwarded to `query`.
        threshold: Rejection threshold forwarded to `query`.
        verbose: When False the progress bar is disabled.

    Returns:
        `(pred, label, distances)`: predicted answer ids, gold answer ids
        (both cast to int) and the best distance found for each test row.
    """
    pred, label, distances = [], [], []
    pairs = zip(test[column], test['answer_id'])
    for text, gold_id in tqdm(pairs, total=len(test), disable=not verbose):
        ans_id, dist = query(text, corpus=train, distance=distance, threshold=threshold, column=column)
        pred.append(int(ans_id))
        distances.append(dist)
        label.append(int(gold_id))

    acc = 100 * accuracy_score(label, pred)
    print(f'ACC: {acc:.2f}')
    return pred, label, distances

In [None]:
# Run every (distance, threshold) pair on every text representation, last column first.
for c in columns[::-1]:
    # tfidf() reads the module-level `cache` and `vectorizer`, so both are rebuilt
    # for each column before any distance is computed.
    cache = {}
    vectorizer = TfidfVectorizer()
    # The vectorizer is fitted on the train split only; `trainvec` is not used again in this cell.
    trainvec = vectorizer.fit_transform(train[c].values)
    
    for distance_name, distance_measure, threshold in distance_thresholds:
        # end='...' keeps the accuracy printed by evaluate() on the same output line.
        print(f'Evaluating dist: {distance_name}, threshold: {threshold}, column: {c}', end='...')
        pred, label, distances = evaluate(column=c, distance=distance_measure, threshold=threshold, verbose=True)
        # Uncomment to inspect the individual predictions:
#         for p, l, d  in zip(pred, label, distances):
#             print(f'pred: {p}, label: {l}, dist: {d}')

In [None]:
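# Recorded output of an earlier run, kept for reference (the numbers depend on the train/test split of that run).
# NOTE: in this log the `lemma_filtered` rows are identical to the `filter_annotated` rows.
# Evaluating dist: tf-idf, threshold: 0.6, column: question...ACC: 79.69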
# Evaluating dist: tf-idf, threshold: 0.7, column: question...ACC: 79.83
# Evaluating dist: tf-idf, threshold: 0.8, column: question...ACC: 79.69
# Evaluating dist: jaccard, threshold: 0.6, column: question...ACC: 69.10
# Evaluating dist: jaccard, threshold: 0.7, column: question...ACC: 71.24
# Evaluating dist: jaccard, threshold: 0.8, column: question...ACC: 71.39
# Evaluating dist: edit_distance, threshold: 8, column: question...ACC: 49.50
# Evaluating dist: edit_distance, threshold: 16, column: question...ACC: 56.22
# Evaluating dist: edit_distance, threshold: 24, column: question...ACC: 57.08
# Evaluating dist: tf-idf, threshold: 0.6, column: filtered...ACC: 76.54
# Evaluating dist: tf-idf, threshold: 0.7, column: filtered...ACC: 76.68
# Evaluating dist: tf-idf, threshold: 0.8, column: filtered...ACC: 76.54
# Evaluating dist: jaccard, threshold: 0.6, column: filtered...ACC: 72.82
# Evaluating dist: jaccard, threshold: 0.7, column: filtered...ACC: 74.68
# Evaluating dist: jaccard, threshold: 0.8, column: filtered...ACC: 75.25
# Evaluating dist: edit_distance, threshold: 8, column: filtered...ACC: 59.37
# Evaluating dist: edit_distance, threshold: 16, column: filtered...ACC: 60.94
# Evaluating dist: edit_distance, threshold: 24, column: filtered...ACC: 61.09
# Evaluating dist: tf-idf, threshold: 0.6, column: annotated...ACC: 79.40
# Evaluating dist: tf-idf, threshold: 0.7, column: annotated...ACC: 79.26
# Evaluating dist: tf-idf, threshold: 0.8, column: annotated...ACC: 79.26
# Evaluating dist: jaccard, threshold: 0.6, column: annotated...ACC: 68.10
# Evaluating dist: jaccard, threshold: 0.7, column: annotated...ACC: 68.10
# Evaluating dist: jaccard, threshold: 0.8, column: annotated...ACC: 68.10
# Evaluating dist: edit_distance, threshold: 8, column: annotated...ACC: 33.05
# Evaluating dist: edit_distance, threshold: 16, column: annotated...ACC: 46.78
# Evaluating dist: edit_distance, threshold: 24, column: annotated...ACC: 49.79
# Evaluating dist: tf-idf, threshold: 0.6, column: filter_annotated...ACC: 76.39
# Evaluating dist: tf-idf, threshold: 0.7, column: filter_annotated...ACC: 76.25
# Evaluating dist: tf-idf, threshold: 0.8, column: filter_annotated...ACC: 76.25
# Evaluating dist: jaccard, threshold: 0.6, column: filter_annotated...ACC: 70.24
# Evaluating dist: jaccard, threshold: 0.7, column: filter_annotated...ACC: 69.67
# Evaluating dist: jaccard, threshold: 0.8, column: filter_annotated...ACC: 69.67
# Evaluating dist: edit_distance, threshold: 8, column: filter_annotated...ACC: 49.21
# Evaluating dist: edit_distance, threshold: 16, column: filter_annotated...ACC: 53.51
# Evaluating dist: edit_distance, threshold: 24, column: filter_annotated...ACC: 54.22
# Evaluating dist: tf-idf, threshold: 0.6, column: lemma_filtered...ACC: 76.39
# Evaluating dist: tf-idf, threshold: 0.7, column: lemma_filtered...ACC: 76.25
# Evaluating dist: tf-idf, threshold: 0.8, column: lemma_filtered...ACC: 76.25
# Evaluating dist: jaccard, threshold: 0.6, column: lemma_filtered...ACC: 70.24
# Evaluating dist: jaccard, threshold: 0.7, column: lemma_filtered...ACC: 69.67
# Evaluating dist: jaccard, threshold: 0.8, column: lemma_filtered...ACC: 69.67
# Evaluating dist: edit_distance, threshold: 8, column: lemma_filtered...ACC: 49.21
# Evaluating dist: edit_distance, threshold: 16, column: lemma_filtered...ACC: 53.51
# Evaluating dist: edit_distance, threshold: 24, column: lemma_filtered...ACC: 54.22
# Evaluating dist: tf-idf, threshold: 0.6, column: stem_filtered...ACC: 78.40
# Evaluating dist: tf-idf, threshold: 0.7, column: stem_filtered...ACC: 78.40
# Evaluating dist: tf-idf, threshold: 0.8, column: stem_filtered...ACC: 78.40
# Evaluating dist: jaccard, threshold: 0.6, column: stem_filtered...ACC: 75.82
# Evaluating dist: jaccard, threshold: 0.7, column: stem_filtered...ACC: 76.82
# Evaluating dist: jaccard, threshold: 0.8, column: stem_filtered...ACC: 76.68
# Evaluating dist: edit_distance, threshold: 8, column: stem_filtered...ACC: 61.23
# Evaluating dist: edit_distance, threshold: 16, column: stem_filtered...ACC: 62.66
# Evaluating dist: edit_distance, threshold: 24, column: stem_filtered...ACC: 62.80
# Evaluating dist: tf-idf, threshold: 0.6, column: stem...ACC: 83.83
# Evaluating dist: tf-idf, threshold: 0.7, column: stem...ACC: 83.69
# Evaluating dist: tf-idf, threshold: 0.8, column: stem...ACC: 83.69
# Evaluating dist: jaccard, threshold: 0.6, column: stem...ACC: 70.10
# Evaluating dist: jaccard, threshold: 0.7, column: stem...ACC: 71.53
# Evaluating dist: jaccard, threshold: 0.8, column: stem...ACC: 71.24
# Evaluating dist: edit_distance, threshold: 8, column: stem...ACC: 48.64
# Evaluating dist: edit_distance, threshold: 16, column: stem...ACC: 54.79
# Evaluating dist: edit_distance, threshold: 24, column: stem...ACC: 55.51