## KeyPhrase extraction
* Use SemEval 2010 dataset - [train](https://github.com/boudinfl/ake-datasets/blob/master/datasets/SemEval-2010/train/) dataset for TF-IDF vectorization
* Use SemEval 2010 dataset - [test](https://github.com/boudinfl/ake-datasets/blob/master/datasets/SemEval-2010/test/) for inference

In [1]:
import re
import operator
import json
import numpy as np
from pathlib import Path
from glob import glob
from nltk import ngrams
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from xml.etree import ElementTree
from collections import Counter

from tqdm import tqdm_notebook as tqdm

In [2]:
from nltk.stem.snowball import SnowballStemmer
sno = SnowballStemmer('english')

In [3]:
def read(directory):
    docs = {}
    for doc_path in tqdm(glob(f'{directory}/*.xml')):
        doc = ElementTree.parse(doc_path)
        sentences = []
        for sentence in doc.find('document').find('sentences').findall('sentence'):
            sentences.append(' '.join([token.find('lemma').text.lower() + '~' + token.find('POS').text
                                       for token in sentence.find('tokens').findall('token')]))

        docs[doc_path.split('/')[-1].split('.')[0]] = '\n'.join(sentences)
    return docs

In [4]:
train_sentences = read('ake-datasets/datasets/SemEval-2010/train')
test_sentences = read('ake-datasets/datasets/SemEval-2010/test')
len(train_sentences), len(test_sentences)

HBox(children=(IntProgress(value=0, max=144), HTML(value='')))




HBox(children=(IntProgress(value=0), HTML(value='')))




(144, 100)

In [5]:
pattern = re.compile(r'(((\w+~JJ)* (\w+~NN)+ (\w+~IN))?(\w+~JJ)+ (\w+~NN)+)+')

In [6]:
train_candidates = {doc_id: [candidate[0] for candidate in re.findall(pattern, doc)] for doc_id, doc in train_sentences.items()}
train_candidates = {doc_id: [' '.join([w.split('~')[0] for w in candidate.split()]) for candidate in candidates] for doc_id, candidates in train_candidates.items()}
train_sentences = {doc_id: ' '.join([w.split('~')[0] for w in sentences.split()]) for doc_id, sentences in train_sentences.items()}
train_frequencies = {doc_id: Counter(
                                [' '.join(gram) for gram in ngrams(doc.split(), 1)] + \
                                [' '.join(gram) for gram in ngrams(doc.split(), 2)] + \
                                [' '.join(gram) for gram in ngrams(doc.split(), 3)])
                    for doc_id, doc in train_sentences.items()}



In [7]:
test_candidates = {doc_id: [candidate[0] for candidate in re.findall(pattern, doc)] for doc_id, doc in test_sentences.items()}
test_candidates = {doc_id: [' '.join([w.split('~')[0] for w in candidate.split()]) for candidate in candidates] for doc_id, candidates in test_candidates.items()}
test_sentences = {doc_id: ' '.join([w.split('~')[0] for w in sentences.split()]) for doc_id, sentences in test_sentences.items()}
test_frequencies = {doc_id: Counter(
                                [' '.join(gram) for gram in ngrams(doc.split(), 1)] + \
                                [' '.join(gram) for gram in ngrams(doc.split(), 2)] + \
                                [' '.join(gram) for gram in ngrams(doc.split(), 3)])
                    for doc_id, doc in test_sentences.items()}

In [8]:
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 3))
trainvec = vectorizer.fit_transform(train_sentences.values())
feature_names = vectorizer.get_feature_names()

In [9]:
with open('ake-datasets/datasets/SemEval-2010/references/test.author.stem.json', 'r') as f:
    target = json.load(f)
    target = {doc_name: [k[0] for k in keyphrases] for doc_name, keyphrases in target.items()}

In [10]:
target['C-14']

['collabor target detect',
 'deploy',
 'exposur',
 'sensor network',
 'valu fusion']

In [11]:
test_candidates['C-14']

['collaborative target',
 'certain level',
 'sequential deployment',
 'recent advance',
 'specific event',
 'specific region',
 'such system',
 'civil infrastructure',
 'other means',
 'random placement',
 'local observation',
 'global decision',
 'local processing',
 'different node',
 'local observation',
 'possible measure',
 'practical application',
 'precise placement',
 'random deployment',
 'several solution',
 'analytical study',
 'optimum solution',
 'rectangular sensor',
 'geometric distance',
 'particular measurement',
 'total energy',
 'basic approach',
 'other sensor',
 'individual sensor',
 'local decision',
 'local decision',
 'additive white',
 'square distribution',
 'false target',
 'false alarm',
 'square distribution',
 'square distribution',
 'false target',
 'above equation',
 'analytic basis',
 'false alarm',
 'such algorithm',
 'unauthorized activity',
 'unauthorized traversal',
 'stochastic characterization',
 'false alarm',
 'east periphery',
 'net probability

In [12]:
# TODO:
# * Implement score function
# * Use some other dataset instead of SemEval
# * Debug the scores for tfidf

from math import log
def score(t, d, k1=1.2, b=0.75):
    """
    :param t: term
    :param d: document-id in test dataset
    
    ftd = f(t, d): term frequency
    avgdl = mean([len(doc) for doc in train])
    N = len(train)
    nt = n(t) = sum(1 for doc in train if t in doc)
    """
    N = len(train_sentences)
    nt = sum(1 for doc in train_frequencies if t in doc)
    avgdl = np.mean([sum(frequencies.values()) for frequencies in test_frequencies.values()])
    ftd = 1. * test_frequencies[d][t] / test_frequencies[d].most_common(1)[0][1]
    ld = sum(test_frequencies[d].values())
    
    tf = (ftd * (k1 + 1)) / (ftd + k1 * (1 - b + b * ld / avgdl))
    idf = log((N - nt + 0.5) / (nt + 0.5))
    return tf * idf

In [13]:
score('active learning', 'H-11')

0.839753401748323

In [14]:
def extract_keyphrases(doc_id, nb_keywords=5):
    scores = {candidate: score(candidate, doc_id) for candidate in test_candidates[doc_id]}
    scores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[:nb_keywords]
    return [keyphrase for keyphrase, score in scores]

In [None]:
predictions = {}
for doc_id, doc in tqdm(test_sentences.items()):
    keyphrases = extract_keyphrases(doc_id, nb_keywords=5)
    predictions[doc_id] = keyphrases

HBox(children=(IntProgress(value=0), HTML(value='')))

In [None]:
predictions = {doc_id: [sno.stem(candidate) for candidate in candidates] for doc_id, candidates in predictions.items()}
target = {doc_id: [sno.stem(candidate) for candidate in candidates] for doc_id, candidates in target.items()}

In [None]:
predictions['C-34'], target['C-34']

In [None]:
precision, recall, f1, precision_5 = [], [], [], []
for doc_id in sorted(predictions.keys()):
    p = set(predictions[doc_id])
    t = set(target[doc_id])
    at_5 = set(target[doc_id][:5])

    # We always predict 5 keywords
    precision.append(len(p.intersection(t)) / len(p))
    recall.append(len(p.intersection(t)) / len(t))
    f1.append(0 if precision[-1] + recall[-1] == 0 else 2 * precision[-1] * recall[-1] / (precision[-1] + recall[-1]))
    precision_5.append(len(p.intersection(at_5)) / len(p))
    print(f'{doc_id:5} -> Precision: {precision[-1]:.2f} Recall: {recall[-1]:.2f} F1: {f1[-1]:.2f} precision@5: {precision_5[-1]:.2f}')

print()
print('--------------Mean-------------')
print(f'Precision: {np.mean(precision):.2f} Recall: {np.mean(recall):.2f} F1: {np.mean(f1):.2f}   precision@5: {np.mean(precision_5):.2f}')