## KeyPhrase extraction
* Use Inspec dataset (abstracts) - [train](https://github.com/boudinfl/ake-datasets/blob/master/datasets/Inspec/train/) dataset for supervised training
* Use Inspec dataset (abstracts) - [test](https://github.com/boudinfl/ake-datasets/blob/master/datasets/Inspec/test/) for inference

In [1]:
import re
import operator
import json
import numpy as np
import pandas as pd
from pathlib import Path
from glob import glob
from nltk import ngrams
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from xml.etree import ElementTree
from collections import Counter
from math import log

from tqdm import tqdm_notebook as tqdm

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 500)

In [2]:
from nltk.stem.snowball import SnowballStemmer
sno = SnowballStemmer('english')

In [3]:
def read(directory):
    docs = {}
    for doc_path in tqdm(glob(f'{directory}/*.xml')):
        doc = ElementTree.parse(doc_path)
        sentences = []
        for sentence in doc.find('document').find('sentences').findall('sentence'):
            sentences.append(' '.join([token.find('lemma').text.lower() + '~' + token.find('POS').text
                                       for token in sentence.find('tokens').findall('token')]))

        docs[doc_path.split('/')[-1].split('.')[0]] = '\n'.join(sentences)
    return docs

In [4]:
train_sentences = read('ake-datasets/datasets/Inspec/train')
test_sentences = read('ake-datasets/datasets/Inspec/test')
len(train_sentences), len(test_sentences)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




(1000, 500)

### Classification method
* Select all unigrams, bigrams, trigrams from the doc
* Create a pandas data frame with
    * Features: TF, IDF, TF-IDF, BM25, length of the token
    * Class: 1 if it's a target keyphrase 0 if not
* Use xgboost for the classification

In [5]:
pattern = re.compile(r'(((\w+~JJ)* (\w+~NN)+ (\w+~IN))?(\w+~JJ)+ (\w+~NN)+)+')

In [6]:
train_candidates = {doc_id: [candidate[0] for candidate in re.findall(pattern, doc)] for doc_id, doc in train_sentences.items()}
train_candidates = {doc_id: [' '.join([w.split('~')[0] for w in candidate.split()]) for candidate in candidates] for doc_id, candidates in train_candidates.items()}
train_sentences = {doc_id: ' '.join([w.split('~')[0] for w in sentences.split()]) for doc_id, sentences in train_sentences.items()}
train_frequencies = {doc_id: Counter(
                                [' '.join(gram) for gram in ngrams(doc.split(), 1)] + \
                                [' '.join(gram) for gram in ngrams(doc.split(), 2)] + \
                                [' '.join(gram) for gram in ngrams(doc.split(), 3)])
                    for doc_id, doc in train_sentences.items()}

test_candidates = {doc_id: [candidate[0] for candidate in re.findall(pattern, doc)] for doc_id, doc in test_sentences.items()}
test_candidates = {doc_id: [' '.join([w.split('~')[0] for w in candidate.split()]) for candidate in candidates] for doc_id, candidates in test_candidates.items()}
test_sentences = {doc_id: ' '.join([w.split('~')[0] for w in sentences.split()]) for doc_id, sentences in test_sentences.items()}
test_frequencies = {doc_id: Counter(
                                [' '.join(gram) for gram in ngrams(doc.split(), 1)] + \
                                [' '.join(gram) for gram in ngrams(doc.split(), 2)] + \
                                [' '.join(gram) for gram in ngrams(doc.split(), 3)])
                    for doc_id, doc in test_sentences.items()}


In [7]:
def tf(d, t, frequencies):
    return 1. * frequencies[d][t] / frequencies[d].most_common(1)[0][1]

def idf(t, frequencies):
    N = 1. * len(frequencies)
    nt = sum(1 for doc in frequencies.values() if t in doc)
    return log(N / nt) if N != 0 and nt != 0 else 0

dls = {}
def bm25(t, d, frequencies, background_frequencies, k1=1.2, b=0.75):
    """
    :param t: term
    :param d: document-id in test dataset
    
    ftd = f(t, d): term frequency
    avgdl = mean([len(doc) for doc in train])
    N = len(train)
    nt = n(t) = sum(1 for doc in train if t in doc)
    """
    N = len(background_frequencies)
    nt = sum(1 for doc in background_frequencies.values() if t in doc)
    # Dangerous but works for our train/test split
    if len(background_frequencies) not in dls:
        dls[len(background_frequencies)] = np.mean([sum(freq.values()) for freq in background_frequencies.values()])
    avgdl = dls[len(background_frequencies)]
    
    ftd = 1. * frequencies[d][t] / frequencies[d].most_common(1)[0][1]
    ld = sum(frequencies[d].values())
    
    tf = (ftd * (k1 + 1)) / (ftd + k1 * (1 - b + b * ld / avgdl))
    idf = log((N - nt + 0.5) / (nt + 0.5))
    return tf * idf

In [8]:
tf(t='datum', d='1390', frequencies=train_frequencies), \
idf(t='datum', frequencies=train_frequencies), \
bm25(t='datum', d='1390', frequencies=train_frequencies, background_frequencies=train_frequencies)

(0.8, 1.9105430052180221, 1.8559195292811537)

In [9]:
train_data = pd.DataFrame([
    {'id': doc_id + ':' + str(i), 'token': candidate} 
    for doc_id, candidates in train_candidates.items()
        for i, candidate in enumerate(candidates)
])
train_data.set_index('id', inplace=True)
print(train_data.shape)
train_data.head(100)

(10540, 1)


Unnamed: 0_level_0,token
id,Unnamed: 1_level_1
1390:0,many organisation
1390:1,valuable asset
1390:2,wide datum
604:0,geographic area
604:1,preventive inspection
604:2,new application
604:3,same cost
604:4,preferred solution
604:5,geographical information
604:6,geographic area


In [10]:
test_data = pd.DataFrame([
    {'id': doc_id + ':' + str(i), 'token': candidate} 
    for doc_id, candidates in test_frequencies.items()
        for i, candidate in enumerate(candidates)
])
test_data.set_index('id', inplace=True)
print(test_data.shape)
test_data.head()

(161326, 1)


Unnamed: 0_level_0,token
id,Unnamed: 1_level_1
2128:0,a
2128:1,optimal
2128:2,control
2128:3,algorithm
2128:4,base


In [11]:
train_data['tf'] = [tf(d=i.split(':')[0], t=row['token'], frequencies=train_frequencies) for i, row in tqdm(train_data.iterrows(), total=len(train_data))]
train_data['idf'] = [idf(t=row['token'], frequencies=train_frequencies) for i, row in tqdm(train_data.iterrows(), total=len(train_data))]
train_data['tf-idf'] = train_data['tf'] * train_data['idf']
train_data['bm25'] = [bm25(t=row['token'], d=i.split(':')[0], frequencies=train_frequencies, background_frequencies=train_frequencies) for i, row in tqdm(train_data.iterrows(), total=len(train_data))]
train_data['len'] = [len(row['token']) for i, row in tqdm(train_data.iterrows(), total=len(train_data))]
train_data.head()

HBox(children=(IntProgress(value=0, max=10540), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10540), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10540), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10540), HTML(value='')))




Unnamed: 0_level_0,token,tf,idf,tf-idf,bm25,len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1390:0,many organisation,0.2,6.907755,1.381551,2.705831,17
1390:1,valuable asset,0.2,6.907755,1.381551,2.705831,14
1390:2,wide datum,0.0,0.0,0.0,0.0,10
604:0,geographic area,0.25,6.907755,1.726939,2.181924,15
604:1,preventive inspection,0.083333,6.907755,0.575646,0.809643,21


In [12]:
test_data['tf'] = [tf(d=i.split(':')[0], t=row['token'], frequencies=test_frequencies) for i, row in tqdm(test_data.iterrows(), total=len(test_data))]
test_data['idf'] = [idf(t=row['token'], frequencies=test_frequencies) for i, row in tqdm(test_data.iterrows(), total=len(test_data))]
test_data['tf-idf'] = test_data['tf'] * test_data['idf']
test_data['bm25'] = [bm25(t=row['token'], d=i.split(':')[0], frequencies=test_frequencies, background_frequencies=train_frequencies) for i, row in tqdm(test_data.iterrows(), total=len(test_data))]
test_data['len'] = [len(row['token']) for i, row in tqdm(test_data.iterrows(), total=len(test_data))]
test_data.head()

HBox(children=(IntProgress(value=0, max=161326), HTML(value='')))




HBox(children=(IntProgress(value=0, max=161326), HTML(value='')))




HBox(children=(IntProgress(value=0, max=161326), HTML(value='')))




HBox(children=(IntProgress(value=0, max=161326), HTML(value='')))




Unnamed: 0_level_0,token,tf,idf,tf-idf,bm25,len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2128:0,a,1.0,0.070422,0.070422,-3.176262,1
2128:1,optimal,0.166667,2.847312,0.474552,1.273193,7
2128:2,control,0.333333,2.024953,0.674984,1.09698,7
2128:3,algorithm,0.333333,1.883875,0.627958,1.16256,9
2128:4,base,0.166667,1.48722,0.24787,0.526747,4


In [13]:
with open('ake-datasets/datasets/Inspec/references/train.uncontr.json', 'r') as f:
    target = json.load(f)
    target = {doc_id: [k[0] for k in keyphrases] for doc_id, keyphrases in target.items()}
train_data['class'] = [int(row['token'] in target[i.split(':')[0]]) for i, row in tqdm(train_data.iterrows(), total=len(train_data))]
train_data.head()

HBox(children=(IntProgress(value=0, max=10540), HTML(value='')))




Unnamed: 0_level_0,token,tf,idf,tf-idf,bm25,len,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1390:0,many organisation,0.2,6.907755,1.381551,2.705831,17,0
1390:1,valuable asset,0.2,6.907755,1.381551,2.705831,14,0
1390:2,wide datum,0.0,0.0,0.0,0.0,10,0
604:0,geographic area,0.25,6.907755,1.726939,2.181924,15,0
604:1,preventive inspection,0.083333,6.907755,0.575646,0.809643,21,0


In [14]:
with open('ake-datasets/datasets/Inspec/references/test.uncontr.json', 'r') as f:
    target = json.load(f)
    target = {doc_id: [k[0] for k in keyphrases] for doc_id, keyphrases in target.items()}
test_data['class'] = [int(row['token'] in target[i.split(':')[0]]) for i, row in tqdm(test_data.iterrows(), total=len(test_data))]
test_data.head()

HBox(children=(IntProgress(value=0, max=161326), HTML(value='')))




Unnamed: 0_level_0,token,tf,idf,tf-idf,bm25,len,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2128:0,a,1.0,0.070422,0.070422,-3.176262,1,0
2128:1,optimal,0.166667,2.847312,0.474552,1.273193,7,0
2128:2,control,0.333333,2.024953,0.674984,1.09698,7,0
2128:3,algorithm,0.333333,1.883875,0.627958,1.16256,9,0
2128:4,base,0.166667,1.48722,0.24787,0.526747,4,0


## xgboost

In [15]:
import xgboost as xgb

In [16]:
X_train = train_data.loc[:, ~train_data.columns.isin(['class', 'id', 'token'])].values
y_train = train_data['class'].values
X_test = test_data.loc[:, ~test_data.columns.isin(['class', 'id', 'token'])].values
y_test = test_data['class'].values


D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)

In [17]:
y_train.shape

(10540,)

In [18]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)

weight = [class_weights[c] for c in y_train]

In [19]:
model = xgb.XGBClassifier(max_depth=5, gpu_id=0)
model.fit(X_train, y_train,
          sample_weight=weight,
          eval_set=[(X_test, y_test)],
          eval_metric='logloss', 
          verbose=True, 
          early_stopping_rounds=10)

[0]	validation_0-logloss:0.65981
Will train until validation_0-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.63077
[2]	validation_0-logloss:0.60614
[3]	validation_0-logloss:0.58604
[4]	validation_0-logloss:0.57053
[5]	validation_0-logloss:0.55516
[6]	validation_0-logloss:0.54226
[7]	validation_0-logloss:0.53500
[8]	validation_0-logloss:0.52804
[9]	validation_0-logloss:0.51767
[10]	validation_0-logloss:0.51128
[11]	validation_0-logloss:0.50338
[12]	validation_0-logloss:0.49903
[13]	validation_0-logloss:0.49523
[14]	validation_0-logloss:0.49152
[15]	validation_0-logloss:0.48576
[16]	validation_0-logloss:0.48350
[17]	validation_0-logloss:0.47882
[18]	validation_0-logloss:0.47668
[19]	validation_0-logloss:0.47330
[20]	validation_0-logloss:0.47193
[21]	validation_0-logloss:0.46771
[22]	validation_0-logloss:0.46500
[23]	validation_0-logloss:0.46110
[24]	validation_0-logloss:0.46008
[25]	validation_0-logloss:0.45870
[26]	validation_0-logloss:0.45753
[27]	validation_0-loglos

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='auto',
              verbosity=1)

In [20]:
res = model.predict(X_test, output_margin=True)
res_pred = {(i.split(':')[0], row['token']):  prob for (i, row), prob in zip(test_data.iterrows(), res)}

def score(t, d):
    """
    :param t: term
    :param d: document-id in test dataset
    """
    return res_pred[(d, t)] if (d, t) in res_pred else 0

score('literature', '193'), score('bikct', '193')

(-4.8050146, 0)

In [21]:
score('out-of-print', '193')

-0.9625013

In [22]:
def extract_keyphrases(doc_id, nb_keywords=5):
    scores = {candidate: score(candidate, doc_id) for candidate in test_candidates[doc_id]}
    scores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[:nb_keywords]
    return [keyphrase for keyphrase, score in scores]

predictions = {doc_id: extract_keyphrases(doc_id, nb_keywords=5) for doc_id, doc in tqdm(test_sentences.items())}

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




## Evaluate

In [23]:
predictions = {doc_id: [sno.stem(candidate) for candidate in candidates] for doc_id, candidates in predictions.items()}
target = {doc_id: [sno.stem(candidate) for candidate in candidates] for doc_id, candidates in target.items()}

In [24]:
predictions['193'], target['193']

(['strong collect',
  'weak collect',
  'print materi',
  'print literatur',
  'used materi'],
 ['out-of-print materi',
  'recurring issu',
  'changing practic',
  'out-of-print book',
  'library materi',
  'acquisit'])

In [25]:
def avg_precisoin(pred, targ):
    res, nb_correct = 0, 0
    for i, p in enumerate(pred):
        if p in targ:
            nb_correct += 1
            res += nb_correct / (i + 1)
    return res / len(targ)

In [26]:
results = []
for doc_id in sorted(predictions.keys()):
    p = set(predictions[doc_id])
    t = set(target[doc_id])

    # We always predict 5 keywords
    precision = 0 if len(p) == 0 else len(p.intersection(t)) / len(p)
    recall = 0 if len(t) == 0 else len(p.intersection(t)) / len(t)
    results.append({
        'doc_id':      doc_id,
        'precision':   precision,
        'recall':      recall,
        'f1':          0 if (precision + recall) == 0 else 2 * precision * recall / (precision + recall),
        'precision@5': len(p.intersection(t)) / 5.,
        'av_prec':     avg_precisoin(p, t)
    })

results = pd.DataFrame(results)
results.set_index('doc_id', inplace=True)

print('Precision: {:.2f} Recall: {:.2f} F1: {:.2f}   precision@5: {:.2f}  MAP: {:.2f}'.format(
    results["precision"].mean(),
    results["recall"].mean(),
    results["f1"].mean(),
    results["precision@5"].mean(),
    results["av_prec"].mean()
))
print('--------------Mean-------------')
results

Precision: 0.21 Recall: 0.11 F1: 0.14   precision@5: 0.19  MAP: 0.07
--------------Mean-------------


Unnamed: 0_level_0,precision,recall,f1,precision@5,av_prec
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
193,0.0,0.0,0.0,0.0,0.0
1930,0.2,0.125,0.153846,0.2,0.125
1931,0.4,0.4,0.4,0.4,0.4
1932,0.4,0.25,0.307692,0.4,0.125
1933,0.333333,0.2,0.25,0.2,0.2
1934,0.2,0.166667,0.181818,0.2,0.166667
1935,0.4,0.333333,0.363636,0.4,0.108333
1936,0.4,0.285714,0.333333,0.4,0.214286
1937,0.0,0.0,0.0,0.0,0.0
1938,0.25,0.333333,0.285714,0.2,0.166667
