# Create NLP Leaderboards #

We want to take NLP papers and assign them to a leaderboard by task (T), dataset (D), and evaluation metric (M).

In [1]:
# imports
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  from numpy.core.umath_tests import inner1d


In [2]:
# Read the train/test splits (tab-separated, no header row).
# Alternative positive-only splits that can be swapped in:
#   "../../data/exp1/train_positive.tsv" / "../../data/exp1/test_positive.tsv"
train_fn = "../../data/exp1/ablationfull/train.tsv"
test_fn = "../../data/exp1/ablationfull/test.tsv"

split_columns = ['T_F', 'file', 'label', 'all_text']
tmp_train_df = pd.read_csv(train_fn, sep='\t', header=None, names=split_columns)
tmp_test_df = pd.read_csv(test_fn, sep='\t', header=None, names=split_columns)

# Row counts before filtering.
print("Size of train df: ", len(tmp_train_df))
print("Size of test df: ", len(tmp_test_df))

# Discard the rows flagged False in 'T_F'; they are not needed for
# multiclass classification. The comparison is written as `== False` on
# purpose so that exactly the rows equal to False are removed.
tmp_train_df = tmp_train_df.loc[~(tmp_train_df['T_F'] == False)]
tmp_test_df = tmp_test_df.loc[~(tmp_test_df['T_F'] == False)]

# Row counts after filtering.
print("Size of train df: ", len(tmp_train_df))
print("Size of test df: ", len(tmp_test_df))


Size of train df:  13260
Size of test df:  12636
Size of train df:  371
Size of test df:  325


In [3]:
# Merge the per-row labels so that every file maps to {all_text: [labels]}.
unk_limit = 3  # at most this many 'unknow' rows are kept in the training data


def _merge_labels_by_file(df, unk_limit=None):
    """Collect all labels of each file into a nested dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'file', 'label' and 'all_text' columns, one label per row.
    unk_limit : int or None
        When given, only the first `unk_limit` rows labelled 'unknow' are
        kept and every later 'unknow' row is skipped. None keeps all rows.

    Returns
    -------
    dict
        {file: {all_text: [label, ...]}} in row order.

    Raises
    ------
    AssertionError
        If a file occurs with more than one distinct text.
    """
    instances = {}
    unk_kept = 0
    for _, row in df.iterrows():
        if unk_limit is not None and row['label'] == 'unknow':
            if unk_kept >= unk_limit:
                continue  # quota of 'unknow' rows already used up
            unk_kept += 1
        texts = instances.setdefault(row['file'], {})
        if texts and row['all_text'] not in texts:
            # A file seen before must always come with the same text.
            raise AssertionError("All files should match one and only one text.")
        texts.setdefault(row['all_text'], []).append(row['label'])
    return instances


train_instance_d = _merge_labels_by_file(tmp_train_df, unk_limit=unk_limit)
# Number of 'unknow' training rows that were actually kept.
unk_seen = min(unk_limit, int((tmp_train_df['label'] == 'unknow').sum()))

# The test split keeps every row, including all 'unknow' labels.
test_instance_d = _merge_labels_by_file(tmp_test_df)

print("Train files: ", len(train_instance_d))
print("Test files: ", len(test_instance_d))
# print("Train instances: ", sum([len(v.items()) for k,v in train_instance_d.items()]))
# print("Test instances: ", sum([len(v.items()) for k,v in test_instance_d.items()]))
print(train_instance_d['trouillon16.pdf']['Complex Embeddings for Simple Link Prediction In statistical relational learning, the link prediction problem is key to automatically understand the structure of large knowledge bases. As in previous studies, we propose to solve this problem through latent factorization. However, here we make use of complex valued embeddings. The composition of complex embeddings can handle a large variety of binary relations, among them symmetric and antisymmetric relations. Compared to state-of-the-art models such as Neural Tensor Network and Holographic Embeddings, our approach based on complex embeddings is arguably simpler, as it only uses the Hermitian dot product, the complex counterpart of the standard dot product between real vectors. Our approach is scalable to large datasets as it remains linear in both space and time, while consistently outperforming alternative approaches on standard link prediction benchmarks. 1 In order to evaluate our proposal, we conducted experiments on both synthetic and real datasets The synthetic dataset is based on relations that are either symmetric or antisymmetric, whereas the real datasets comprise different types of relations found in different, standard KBs Dataset We next evaluate the performance of our model on the FB15K and WN18 datasets summarizes the metadata of the two datasets Both datasets contain only positive triples For evaluation, we measure the quality of the ranking of each test triple among all possible subject and object substitutions : r(s , o) and r(s, o ), ∀s , ∀o ∈ E Mean Reciprocal Rank (MRR) and Hits at mare the standard evaluation measures for these datasets and come in two flavours: raw and filtered) We report both filtered and raw MRR, and filtered Hits at 1, 3 and 10 in for the evaluated models Furthermore, we chose TransE, DistMult Table 3. Number of entities, relations, and observed triples in each split for the FB15K and WN18 datasets. |R| Table 2. 
Filtered and Raw Mean Reciprocal Rank (MRR) for the models tested on the FB15K and WN18 datasets. Hits@m metrics are filtered. *Results reported from (Nickel et al., 2016b) for HolE model. 1 FB15K 3 Filter WN18 Hits at Raw MRR 10 Table 4. Filtered Mean Reciprocal Rank (MRR) for the models tested on each relation of the Wordnet dataset (WN18). ComplEx DistMult TransE'])

Train files:  127
Test files:  162
['relation prediction, FB15K-237, H@1', 'relation prediction, FB15K-237, H@10', 'relation prediction, WN18RR, H@10', 'relation prediction, WN18RR, H@1', 'relation prediction, FB15K-237, MRR', 'relation prediction, WN18RR, MRR']


In [4]:
def parse_labels(labels):
    """Split "task, dataset, metric" label strings into three sets.

    Parameters
    ----------
    labels : iterable of str
        Each entry is either the literal "unknow" (used for all three
        fields) or three comma-separated fields: task, dataset, metric.

    Returns
    -------
    tuple of (set, set, set)
        The distinct, whitespace-stripped tasks, datasets and metrics.

    Notes
    -----
    A label that does not split into exactly three fields is reported on
    stdout and skipped, so it contributes nothing to the returned sets.
    """
    tasks = set()
    datasets = set()
    metrics = set()
    for label in labels:
        if label == "unknow":
            task = dataset = metric = label
        else:
            try:
                task, dataset, metric = label.split(',')
            except ValueError:
                print("Unable to parse label: ", label)
                # Skip it: falling through would reuse the previous label's
                # fields, or fail on unbound names for the first label.
                continue
        tasks.add(task.strip())
        datasets.add(dataset.strip())
        metrics.add(metric.strip())
    return tasks, datasets, metrics
        
# Flatten the nested {file: {text: labels}} dictionaries into column
# dictionaries, then turn those into the train/test DataFrames.
def _instances_to_columns(instance_d):
    """Return a dict of column lists (file, all_text, task, dataset, metric)."""
    columns = {'file':[], 'all_text':[], 'task':[], 'dataset':[], 'metric':[]}
    for fname, text_to_labels in instance_d.items():
        assert len(text_to_labels) == 1, "Should only be one entry in this dict."
        for text, label_list in text_to_labels.items():  # exactly one item
            columns['file'].append(fname)
            columns['all_text'].append(text)
            task_set, dataset_set, metric_set = parse_labels(label_list)
            columns['task'].append(task_set)
            columns['dataset'].append(dataset_set)
            columns['metric'].append(metric_set)
    return columns


train_d = _instances_to_columns(train_instance_d)
test_d = _instances_to_columns(test_instance_d)

train_df = pd.DataFrame(data=train_d)
test_df = pd.DataFrame(data=test_d)

In [5]:
# Summary counts per split: number of files and of distinct tasks,
# datasets and metrics (each cell of those columns holds a set of labels).
for split_title, split_df in (("Train:", train_df), ("Test:", test_df)):
    print(split_title)
    print("Files: ", split_df['file'].nunique())
    for heading, column in (("Tasks: ", 'task'), ("Datasets: ", 'dataset'), ("Metrics: ", 'metric')):
        distinct_values = set().union(*split_df[column].tolist())
        print(heading, len(distinct_values))


Train:
Files:  127
Tasks:  19
Datasets:  45
Metrics:  31
Test:
Files:  162
Tasks:  19
Datasets:  45
Metrics:  31


In [6]:
# TF-IDF features. The vectorizer is fitted on the training texts only and
# then applied unchanged to both splits.
train_texts = train_df['all_text'].values
test_texts = test_df['all_text'].values

vectorizer = TfidfVectorizer(sublinear_tf=False, max_df=0.95).fit(train_texts)
x_train = vectorizer.transform(train_texts)
x_test = vectorizer.transform(test_texts)

n_train_samples, n_features = x_train.shape
print("Number of features: ", n_features)
print("Number of (train) samples: ", n_train_samples)

Number of features:  4349
Number of (train) samples:  127


In [7]:
# Multi-label targets for each label type (task, dataset, metric).
le = {}         # label type -> fitted MultiLabelBinarizer
y = {}          # label type -> binary indicator matrix for the train split
y_test = {}     # label type -> binary indicator matrix for the test split
unk_index = {}  # label type -> column position of the 'unknow' class
for label in ['task', 'dataset', 'metric']:
    train_label_sets = train_df[label].tolist()
    test_label_sets = test_df[label].tolist()

    # Fit on the labels of both splits so the two matrices share one
    # class list and column order.
    binarizer = MultiLabelBinarizer()
    binarizer.fit(train_label_sets + test_label_sets)

    le[label] = binarizer
    y[label] = binarizer.transform(train_label_sets)
    y_test[label] = binarizer.transform(test_label_sets)

    is_unknown = binarizer.classes_ == 'unknow'
    unk_index[label] = np.nonzero(is_unknown)[0][0]

In [8]:
# Feature/target shapes, followed by the class list and the 'unknow'
# column position of every label type.
print("X:", x_train.shape, x_test.shape)
print("y:", y['task'].shape, y_test['task'].shape)
for label in ('task', 'dataset', 'metric'):
    encoder = le[label]
    print(encoder.classes_)
    print(unk_index[label])

X: (127, 4349) (162, 4349)
y: (127, 19) (162, 19)
['amr parsing' 'ccg supertagging' 'chunking' 'constituency parsing'
 'dependency parsing' 'language modeling' 'machine translation'
 'named entity recognition' 'part-of-speech tagging' 'question answering'
 'relation prediction' 'relationship extraction' 'sentiment analysis'
 'summarization' 'taxonomy learning' 'text classification' 'unknow'
 'word segmentation' 'word sense disambiguation']
16
['1B Words / Google Billion Word benchmark' 'AG News' 'CCGBank'
 'CNN / Daily Mail (Anonymized version)'
 'CNN / Daily Mail (Non-anonymized version)' 'Chinese Treebank 6'
 'CoNLL 2003 (English)' 'DBpedia' 'DUC 2004 Task 1' 'FB15K-237' 'Gigaword'
 'Hutter Prize' 'IMDb' 'LDC2014T12' 'LDC2015E86' 'MSR'
 'New York Times Corpus' 'Ontonotes v5 (English)' 'PKU' 'Penn Treebank'
 'Quasar' 'SQuAD' 'SST-2' 'SUBJ' 'SearchQA' 'SemEval 2007' 'SemEval 2013'
 'SemEval 2015' 'SemEval 2018' 'SemEval-2010 Task 8'
 'SemEval-2014 Task 4 subtask 2 Aspect Term Polarity'

In [9]:
# Train one multi-output classifier per label type (task, dataset, metric).
# NOTE: no cross-validation or hyperparameter search happens here; a single
# LogisticRegression configuration is fitted on the full training split.
# TODO: random shuffling might not be ideal for splitting this data if
# cross-validation is added later.

# The 'saga' solver shuffles the data, so the seed is fixed to make the
# fitted models (and every score below) reproducible across runs.
RANDOM_SEED = 0
simple_clf = LogisticRegression(solver="saga", multi_class="multinomial",
                                random_state=RANDOM_SEED)

clf = {}  # label type -> fitted MultiOutputClassifier
for label in ['task', 'dataset', 'metric']:
    # MultiOutputClassifier fits one copy of simple_clf per output column.
    clf[label] = MultiOutputClassifier(simple_clf, n_jobs=-1).fit(x_train, y[label])
    print("Trained model for ", label)



Trained model for  task






Trained model for  dataset




Trained model for  metric


In [10]:
# Spot-check the models on the first test document.
first_features = x_test[0]
print("Raw text: ", test_df['all_text'][0])
print("Some features: ")
for label in ('task', 'dataset', 'metric'):
    model = clf[label]
    gold_labels = test_df[label][0]
    gold_binary = y_test[label][0]
    print("Gold label: ", gold_labels, " (binary) ", gold_binary)
    print("Prediction: ", model.predict(first_features))
    print("Prediction prob.: ", model.predict_proba(first_features))


Raw text:  Universal Sentence Encoder We present models for encoding sentences into embedding vectors that specifically target transfer learning to other NLP tasks. The models are efficient and result in accurate performance on diverse transfer tasks. Two variants of the encoding models allow for trade-offs between accuracy and compute resources. For both variants , we investigate and report the relationship between model complexity, resource consumption, the availability of transfer task training data, and task performance. Comparisons are made with base-lines that use word level transfer learning via pretrained word embeddings as well as baselines do not use any transfer learning. We find that transfer learning using sentence embeddings tends to outperform word level transfer. With transfer learning via sentence embeddings, we observe surprisingly good performance with minimal amounts of supervised training data fora transfer task. We obtain encouraging results on Word Embedding Asso

In [11]:
# Predict the whole test split and print the macro-averaged F1 score of
# each label type.
pred = {}       # label type -> predicted binary indicator matrix
pred_prob = {}  # label type -> predict_proba output of the fitted model
for label in ('task', 'dataset', 'metric'):
    model = clf[label]
    pred[label] = model.predict(x_test)
    pred_prob[label] = model.predict_proba(x_test)
    score = f1_score(y_test[label], pred[label], average='macro')
    print(score)

0.03508771929824561


  'precision', 'predicted', average, warn_for)


0.0018518518518518517
0.08143839238498148


In [12]:
def update_pred(pred, pred_prob):
    """Make sure every sample has at least one positive label.

    For each label type, the output with the highest positive-class
    probability is switched on for every sample, in addition to whatever
    was already predicted.

    Parameters
    ----------
    pred : dict
        Maps label type -> binary indicator array of shape
        (n_samples, n_outputs).
    pred_prob : dict
        Maps label type -> list with one probability array per output,
        each of shape (n_samples, 2) or (n_samples, 1).

    Returns
    -------
    dict
        Maps every label type in `pred` to a boolean array shaped like
        pred[label].

    Raises
    ------
    AssertionError
        If a probability array has a column count other than 1 or 2.
    """
    updated = {}
    for label in pred:
        positive_probs = []
        for prob in pred_prob[label]:
            if prob.shape[1] == 1:
                # Single-column output: pad a zero second column so it never
                # wins the argmax. Presumably the estimator only saw the
                # negative class during training -- TODO confirm.
                prob = np.concatenate((prob, np.zeros((prob.shape[0], 1))), axis=1)
            assert prob.shape[1] == 2, "Only expecting 2 columns (at this point) but we have shape: %s" % str(prob.shape)
            positive_probs.append(prob[:, 1])  # probability that the label is true
        # (n_outputs, n_samples) -> index of the most probable output per sample
        best_output = np.argmax(np.stack(positive_probs), axis=0)
        one_hot = np.zeros(pred[label].shape)
        one_hot[np.arange(pred[label].shape[0]), best_output] = 1
        updated[label] = (one_hot + pred[label]) > 0
        print(updated[label][:5])
    return updated

# NOTE: this rebinds the name `update_pred` from the function to the dict it
# returns. After this line the function is no longer callable under that
# name until its `def` is executed again; later code indexes this dict.
update_pred = update_pred(pred, pred_prob)
# Show the updated 'metric' predictions as the cell output.
update_pred['metric']


[[False False False False False False False  True False False False False
  False False False False False False False]
 [False False False False False False False False False False False False
   True False False False False False False]
 [False False False False  True False False False False False False False
  False False False False False False False]
 [False False False False False  True False False False False False False
  False False False False False False False]
 [False False False False False False False  True False False False False
  False False False False False False False]]
[[False False False False False False False False False False False False
  False False False False False False False  True False False False False
  False False False False False False False False False False False False
  False False False False False False False False False]
 [False False False False False False False False False False False False
  False False False False False False False  True F

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [13]:
# Write the predictions to CSV for evaluation: one row per test paper with
# its file name, predicted tasks, datasets and metrics, and a placeholder
# score.
# TODO vectorize or use new dataframe
decoded_columns = [
    pd.Series(le[label].inverse_transform(update_pred[label]))
    for label in ('task', 'dataset', 'metric')
]
csv_df = pd.concat([test_df['file'].copy()] + decoded_columns, axis=1)
csv_df['score'] = '0.0'
csv_df.columns = ['file', 'task', 'dataset', 'metric', 'score']
csv_df.to_csv("/tmp/output.csv", index=False)