# Create NLP Leaderboards #

We want to take NLP papers and assign them to a leaderboard by task (T), dataset (D), and evaluation metric (M).

In [1]:
# imports
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score, confusion_matrix

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  from numpy.core.umath_tests import inner1d


In [2]:
# load train/test splits
train_fn = "../../data/exp/train.tsv"
test_fn = "../../data/exp/test.tsv"

train_df = pd.read_csv(train_fn, sep='\t', header=None, names=['file', 'title', 'abstract', 'exp_data', 'table', 
                                                               'task', 'dataset', 'metric'])
test_df = pd.read_csv(test_fn, sep='\t', header=None, names=['file', 'title', 'abstract', 'exp_data', 'table', 
                        'task', 'dataset', 'metric'])


#print(train_df[:10])


In [3]:
# print stats on train and test sets?
print("Train:")
print("Files: ", train_df['file'].nunique())
print("Tasks: ", train_df['task'].nunique())
print("Datasets: ", train_df['dataset'].nunique())
print("Metrics: ", train_df['metric'].nunique())
print("Test:")
print("Files: ", test_df['file'].nunique())
print("Tasks: ", test_df['task'].nunique())
print("Datasets: ", test_df['dataset'].nunique())
print("Metrics: ", test_df['metric'].nunique())
print("Note, this is not splitting on '#' so there are fewer tasks, datasets, and metrics.")

Train:
Files:  164
Tasks:  44
Datasets:  95
Metrics:  54
Test:
Files:  169
Tasks:  43
Datasets:  91
Metrics:  53
Note, this is not splitting on '#' so there are fewer tasks, datasets, and metrics.


In [4]:
# combine text fields into one
train_df['all_text'] = train_df['title'].map(str) + train_df['abstract'].map(str) + train_df['exp_data'].map(str) + train_df['table'].map(str)
test_df['all_text'] = test_df['title'].map(str) + test_df['abstract'].map(str) + test_df['exp_data'].map(str) + test_df['table'].map(str)
# drop old text fields?

In [5]:
# apply tfidfvectorizer to get features
vectorizer = TfidfVectorizer(sublinear_tf=False, max_df=0.95)
# fit-transform on all data
#all_data = np.concatenate([train_df['all_text'].values, test_df['all_text'].values])
#all_data = train_df['all_text'].values
#train_len = len(train_df['all_text'])
#x_train = all_x[:train_len]
#x_test = all_x[train_len:]

# fit on train
vectorizer = vectorizer.fit(train_df['all_text'].values)
x_train = vectorizer.transform(train_df['all_text'].values)
x_test = vectorizer.transform(test_df['all_text'].values)
print("Number of features: ", x_train.shape[1])
print("Number of (train) samples: ", x_train.shape[0])

Number of features:  6962
Number of (train) samples:  164


In [6]:
# get instance labels: multilabel or not?
# task, data, and metrics label
le = {}  # dict for label encoders
y = {}  # dict for labels
y_test = {}
for label in ['task', 'dataset', 'metric']:
    # old single label case
#     le[label] = LabelEncoder().fit(train_df[label].tolist() + test_df[label].tolist())
#     y[label] = le[label].transform(train_df[label].tolist())
#     y_test[label] = le[label].transform(test_df[label].tolist())
    # multilabel
    le[label] = MultiLabelBinarizer().fit([
        set(l.split('#')) for l in train_df[label].tolist()+test_df[label].tolist()])
    y[label] = le[label].transform([set(l.split('#')) for l in train_df[label].tolist()])
    y_test[label] = le[label].transform([set(l.split('#')) for l in test_df[label].tolist()])


In [7]:
le['metric'].classes_

array(['Acc ex', 'Accuracy', 'Accuracy on Average', 'Accuracy on Books',
       'Accuracy on CNN', 'Accuracy on DVD', 'Accuracy on Daily Mail',
       'Accuracy on Dev', 'Accuracy on Electronics',
       'Accuracy on Kitchen', 'Accuracy on RACE', 'Accuracy on RACE-h',
       'Accuracy on RACE-m', 'Accuracy on Test', 'Aspect (F1)',
       'Avg Accuracy', 'Avg F1', 'BLEU', 'BLEU-1', 'BLEU-4',
       'Bit per Character (BPC)', 'CR', 'EM', 'EM (Quasar-T)', 'Error',
       'F0.5', 'F1', 'F1 (Quasar-T)', 'F1 (surface form)', 'F1 on Full',
       'F1 on Newswire', 'F1-score', 'H@1', 'H@10', 'Joint goal Accuracy',
       'LAS', 'Laptop (acc)', 'MAP', 'METEOR', 'MRR', 'Matched Accuracy',
       'Micro-Precision', 'Mismatched Accuracy', 'N-gram F1',
       'Number of params', 'P@10%', 'P@30%', 'P@5', 'POS',
       'Pearson Correlation', 'Precision', 'Query Split',
       'Question Split', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'R_10@1',
       'R_2@1', 'Recall', 'Request Accuracy', 'Restaurant (acc)',

In [8]:
# cross validation to find good hyperparameters
# random shuffling might not be ideal for splitting this data, but I don't think we have another option

simple_clf = RandomForestClassifier()
# simple_clf = LogisticRegressionCV()

clf = {}  # dict for classifier (by label)
for label in ['task', 'dataset', 'metric']:
    clf[label] = MultiOutputClassifier(simple_clf, n_jobs=-1).fit(x_train, y[label])
#     clf[label] = RandomForestClassifier(n_jobs=-1).fit(x_train, y[label])
    print("Trained model for ", label)
#     print(clf[label].get_params())
#     print(clf[label].C_)

Trained model for  task
Trained model for  dataset
Trained model for  metric


In [9]:
pred = {}
pred_prob = {}
for label in ['task', 'dataset', 'metric']:
    pred[label] = clf[label].predict(x_test)
    pred_prob[label] = clf[label].predict_proba(x_test)
    score = f1_score(y_test[label], pred[label], average='macro')
    print(score)
#     conf_mat = confusion_matrix(metric_y, train_pred)
#     np.savetxt("/tmp/foo.csv", conf_mat, delimiter=",")

0.07160987160987162


  'precision', 'predicted', average, warn_for)


0.007579787234042554


  'recall', 'true', average, warn_for)


0.05126828490467829


In [10]:
def update_pred(pred, pred_prob):
    update_pred = {}
    for label in ['task', 'dataset', 'metric']:
        mask = np.sum(pred[label], axis=1) > 0
        pred_prob_np = None
        for i in pred_prob[label]:  # use for-loop to make sure all have the right shape
            if i.shape[1] == 1:
                i = np.concatenate((i,np.zeros((i.shape[0],1))), axis=1)
            assert i.shape[1] == 2, "Only expecting 2 columns (at this point) but we have shape: %s" % str(i.shape)
            if pred_prob_np is None:
                pred_prob_np = i
            else:
                pred_prob_np = np.concatenate((pred_prob_np, i))
        pred_prob_np = pred_prob_np.reshape((len(pred_prob[label]),169,2))
        am = np.argmax(pred_prob_np, axis=0)[:,1]  # need max of second column, prob label is true
        oh = np.zeros(pred[label].shape)
        oh[np.arange(pred[label].shape[0]), am] = 1
        print(((oh+pred[label])>0)[:5])
        update_pred[label] = (oh + pred[label])>0
    return update_pred

update_pred = update_pred(pred, pred_prob)
update_pred['metric']


[[False False False False False False False False False False False False
  False False False False  True False False False False False False False
  False False False False False False False False False False False]
 [False False False False False False False False False False False False
  False False False False False False  True False False False False False
  False False False False False False False False False False False]
 [False False False False False False False False False False False False
  False False False  True False False False False False False False False
  False False False False False False False False False False False]
 [False False False False False False False False False False False False
  False False False False False False  True False False False False False
  False False False False False False False False False False False]
 [False False False False False False False False False False False False
  False False False False False False False False False Fa

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False,  True, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [11]:
# dump to CSV for evaluation
# TODO vectorize or use new dataframe
# want to write paper name, task, dataset, evaluation metric, score
#for x in test_df['file'].tolist():
#    print 
csv_df = test_df['file'].copy()
for label in ['task', 'dataset', 'metric']:
    csv_df = pd.concat((csv_df, pd.Series(le[label].inverse_transform(update_pred[label]))), axis=1)
csv_df['score'] = '0.0'
csv_df.columns = ['file', 'task', 'dataset', 'metric', 'score']
csv_df.to_csv("/tmp/output.csv", index=False)