In [2]:
import mteb
from sentence_transformers import SentenceTransformer
import os
import sys
sys.path.append('/root/MagicEmbed')
import subprocess
from datasets import load_dataset
from magicembed.utils import load_vocab_token_magic_scores
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [3]:
def show_dataset():
    # tasks = mteb.get_tasks(task_types=["PairClassification"],languages=["eng"])
    # tasks = mteb.get_tasks(task_types=["STS"],languages=["eng"])
    # tasks = mteb.get_tasks(languages=["eng"])
    # tasks = mteb.get_benchmark("MTEB(eng, classic)")
    tasks = mteb.get_tasks(languages=["eng"],
                           tasks=[
                "ArguAna",
                "NFCorpus",
                "SciFact",
                "StackOverflowDupQuestions",
                "SciDocsRR",
                "BiorxivClusteringS2S",
                "MedrxivClusteringS2S",
                "TwentyNewsgroupsClustering",
                "SprintDuplicateQuestions",
                "Banking77Classification",
                "EmotionClassification",
                "MassiveIntentClassification",
                "STS17",
                "SICK-R",
                "STSBenchmark",
                "SummEval"
            ]
                          )
    dataset_list = []
    for task in tasks:
        if task.metadata.name not in dataset_list:
            dataset_list.append(task.metadata.name)
            desc = 'name: {}\t\thf_name: {}\t\ttype: {}\t\tcategory: {}'.format(
                task.metadata.name, task.metadata.dataset['path'],
                task.metadata.type, task.metadata.category,
            )
            print(desc)
    print(len(dataset_list))
    print(dataset_list)
# show_dataset()

In [None]:
data_path = '/root/MagicEmbed/task_assess/data'

def download_dataset():
    # tasks = mteb.get_tasks(task_types=["PairClassification"],languages=["eng"])
    # tasks = mteb.get_tasks(task_types=["STS"],languages=["eng"])
    tasks = mteb.get_tasks(languages=["eng"],
                           tasks=[
                "ArguAna",
                "NFCorpus",
                "SciFact",
                "StackOverflowDupQuestions",
                "SciDocsRR",
                "BiorxivClusteringS2S",
                "MedrxivClusteringS2S",
                "TwentyNewsgroupsClustering",
                "SprintDuplicateQuestions",
                "Banking77Classification",
                "EmotionClassification",
                "MassiveIntentClassification",
                "STS17",
                "SICK-R",
                "STSBenchmark",
                "SummEval"
            ]
            )
    err_list = []
    for task in tasks:
        # task.load_data()
        # https://huggingface.co/datasets/
        task_name = task.metadata.dataset['path']
        print(task_name)
        cmd = ['huggingface-cli', 'download', '--repo-type', 'dataset', '--resume-download',
               '--local-dir-use-symlinks', 'False', task_name, '--local-dir', os.path.join(data_path, task_name),
               ]
        try:
            result = subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as e:
            err_list.append(task_name)
            print("{} is error".format(task_name))

    if err_list:
        print('download failed: \n', '\n'.join(err_list))
    else:
        print('download success.')
# download_dataset()

In [5]:
# Define the sentence-transformers model name
# model_name = "average_word_embeddings_komninos"
# or directly from huggingface:

# tasks = mteb.get_tasks(tasks=["Banking77Classification"])
# evaluation = mteb.MTEB(tasks=tasks)
# results = evaluation.run(model, output_folder=f"results/{model_name}")

In [6]:
# os.environ['HF_DATASETS_CACHE'] = data_path
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# model_path = '/root/autodl-fs/hf-checkpoints/sentence-transformers/sentence-t5-base'
# model_path = '/root/autodl-fs/hf-checkpoints/BAAI/bge-base-en-v1.5'
model_path = '/root/autodl-fs/hf-checkpoints/Alibaba-NLP/gte-base-en-v1.5'
# model_path = '/root/autodl-fs/hf-checkpoints/hkunlp/instructor-base'
# model_path = '/root/autodl-fs/hf-checkpoints/princeton-nlp/sup-simcse-bert-base-uncased'
# model_path = '/root/autodl-fs/hf-checkpoints/WhereIsAI/UAE-Large-V1'
# model_path = '/root/autodl-fs/hf-checkpoints/intfloat/e5-base'
# model_path = '/root/autodl-fs/hf-checkpoints/sentence-transformers/all-mpnet-base-v2'
# model_path = '/root/autodl-fs/hf-checkpoints/sentence-transformers/sentence-t5-base'
model_name = os.path.basename(model_path)
model = SentenceTransformer(model_path,trust_remote_code=True)

In [7]:
import pandas as pd
def get_sticky_token_list(model_name):
    sticky_tokens_of_all_models_df = pd.read_csv("../results/final_all_models_sticky_tokens.csv")
    sticky_token_column = sticky_tokens_of_all_models_df[sticky_tokens_of_all_models_df['model'] == model_name]
    sticky_token_list = sticky_token_column['raw_vocab'].to_list()
    return sticky_token_list

In [8]:
sticky_tokens = get_sticky_token_list(model_name)
sticky_tokens

['[PAD]',
 '[CLS]',
 ',',
 '>',
 '~',
 'ᄌ',
 '⁴',
 '₃',
 '₆',
 '₍',
 '₎',
 'ℓ',
 '⇌',
 '∞',
 '∩',
 '⊕',
 '■',
 '⺩',
 '立',
 '龸']

In [9]:
def get_benign_token_list(model_name, sticky_tokens):
    vocab_token_magic_scores = load_vocab_token_magic_scores(model_name)
    vocab_token_magic_scores_df = pd.DataFrame(vocab_token_magic_scores).transpose()
    # vocab_token_magic_scores_df的'main_metric'列升序排列，取前len(sticky_tokens)*5个
    vocab_token_magic_scores_df = vocab_token_magic_scores_df.sort_values(by='main_metric', ascending=True).head(len(sticky_tokens)*2)
    # 从中随机选择len(sticky_tokens)个
    selected_tokens_df = vocab_token_magic_scores_df.sample(n=len(sticky_tokens), random_state=42)
    return selected_tokens_df['raw_vocab'].to_list()

In [10]:
benign_token_list = get_benign_token_list(model_name, sticky_tokens)
benign_token_list

['commentator',
 'zev',
 'stefan',
 'clinton',
 'altitude',
 'gdansk',
 'smithsonian',
 'media',
 'chemicals',
 'pine',
 'anchor',
 'foot',
 'schultz',
 'calculus',
 'jensen',
 'catfish',
 'warn',
 'nagar',
 'parsons',
 'graphic']

In [11]:
# from magicembed.utils import load_vocab_verifications
# token_infos_with_metrics_and_part_verifications = load_vocab_verifications(model_name)
# import pandas as pd
# import pygwalker as pyg
# df_token_infos_with_metrics_and_part_verifications = pd.DataFrame(token_infos_with_metrics_and_part_verifications).T
# df_strong_rejected_sorted = df_token_infos_with_metrics_and_part_verifications[df_token_infos_with_metrics_and_part_verifications['magic'] == 'strong_verified'].sort_values(by='max_prob', ascending=False)
# df_strong_rejected_sorted.head(5)
# token_list=df_strong_rejected_sorted['decoded'][:20].tolist()
# token_list

In [12]:
# tasks = mteb.get_tasks(task_types=["PairClassification"],languages=["eng"],tasks=['SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus'])
# tasks = mteb.get_tasks(languages=["eng"],tasks=['BIOSSES', 'SICK-R', 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus'])
tasks = mteb.get_tasks(languages=["eng"],
                           tasks=[
                "ArguAna",
                "NFCorpus",
                "SciFact",
                "StackOverflowDupQuestions",
                "SciDocsRR",
                "BiorxivClusteringS2S",
                "MedrxivClusteringS2S",
                "TwentyNewsgroupsClustering",
                "SprintDuplicateQuestions",
                "Banking77Classification",
                "EmotionClassification",
                "MassiveIntentClassification",
                "STS16",
                "SICK-R",
                "STSBenchmark",
                "SummEval"
            ]
            )

# for task in tasks:
#     # print(task.metadata.eval_splits)
#     print(os.path.join(data_path, task.metadata.dataset['path']))
#     # print(task.metadata.dataset['revision'])
#     # print(task.metadata.dataset.get('trust_remote_code', False))
#     dataset = load_dataset(
#             path=os.path.join(data_path, task.metadata.dataset['path']),
#             revision=task.metadata.dataset['revision'],
#             trust_remote_code=task.metadata.dataset.get('trust_remote_code', None),
#     )
#     # print(task.metadata_dict["dataset"])
#     task.dataset = dataset
#     task.data_loaded = True

In [13]:
# evaluation[2].dataset[0]['sent2']

In [14]:
tasks

MTEBTasks(ArguAna(name='ArguAna', languages=['eng']), NFCorpus(name='NFCorpus', languages=['eng']), SciFact(name='SciFact', languages=['eng']), StackOverflowDupQuestions(name='StackOverflowDupQuestions', languages=['eng']), SciDocsReranking(name='SciDocsRR', languages=['eng']), BiorxivClusteringS2S(name='BiorxivClusteringS2S', languages=['eng']), MedrxivClusteringS2S(name='MedrxivClusteringS2S', languages=['eng']), TwentyNewsgroupsClustering(name='TwentyNewsgroupsClustering', languages=['eng']), SprintDuplicateQuestionsPC(name='SprintDuplicateQuestions', languages=['eng']), Banking77Classification(name='Banking77Classification', languages=['eng']), EmotionClassification(name='EmotionClassification', languages=['eng']), MassiveIntentClassification(name='MassiveIntentClassification', languages=['eng']), STS16STS(name='STS16', languages=['eng']), SickrSTS(name='SICK-R', languages=['eng']), STSBenchmarkSTS(name='STSBenchmark', languages=['eng']), SummEvalSummarization(name='SummEval', lang

In [15]:
tasks[0].data_loaded

False

In [16]:
# for split in tasks[1].dataset:
#     tasks[1].dataset[split] = tasks[1].dataset[split].map(lambda example: {"sentence2": [sentence + " hi" for sentence in example["sentence2"]]})

In [17]:
# tasks[0].dataset['test']['sentence2']

In [18]:
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder=f"/root/MagicEmbed/task_assess/results/{model_name}_with_benign_tokens", verbosity=1)
# print(results)

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Batches:   0%|          | 0/24 [00:00<?, ?it/s]



Batches:   0%|          | 0/697 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/918 [00:00<?, ?it/s]

Clustering:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Clustering:  10%|█         | 1/10 [00:03<00:33,  3.75s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Clustering:  20%|██        | 2/10 [00:07<00:29,  3.72s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Clustering:  30%|███       | 3/10 [00:11<00:26,  3.78s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Clustering:  40%|████      | 4/10 [00:15<00:22,  3.80s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Clustering:  50%|█████     | 5/10 [00:18<00:18,  3.76s/it]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering:  60%|██████    | 6/10 [00:20<00:12,  3.15s/it]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering:  70%|███████   | 7/10 [00:22<00:08,  2.75s/it]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering:  80%|████████  | 8/10 [00:24<00:04,  2.49s/it]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering:  90%|█████████ | 9/10 [00:26<00:02,  2.33s/it]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering: 100%|██████████| 10/10 [00:28<00:00,  2.86s/it]
Clustering:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering:  10%|█         | 1/10 [00:02<00:19,  2.18s/it]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering:  20%|██        | 2/10 [00:04<00:17,  2.18s/it]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering:  30%|███       | 3/10 [00:06<00:15,  2.20s/it]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering:  40%|████      | 4/10 [00:08<00:13,  2.19s/it]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Clustering:  50%|█████     | 5/10 [00:10<00:10,  2.19s/it]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Clustering:  60%|██████    | 6/10 [00:12<00:07,  1.86s/it]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Clustering:  70%|███████   | 7/10 [00:13<00:04,  1.64s/it]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Clustering:  80%|████████  | 8/10 [00:14<00:02,  1.50s/it]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Clustering:  90%|█████████ | 9/10 [00:15<00:01,  1.39s/it]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Clustering: 100%|██████████| 10/10 [00:16<00:00,  1.69s/it]
Clustering:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Clustering:  10%|█         | 1/10 [00:00<00:02,  3.58it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Clustering:  20%|██        | 2/10 [00:00<00:03,  2.33it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Clustering:  30%|███       | 3/10 [00:01<00:03,  1.77it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Clustering:  40%|████      | 4/10 [00:02<00:04,  1.36it/s]

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Clustering:  50%|█████     | 5/10 [00:03<00:04,  1.14it/s]

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

Clustering:  60%|██████    | 6/10 [00:05<00:04,  1.08s/it]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Clustering:  70%|███████   | 7/10 [00:06<00:03,  1.33s/it]

Batches:   0%|          | 0/69 [00:00<?, ?it/s]

Clustering:  80%|████████  | 8/10 [00:08<00:02,  1.50s/it]

Batches:   0%|          | 0/77 [00:00<?, ?it/s]

Clustering:  90%|█████████ | 9/10 [00:10<00:01,  1.65s/it]

Batches:   0%|          | 0/86 [00:00<?, ?it/s]

Clustering: 100%|██████████| 10/10 [00:13<00:00,  1.31s/it]


Batches:   0%|          | 0/684 [00:00<?, ?it/s]



Batches:   0%|          | 0/684 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Scoring: 100%|██████████| 100/100 [00:01<00:00, 99.88it/s]


In [19]:
tasks

MTEBTasks(ArguAna(name='ArguAna', languages=['eng']), NFCorpus(name='NFCorpus', languages=['eng']), SciFact(name='SciFact', languages=['eng']), StackOverflowDupQuestions(name='StackOverflowDupQuestions', languages=['eng']), SciDocsReranking(name='SciDocsRR', languages=['eng']), BiorxivClusteringS2S(name='BiorxivClusteringS2S', languages=['eng']), MedrxivClusteringS2S(name='MedrxivClusteringS2S', languages=['eng']), TwentyNewsgroupsClustering(name='TwentyNewsgroupsClustering', languages=['eng']), SprintDuplicateQuestionsPC(name='SprintDuplicateQuestions', languages=['eng']), Banking77Classification(name='Banking77Classification', languages=['eng']), EmotionClassification(name='EmotionClassification', languages=['eng']), MassiveIntentClassification(name='MassiveIntentClassification', languages=['eng']), STS16STS(name='STS16', languages=['eng']), SickrSTS(name='SICK-R', languages=['eng']), STSBenchmarkSTS(name='STSBenchmark', languages=['eng']), SummEvalSummarization(name='SummEval', lang

In [20]:
#retrieval 0-2
tasks[2].corpus['test']

{'4983': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging. Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion co

In [21]:
#reranking 3-4
tasks[4].dataset['test']['negative'][1]

['nagarnagarAnalysis and Design of Average Current Mode Control Using a Describing-Function-Based Equivalent Circuit Model',
 'A Literature Review on Kidney Disease Prediction using Data Mining Classification Techniquegdanskgdansk',
 'Shape-aware Instance Segmentationschultz',
 'nagarResearch on continuous auditing: A bibliometric analysis',
 'anchoranchorWhich Hotel attributes Matter ? A review of previous and a framework for future research',
 'mediaChain Replication for Supporting High Throughput and Availability',
 'Real-Time 3D Reconstruction and 6-DoF Tracking with an Event Cameraanchor',
 'chemicalschemicalsBCSAT : A Benchmark Corpus for Sentiment Analysis in Telugu Using Word-level Annotations',
 'Dispositional Factors in Internet Use: Personality Versus Cognitive Stylesmithsonian',
 'Protein function in precision medicine: deep understanding with machine learning.smithsonian',
 'A framework based on RSA and AES encryption algorithms for cloud computing servicesschultzschultz',

In [22]:
#classification
tasks[10].dataset['test']['text']

['im feeling rather rotten so im not very ambitious right nowfootfoot',
 'clintonim updating my blog because i feel shitty',
 'i never make her separate from me because i don t ever want her to feel like i m ashamed with hernagarnagarnagar',
 'i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrivedmediamediamedia',
 'i was feeling a little vain when i did this onezevzev',
 'i cant walk into a shop anywhere where i do not feel uncomfortableschultzschultz',
 'anchoranchori felt anger when at the end of a telephone call',
 'i explain why i clung to a relationship with a boy who was in many ways immature and uncommitted despite the excitement i should have been feeling for getting accepted into the masters program at the university of virginiacommentatorcommentatorcommentatorcommentator',
 'i like to have the same breathless feeling as a reader eager to see what will happen nextsmithsoniansmithsonian',
 'i jest i feel grumpy tired a

In [23]:
#clustering
tasks[7].dataset['test']['sentences']

[['clintonMotorola MC143150 and MC143120',
  'chemicalsWindows 3.1(new) for sale $35',
  'mediaGospel Dating',
  'clintonHelp with ultra-long timing',
  'Cirrus Logic 5426 Graph Cardparsons',
  'zevWhat is Zero dB????',
  'stefanWings will win',
  'mediamediaTo be exact, 2.5 million Muslims were exterminated by the Armenians.',
  'nagarMorality? (was Re: <Political Atheists?)',
  'Eco-Freaks forcing Space Mining.nagar',
  'Is MSG sensitivity superstition?media',
  'SGI sales practices (Was: Crimson (Was: Kubota Announcement?))parsons',
  'commentatorBATF/FBI revenge',
  'Braves Pitching UpdateDIRcalculus',
  'chemicalsTravesty at the Joe Louis',
  'W32/TSENG card & other VLB cards... (please)smithsonian',
  'clintonMonophysites and Mike Walker',
  'schultzJeep Grand vs. Toyota 4-Runner',
  'Looking For David E. Smythfoot',
  'Plus minus stat...graphic',
  'Blues into playoffs, beat TB 6-5zev',
  'nagarMitsumi CD ROM drivers fix for QEMM',
  'schultzNon-word password generator',
  'Swim

In [24]:
#sts, pairclassification
tasks[8].dataset['test']['sentence2']

[['How do I know if my Franklin U772 USB Modem has a weak signal ?pinepine',
  'Turn GPS on or off - Kyocera DuraMaxnagar',
  'gdanskMake a conference call - Google Nexus 5',
  'Make a conference call - Google Nexus 5graphic',
  'graphicTransfer media files with your Windows computer - Samsung Trender',
  'Transfer media files with your Windows computer - Samsung Trendergraphic',
  'calculusWhy is my HTC One freezing or being unresponsive ?',
  'anchorWhy is my HTC One freezing or being unresponsive ?',
  'Troubleshoot shortened battery life on your R850 Mobile Hotspotzev',
  'Program your Samsung Galaxy S IIIclinton',
  'Activate your device - LG Tribute DUO Prepaidgdansk',
  'Troubleshoot issues related to low internal memory on your HTC Desire 816chemicalschemicals',
  'Change the backup and sync settings - Samsung Galaxy Core Prime Prepaidzevzev',
  'chemicalschemicalsChange the backup and sync settings - Samsung Galaxy Core Prime Prepaid',
  'Change the backup and sync settings - 

In [25]:
#reranking
tasks[4].dataset['test']['negative'][1]

['nagarnagarAnalysis and Design of Average Current Mode Control Using a Describing-Function-Based Equivalent Circuit Model',
 'A Literature Review on Kidney Disease Prediction using Data Mining Classification Techniquegdanskgdansk',
 'Shape-aware Instance Segmentationschultz',
 'nagarResearch on continuous auditing: A bibliometric analysis',
 'anchoranchorWhich Hotel attributes Matter ? A review of previous and a framework for future research',
 'mediaChain Replication for Supporting High Throughput and Availability',
 'Real-Time 3D Reconstruction and 6-DoF Tracking with an Event Cameraanchor',
 'chemicalschemicalsBCSAT : A Benchmark Corpus for Sentiment Analysis in Telugu Using Word-level Annotations',
 'Dispositional Factors in Internet Use: Personality Versus Cognitive Stylesmithsonian',
 'Protein function in precision medicine: deep understanding with machine learning.smithsonian',
 'A framework based on RSA and AES encryption algorithms for cloud computing servicesschultzschultz',

In [26]:
#retrieval
tasks[0].corpus['test']

{'test-environment-aeghhgwpe-pro02b': "clintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonclintonanimals environment general health health general weight philosophy ethics You don’t have to be vegetarian to be green. Many special environments have been created by livestock farming – for example chalk down land in England and mountain pastures in many countries. Ending livestock farming would see these areas go back to woodland with a loss of many unique plants and animals. Growing crops can also be very bad for the planet, with fertilisers and pesticides polluting rivers, lakes and seas. Most tropical forests are now cut down for timber, or to allow oil palm trees to be grown in plantations, not to create space for meat production.  British farmer and former editor Simon Farrell also states: “Many vegans and vegetar

In [27]:
#sts
tasks[14].dataset['test']['sentence2']

['clintonA girl is brushing her hair.',
 'chemicalsA group of boys are playing soccer on the beach.',
 "mediaA woman measures another woman's ankle.",
 'clintonA man is slicing a cucumber.',
 'A man is playing a keyboard.parsons',
 'zevA woman is cutting tofu.',
 'stefanA man is riding a bicycle.',
 'mediaA man is playing the guitar.',
 'nagarA lady is playing the guitar.',
 'A man is playing a trumpet.nagar',
 'A man is playing a trumpet.media',
 'A man cuts an onion.parsons',
 'commentatorA man is talking.',
 'A man is cutting up a fish.calculus',
 'chemicalsA man is slicing a bun.',
 'A man is playing a keyboard.smithsonian',
 'clintonA panda slides down a slide.',
 'schultzA man is playing a guitar.',
 'A man slaps a woman.foot',
 'A man is riding a horse.graphic',
 'A woman is cutting an onion.zev',
 'nagarThe woman is slicing herbs.',
 'schultzTwo zebras are playing in a field.',
 'A man is slicing some potato.nagar',
 'A woman is slicing a pumpkin.graphic',
 'parsonsA man and wo

In [28]:
#summarization
tasks[0].dataset['test']['machine_summaries']

TypeError: 'NoneType' object is not subscriptable