# PART 1 (Training)



# 1. Introduction

This challenge is about datasets used in scientific papers. In particular, we want to extract the datasets mentioned in scientific papers using several NLP approaches. In this notebook, we make use of SciBERT, introduced by Beltagy, I., Lo, K., and Cohan, A. in 2019 [1]. The source code of SciBERT can be found [here](https://github.com/allenai/scibert).

Furthermore, we augment the existing data with a specialized corpus for dataset tagging. TDMSci is a corpus consisting of annotated data for tasks, metrics and datasets. Here, B-DATASET and I-DATASET are the NER labels indicating that a word is (part of) a dataset name [2]. The source code (and annotated data) of TDMSci can be found [here](https://github.com/IBM/science-result-extractor).

We try two approaches to tackle the problem:
-  Named Entity Recognition (NER);
-  Masked Language Model (MLM) Classification;

We have created three notebooks, one for **dataset creation** (Part 0), one for **training** (Part I) and one for **testing** (Part II). This notebook covers the training phase of our model(s).


[1] Beltagy, I., Lo, K., & Cohan, A. (2019). SciBERT: A pretrained language model for scientific text. arXiv preprint arXiv:1903.10676.  
[2] Hou, Y., Jochim, C., Gleize, M., Bonin, F., & Ganguly, D. (2021). TDMSci: A Specialized Corpus for Scientific Literature Entity Tagging of Tasks Datasets and Metrics. arXiv preprint arXiv:2101.10273.


# 2. Preparing Notebook

In [1]:
# Install the pre-downloaded wheels shipped in the attached `coleridge-packages` input.
# `--no-index --find-links=...` restricts pip to the local wheel directory for `datasets`.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets 
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl 
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl 
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl 
# NOTE(review): this second `datasets` install has no `--no-index` and no version pin;
# `datasets` was already installed from the local wheels above, so it looks redundant — confirm.
!pip install datasets 

Looking in links: file:///kaggle/input/coleridge-packages/packages/datasets
Processing /kaggle/input/coleridge-packages/packages/datasets/datasets-1.5.0-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/tqdm-4.49.0-py2.py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/huggingface_hub-0.0.7-py3-none-any.whl
Installing collected packages: tqdm, xxhash, huggingface-hub, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.56.2
    Uninstalling tqdm-4.56.2:
      Successfully uninstalled tqdm-4.56.2
Successfully installed datasets-1.5.0 huggingface-hub-0.0.7 tqdm-4.49.0 xxhash-2.0.0
Processing /kaggle/input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Processing /kaggle/input/coleridge-packages/tokenizers-

In [2]:
#Import necessary libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
import re
import os
from os import listdir

from os.path import isfile, join
import re
import json
import time
import datetime
import random
import glob
import importlib
import allennlp
import numpy as np
import pandas as pd
from transformers import *
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import torch
#from datasets import load_dataset


  '"sox" backend is being deprecated. '


In [3]:
# Pretrained checkpoint to fine-tune.
# (A generic baseline that was tried instead: 'bert-base-cased'.)
model_name = 'allenai/scibert_scivocab_cased'

# --- Competition data -------------------------------------------------------
path_abs = '/kaggle/input/coleridgeinitiative-show-us-the-data/'
path_train, path_test = (
    os.path.join(path_abs, folder) for folder in ('train/', 'test/')
)
path_train_metadata, path_sample_submission = (
    os.path.join(path_abs, csv_name)
    for csv_name in ('train.csv', 'sample_submission.csv')
)

# --- TDMSci corpus ----------------------------------------------------------
path_abs_tdmsci = '/kaggle/input/tdmsci/'
path_test_tdmsci, path_train_tdmsci = (
    os.path.join(path_abs_tdmsci, txt_name)
    for txt_name in ('test_500_v2.txt', 'train_1500_v2.txt')
)

# --- NER training file (one JSON record per line) ---------------------------
# Earlier locations that are no longer used:
#   os.path.join(path_abs_tdmsci, 'train_ner.json')
#   '/kaggle/working/train_ner.json'
path_train_nerjson = '../input/fork-of-mlip-group25-scibert-dataset/train_ner.json'


In [4]:
# Walk the JSON-lines training file once: show the first record as a sanity
# check, gather every tag into `labels`, and leave the record count in `acc`.
labels = []
acc = 0  # stays 0 when the file is empty
with open(path_train_nerjson) as f:
    for acc, row in enumerate(f, start=1):
        rowjson = json.loads(row)
        if acc == 1:
            # Only the very first record is echoed.
            print(rowjson)
        labels.extend(rowjson["tags"])

{'tokens': ['ADNI', 'Mueller', 'et', 'al', '2005', 'into', 'the', 'analysis', 'base', 'such', 'that', 'these', 'datasets', 'which', 'are', 'actually', 'stored', 'in', 'their', 'entirety', 'on', 'the', 'N4U', 'Grid', 'infrastructure', 'or', 'other', 'similar', 'repositories', 'become', 'indexed', 'in', 'the', 'analysis', 'base'], 'tags': ['B-DATASET', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'id': '018e6c55-7704-4332-8084-ec53dc457b4b'}


In [5]:
# Record count on the first line, distinct tag values on the second.
print(acc, np.unique(labels), sep="\n")

95394
['B-DATASET' 'I-DATASET' 'O']


# 3. Train the SciBERT model

In [6]:
#Import SciBERT models
#SciBERT_tokenizer = AutoTokenizer.from_pretrained(model_name, sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]') #https://huggingface.co/transformers/model_doc/bert.html  
#SciBERT_modelTC = AutoModelForTokenClassification.from_pretrained(model_name)
#SciBERT_modelMLM = AutoModelForMaskedLM.from_pretrained(model_name)

In [7]:
# Copy my_seqeval.py from the (read-only) input directory into the current
# working directory.
# NOTE(review): presumably the training script loads it from the working
# directory — confirm against kaggle_run_ner.py.
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./


In [8]:
# def test_training_data(path):
#     with open(path) as f:
#         for row in f:
#             jsonrow = json.loads(row)
#             if(len(jsonrow["tokens"]) > 512):
#                 print("TOO LONG AT: {}".format(jsonrow["id"]))
#                 print(jsonrow)


# test_training_data(path_train_nerjson)

In [9]:
def train_scibert_ner(batch_size):
    os.environ["MODEL_NAME"] = f"{model_name}"
    os.environ["TRAIN_FILE"] = f"{path_train_nerjson}"
    os.environ["VALIDATION_FILE"] = f"{path_train_nerjson}"
    os.environ["BATCH_SIZE"] = f"{batch_size}"
    
    acc = 0
    with open(path_train_nerjson) as f:
        print("open ")
        for row in f:
            acc += 1
    
    print("There are {} training samples!".format(acc))
    
    !python ../input/tdmsci/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_NAME" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --num_train_epochs 4 \
    --per_device_train_batch_size "$BATCH_SIZE" \
    --per_device_eval_batch_size "$BATCH_SIZE" \
    --save_steps 15000 \
    --pad_to_max_length \
    --output_dir './output' \
    --report_to 'none' \
    --seed 123 \
    --do_train
!rm -r "./output"

rm: cannot remove './output': No such file or directory


In [10]:
# Train the SciBERT NER model.
# `batch_size` is forwarded as the per-device train and eval batch size.
batch_size = 8
train_scibert_ner(batch_size)

open 
There are 95394 training samples!
2021-06-07 04:31:03.133087: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-0498abf660b7e026/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-0498abf660b7e026/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
[INFO|file_utils.py:1402] 2021-06-07 04:31:11,491 >> https://huggingface.co/allenai/scibert_scivocab_cased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp_260w4nh
Downloading: 100%|██████████████████████████████| 385/385 [

# 4. Test the SciBERT model

In [11]:
# max_length = 64 # max no. words for each sentence.
# overlap = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

# pred_save_path = './pred'
# prediction_file = 'test_predictions.txt'
# test_input_save_path = './input_data'
# path_pretrained_scibert = '/kaggle/working/output'
# train_file = path_train_nerjson
# filename_test = 'test_ner_input.json'

# os.environ["MODEL_PATH"] = f"{path_pretrained_scibert}"
# os.environ["TRAIN_FILE"] = f"{train_file}"
# os.environ["VALIDATION_FILE"] = f"{train_file}"
# os.environ["TEST_FILE"] = f"{test_input_save_path}/{filename_test}"
# os.environ["OUTPUT_DIR"] = f"{pred_save_path}"


# os.environ["TEST_FILE"] = f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}"
# os.environ["OUTPUT_DIR"] = f"{PREDICTION_SAVE_PATH}"

In [12]:
# paper_length = [] # store the number of sentences each paper has
# def prepare_testdata(filename):
#     test_rows = []
#     with open(filename) as f:
#         for row in f:
#             json_row = json.loads(row)
#             test_rows.append(json_row)
#     return test_rows
    
# test_rows = prepare_testdata(train_file) #test data in NER format

In [13]:
# print(len(test_rows))

In [14]:
# os.makedirs(test_input_save_path, exist_ok=True)

In [15]:
# def predict_scibert_ner():
#     !python ../input/tdmsci/run_ner.py \
#     --model_name_or_path "$MODEL_PATH" \
#     --train_file "$TRAIN_FILE" \
#     --validation_file "$VALIDATION_FILE" \
#     --test_file "$TEST_FILE" \
#     --output_dir "$OUTPUT_DIR" \
#     --report_to 'none' \
#     --seed 123 \
#     --do_predict

In [16]:
# bert_outputs = []
# batch_size = 2000#64000


# for batch_begin in range(0, len(test_rows), batch_size):#len(test_rows), batch_size):
#     # write data rows to input file
#     with open(f"{test_input_save_path}/{filename_test}", 'w') as f:
#         for row in test_rows[batch_begin:batch_begin+batch_size]:
#             print(row["id"])
#             json.dump(row, f)
#             f.write('\n')
            
#     with open(f"{test_input_save_path}/{filename_test}", 'r') as f:
#         content = f.read()
        
#     # remove output dir
#     !rm -r "$OUTPUT_DIR"
    
#     # do predict
#     predict_scibert_ner()
    
#     # read predictions
#     with open(f'{pred_save_path}/{prediction_file}') as f:
#         this_preds = f.read().split('\n')[:-1]
#         bert_outputs += [pred.split() for pred in this_preds]

In [17]:
# print(bert_outputs)


# 5. Restore labels

In [18]:
# def jaccard_similarity(s1, s2):
#     l1 = s1.split(" ")
#     l2 = s2.split(" ")    
#     intersection = len(list(set(l1).intersection(l2)))
#     union = (len(l1) + len(l2)) - intersection
#     return float(intersection) / union

# def filter_bert_labels(dataset_labels, ):
#     for labels in dataset_labels:
#         print(labels)
# #print(len(list(test_df_ner['Sentences'])[0]))
# #filter_bert_labels(bert_outputs)
# #amountSentences = [len(list(test_df['Sentences'])[0]) ]
# #sentences = list(test_df_ner["Sentences"])[0]
# # for i in range(4,5):#(0,len(sentences)):
# test_sentences = [row['tokens'] for row in test_rows]
# test_ids = list(test_df_ner["Id"])
# labels = []
# for length_i in range(len(paper_length)):
#     paper_id = test_ids[length_i]
#     print(paper_id)
#     for sentence, pred in zip(test_sentences[:paper_length[length_i]], bert_outputs[:paper_length[length_i]]):
#         dataset = ""
#         for word, tag in zip(sentence, pred):
#             #print(word)
#             #print(tags)
#             x = 0
#             if(tag == 'B-DATASET'):
#                 dataset += tag + ' '
#             elif(tag == "I-DATASET" and dataset != ""):
#                 dataset += tag + ' '
#             elif(tag != "B-DATASET" and tag != "I-DATASET" and dataset != ""):
#                 labels.append(dataset)
#                 dataset = "" 
                                
#     if(dataset == "" and len(labels) < length_i+1):
#         labels.append("")

#     del test_sentences[:length_i], bert_outputs[:length_i]


# #        print(len(tokens))
# #         print(len(pred))
#     #words = [tup[0] for tup in sentences[i]]
    
# #         print(sentence)
# #         print(pred)
# # for labels in bert_dataset_labels:
# #     filtered = []
    
# #     for label in sorted(labels, key=len):
# #         label = clean_text(label)
# #         if len(filtered) == 0 or all(jaccard_similarity(label, got_label) < 0.75 for got_label in filtered):
# #             filtered.append(label)
    
# #     filtered_bert_labels.append('|'.join(filtered))

# 6. Generate submission file

In [19]:
# def generate_submission_file(test_ids, test_predictions):
#     submission_dict = {"Id": test_ids, "PredictionString": test_predictions}
#     submission_df = pd.DataFrame.from_dict(submission_dict)
#     sample_submission.to_csv(f'submission.csv', index=False)

# #Import extra training data https://github.com/IBM/science-result-extractor/tree/master/data
# # import os
# # import re 

# generate_submission_file(test_ids, labels)