In [1]:
 # dependencies
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import transformers
from sklearn import metrics
import matplotlib.pyplot as plt
from math import ceil
import re
import wandb
from transformers import AutoModelForSequenceClassification, AutoModel

In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda:0'
else:
    torch_device = 'cpu'

In [3]:
# Load a BertTokenizer from the "allenai/scibert_scivocab_cased" identifier.
# NOTE(review): the classifier is loaded separately from "Models/SciBertFull";
# confirm that checkpoint was trained with this same (cased) vocabulary.
tokenizer = transformers.BertTokenizer.from_pretrained("allenai/scibert_scivocab_cased")

In [4]:
# Load the sequence-classification model from "Models/SciBertFull".
# NOTE(review): presumably a local checkpoint directory relative to the
# notebook's working directory -- confirm before running elsewhere.
model = AutoModelForSequenceClassification.from_pretrained("Models/SciBertFull")

In [5]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per requested item.

    Args:
        dataframe: pandas DataFrame exposing a ``text`` column (the sentences)
            and a ``label`` column (numeric targets).
        tokenizer: object providing a Hugging Face style ``encode_plus`` method.
        max_len: every sequence is truncated / padded to this many tokens.

    Each item is a dict of tensors: ``ids``, ``mask`` and ``token_type_ids``
    (all ``torch.long``) plus ``targets`` (``torch.float``).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Samplers hand out positions 0..len-1, so rows are looked up by
        # position (.iloc).  Label-based ``series[index]`` raises KeyError (or
        # silently returns another row) as soon as the frame's index is not a
        # clean 0..n-1 range, e.g. after filtering without reset_index().
        inputs = self.tokenizer.encode_plus(
            self.text.iloc[index],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [6]:
# Text file that is read line by line as the sentence corpus for this evaluation.
# NOTE(review): absolute, machine-specific path -- adjust before running on
# another host.  The name suggests non-scientific sentences; confirm.
NON_SCI_PATH = "/media/nvme3n1/proj_scisen/datasets/ScifiSen9.txt"

In [7]:
import random

def insert_token(sentence):
    """Return ``sentence`` with one placeholder token inserted at a random word boundary.

    The token is ``"<equation>"`` or ``"<reference>"`` with equal probability.
    Words are delimited by single spaces, exactly as ``str.split(" ")`` sees them.

    A trailing line terminator (``"\\n"`` / ``"\\r\\n"``) is set aside before the
    insertion and re-attached afterwards, so the token can never end up *behind*
    the newline of a line read from a file.

    Args:
        sentence: the text to augment; may end with a line terminator.

    Returns:
        The augmented sentence, with the original line terminator preserved.
    """
    # Same two draws, in the same order, as the previous implementation:
    # first the token choice (even -> equation, odd -> reference), then the slot.
    token = "<equation>" if random.randint(0, 99) % 2 == 0 else "<reference>"

    body = sentence.rstrip("\r\n")
    line_ending = sentence[len(body):]

    # "".split(" ") yields [""], which would glue a stray space onto the token.
    words = body.split(" ") if body else []
    words.insert(random.randint(0, len(words)), token)
    return " ".join(words) + line_ending

In [8]:
# eval_dict[0] -> lines exactly as read from NON_SCI_PATH
# eval_dict[1] -> the same lines after insert_token() added a placeholder token
eval_dict = {}
sentences, modified_sentences = [], []
with open(NON_SCI_PATH) as f:
    for line in f:
        # Keep the raw line and its augmented twin at the same list position.
        sentences.append(line)
        modified_sentences.append(insert_token(line))
    eval_dict.update({0: sentences, 1: modified_sentences})

In [9]:
def label(sci_sentences):
    """Wrap every sentence in a record dict carrying the fixed label 0.1.

    Args:
        sci_sentences: indexable, sized collection of strings.

    Returns:
        A list with one ``{'text': ..., 'label': 0.1}`` dict per sentence, in
        input order; trailing newline characters are stripped from the text.
        A tqdm progress bar is shown while the list is built.
    """
    return [
        {'text': sci_sentences[position].rstrip("\n"), 'label': 0.1}
        for position in tqdm(range(len(sci_sentences)))
    ]

In [10]:
# One list of {'text', 'label'} records per rank of eval_dict.
labeled = {}
for rank, rank_sentences in eval_dict.items():
    labeled[rank] = label(rank_sentences)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100001/100001 [00:00<00:00, 1176570.93it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100001/100001 [00:00<00:00, 1262596.61it/s]


In [11]:
# One DataFrame -> CustomDataset -> DataLoader per rank in `labeled`.
eval_df = dict()
for rank in labeled:
    eval_df[rank] = pd.DataFrame(labeled[rank])

# Every sentence is truncated / padded to 512 tokens by CustomDataset.
testing_sets = dict()
for rank in eval_df:
    testing_sets[rank] = CustomDataset(eval_df[rank], tokenizer, 512)

# Evaluation must not shuffle: keeping the loader in dataset order means the
# i-th model output of every rank belongs to the i-th sentence of that rank,
# so outputs stay comparable across ranks and reproducible across runs.
test_params = {'batch_size': 4,
               'shuffle': False,
               'num_workers': 0
               }

testing_loaders = dict()
for rank in eval_df:
    testing_loaders[rank] = DataLoader(testing_sets[rank], **test_params)

In [12]:
model.to(torch_device)
model.eval()  # evaluation mode; set once, it applies to every loader below

eval_loaders = testing_loaders
outputs_by_rank = dict()
targets_by_rank = dict()  # declared for later use; this cell does not fill it

for evalset in eval_loaders:
    print(f"--------------- {evalset} ---------------")
    fin_outputs = []
    with torch.no_grad():
        # tqdm's second positional argument is `desc`, not a start index, so
        # the loader is wrapped on its own.
        for data in tqdm(eval_loaders[evalset]):
            ids = data['ids'].to(torch_device, dtype=torch.long)
            mask = data['mask'].to(torch_device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(torch_device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            # outputs[0] is the first element of the model output; one numpy
            # row per sample is collected.  extend() appends in place instead
            # of re-copying the whole list on every batch.
            fin_outputs.extend(outputs[0].detach().cpu().numpy())
    outputs_by_rank[evalset] = fin_outputs

--------------- 0 ---------------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25001/25001 [05:34<00:00, 74.79it/s]


--------------- 1 ---------------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25001/25001 [05:37<00:00, 74.12it/s]


In [17]:
import pickle
class Analysis(object):
    """Plain container for the score statistics of one evaluation set.

    Attributes are set verbatim from the constructor arguments:
        values:  the individual scores.
        average: their mean.
        sd:      their standard deviation.

    Instances hold no external resources; the context-manager methods exist
    only so that ``with Analysis(...) as a:`` works and yields the instance.
    """

    def __init__(self, values, average, sd):
        self.values = values
        self.average = average
        self.sd = sd

    def __enter__(self):
        # Previously called the undefined name `__reps__` (NameError).
        return self

    def __exit__(self, type, value, tb):
        # Nothing to release (there never was a `self.stream`); returning False
        # lets any exception raised inside the `with` body propagate.
        return False

    def __repr__(self):
        return "avg: " + str(self.average) + "\nStandDev: " + str(self.sd) + "\nValues: " + str(self.values)
    

In [13]:
scores = dict()

for rank in outputs_by_rank:
    # Keep only the first component of every output row.  Built in a single
    # pass: the former `score_list = score_list + [x]` copied the whole list
    # for every element (quadratic in the number of sentences).
    scores[rank] = [score[0] for score in outputs_by_rank[rank]]
    

In [14]:
def average(values):
    """Return the arithmetic mean of ``values``.

    The scores are accumulated one by one, starting from 0.0, and the total is
    divided by ``len(values)``; an empty input therefore raises
    ZeroDivisionError.
    """
    total = 0.0
    for score in values:
        total += score
    return total / len(values)

In [15]:
import math
def standDev(values, averageScore):
    """Return the population standard deviation of ``values`` around ``averageScore``.

    Computed as the square root of the mean squared distance to
    ``averageScore`` (divisor ``len(values)``); an empty input raises
    ZeroDivisionError.
    """
    squared_distance_sum = 0.0
    for score in values:
        squared_distance_sum += abs(averageScore - score) ** 2
    variance = squared_distance_sum / len(values)
    return math.sqrt(variance)

    

In [16]:

def export(values, avg, sd, rank_id=None):
    """Write mean, standard deviation and raw values to a plain-text report.

    The report is written to ``SentenceEvals/SciBert-token-<rank>``; the
    ``SentenceEvals`` directory is created when it does not exist yet.

    Args:
        values: the scores; written via ``str(values)``.
        avg: mean of the scores.
        sd: standard deviation of the scores.
        rank_id: suffix for the file name.  When omitted, the notebook-level
            variable ``rank`` is used, which is how existing calls select the
            file (NameError if no such variable exists).
    """
    import os  # local import keeps this cell independent of the import cell

    if rank_id is None:
        # Legacy call style: pick up the caller's loop variable `rank`.
        rank_id = rank

    os.makedirs("SentenceEvals", exist_ok=True)
    with open("SentenceEvals/SciBert-token-" + str(rank_id), "w") as target:
        target.write("Mean :\n" + str(avg) + "\n")
        target.write("Standard Deviation:\n" + str(sd) + "\n\n")
        target.write("Values:\n" + str(values) + "\n\n\n")


In [18]:
# For every rank: sort the scores, compute mean / standard deviation, pickle
# the Analysis object and write the plain-text report.
for rank in scores:
    values = sorted(scores[rank])
    averageScore = average(values)
    deviation = standDev(values, averageScore)
    result = Analysis(values, averageScore, deviation)
    # The context manager closes (and thereby flushes) the pickle file; the
    # handle used to be left open for the rest of the kernel session.
    with open("Pickled_Object/SciBert-token" + str(rank), "wb") as file:
        pickle.dump(result, file)
    # export() derives its file name from the loop variable `rank`.
    export(values, averageScore, deviation)