# Load Dependencies


In [1]:
 # dependencies
import json
import os
import re
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
import wandb
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoModel


In [2]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda:0'
else:
    torch_device = 'cpu'

# Load Tokenizer and Model

In [4]:
# Name of the pretrained tokenizer used for every evaluation set
# (an earlier alternative was "bert-base-uncased").
tokenizer_checkpoint = "allenai/scibert_scivocab_cased"
tokenizer = transformers.BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [5]:
# Location of the saved sequence-classification model that is evaluated below.
model_checkpoint = "Models/BertFull"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# Create Dataset for Collate Function

In [6]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one dataframe row per item.

    Args:
        dataframe: frame with a ``text`` column (input sentence) and a
            ``label`` column (numeric target).
        tokenizer: object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_len: length every sequence is truncated / padded to.

    Each item is a dict with the tensors ``ids``, ``mask``,
    ``token_type_ids`` (all ``torch.long``) and ``targets`` (``torch.float``).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize and return the row at *position* ``index``."""
        # Use positional access (.iloc): DataLoader hands out positions in
        # range(len(self)), which only coincide with index labels when the
        # dataframe carries a default RangeIndex.
        inputs = self.tokenizer.encode_plus(
            self.text.iloc[index],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

# Load and Prepare Data

In [7]:
# Directory that holds every sentence file read by this notebook.
DATASET_DIR: str = '/media/nvme3n1/proj_scisen/datasets'

# Scientific sentence files; the position in the list is the rank key used
# for the evaluation sets (0 = AS, 1 = A, 2 = B, 3 = C).
SCI_PATHS: list = [DATASET_DIR + '/' + file_name for file_name in (
    'SciSen-as.txt',
    'SciSen-a.txt',
    'SciSen-b.txt',
    'SciSen-c.txt',
)]

# Non-scientific sentence files.
NON_SCI_PATHS: list = [DATASET_DIR + '/' + file_name for file_name in (
    'ScifiSen1.txt',
    'RedditSen.txt',
    'ukraineTweets.txt',
    'ScifiSen9.txt',
    'ScifiSen2.txt',
)]

## Read Sentences

To evaluate the model, we take the first 10% of the sentences in each scientific-sentence file. These sentences do not overlap with the training data.
Additionally, we load non-scientific sentences that have not been used for training.

In [8]:
sci_sentences : list() = []

# Evaluation sentences per rank; every slot is filled in below.
eval_dict = {
    0: None,    # AS
    1: None,    # A
    2: None,    # B
    3: None,    # C
    4: None,    # Non-Scientific
}

rank_sentences : list() = []

# Scientific sentences: keep the first 10% of the lines of each file.
for rank_key, sci_path in enumerate(SCI_PATHS):
    with open(sci_path) as f:
        rank_sentences.extend(f)
    cut_off = int(len(rank_sentences) * 0.1)
    eval_dict[rank_key] = rank_sentences[:cut_off]
    rank_sentences = []

# Non-scientific sentences: every line of the fourth non-scientific file.
sentences = []
with open(NON_SCI_PATHS[3]) as f:
    sentences.extend(f)
    eval_dict[4] = sentences

## Label Data
 - Scientific sentences -> 0.9
 - Non-scientific sentences -> 0.1

In [33]:

def sci_label(sentences):
    """Turn scientific sentences into labelled records.

    Every sentence has its trailing newlines removed and is paired with the
    label 0.9. Returns a list of ``{'text': ..., 'label': 0.9}`` dicts in
    input order.
    """
    return [
        {'text': sentences[position].rstrip("\n"), 'label': 0.9}
        for position in tqdm(range(len(sentences)))
    ]

def non_sci_label(sentence):
    """Turn non-scientific sentences into labelled records.

    Args:
        sentence: indexable sequence of sentence strings (the parameter keeps
            its original singular name for backward compatibility).

    Returns:
        A list of ``{'text': ..., 'label': 0.1}`` dicts in input order, with
        trailing newlines stripped from every text.
    """
    # Read from the argument itself; the previous body silently used the
    # module-level ``sentences`` list and ignored what the caller passed in.
    return [
        {'text': sentence[position].rstrip("\n"), 'label': 0.1}
        for position in tqdm(range(len(sentence)))
    ]

In [34]:
# Label every evaluation set: rank 4 holds the non-scientific sentences,
# all other ranks hold scientific sentences.
labeled = dict()
for rank in eval_dict:
    if rank == 4:
        labeled[rank] = non_sci_label(eval_dict[rank])
    else:
        # ``label`` is not defined anywhere in this notebook; the scientific
        # labelling function is ``sci_label``.
        labeled[rank] = sci_label(eval_dict[rank])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 284762/284762 [00:00<00:00, 1678858.28it/s]


<class 'list'>


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55314/55314 [00:00<00:00, 1564157.97it/s]


<class 'list'>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7382/7382 [00:00<00:00, 1300939.17it/s]


<class 'list'>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3006/3006 [00:00<00:00, 1130465.15it/s]


<class 'list'>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100001/100001 [00:00<00:00, 1323705.42it/s]


## Create Dataset

In [11]:
# One dataframe (columns: text, label) per rank, built from the labelled records.
eval_df = {rank_key: pd.DataFrame(records) for rank_key, records in labeled.items()}

In [12]:
# Wrap each per-rank dataframe in a CustomDataset with a maximum length of 512.
testing_sets = {
    rank_key: CustomDataset(frame, tokenizer, 512)
    for rank_key, frame in eval_df.items()
}

In [13]:
# DataLoader settings for evaluation. Shuffling is switched off: it has no
# benefit at inference time and would make the order of the collected
# outputs differ between runs.
test_params = {'batch_size': 4,
               'shuffle': False,
               'num_workers': 0
               }

# One DataLoader per rank, keyed like eval_df / testing_sets.
testing_loaders = dict()
for rank in eval_df:
    testing_loaders[rank] = DataLoader(testing_sets[rank], **test_params)

# Get Model Output


In [14]:
from torch.nn.functional import mse_loss

model.to(torch_device)
model.eval()  # evaluation mode for the whole run

eval_loaders = testing_loaders
outputs_by_rank = dict()   # rank -> list of predicted scores
targets_by_rank = dict()   # rank -> list of target labels

for evalset in eval_loaders:
    print(f"--------------- {evalset} ---------------")
    # Collect per-batch tensors and concatenate once at the end instead of
    # re-allocating a growing tensor for every batch.
    batch_outputs = []
    batch_targets = []
    with torch.no_grad():
        for data in tqdm(eval_loaders[evalset]):
            ids = data['ids'].to(torch_device, dtype=torch.long)
            mask = data['mask'].to(torch_device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(torch_device, dtype=torch.long)
            targets = data['targets'].to(torch_device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            # Keep the first value of every row of the first model output as
            # the predicted score; slicing on-device avoids a per-sample
            # GPU -> CPU -> GPU round trip.
            batch_outputs.append(outputs[0][:, 0].float())
            batch_targets.append(targets)

    if batch_outputs:
        fin_outputs = torch.cat(batch_outputs, dim=0)
        fin_targets = torch.cat(batch_targets, dim=0)
    else:
        # Empty loader: fall back to empty tensors, as the incremental
        # concatenation used to produce.
        fin_outputs = torch.empty((0)).to(torch_device)
        fin_targets = torch.empty((0)).to(torch_device)

    print(fin_outputs)
    print(fin_targets)
    loss = mse_loss(fin_outputs, fin_targets)
    print(loss)
    outputs_by_rank[evalset] = list(fin_outputs.cpu().numpy())
    # Previously declared but never filled.
    targets_by_rank[evalset] = list(fin_targets.cpu().numpy())

--------------- 0 ---------------


100%|█████████████████████████████████████| 71191/71191 [15:48<00:00, 75.07it/s]


tensor([0.1041, 0.1005, 0.1074,  ..., 0.1780, 0.1124, 0.1009], device='cuda:0')
tensor([0.9000, 0.9000, 0.9000,  ..., 0.9000, 0.9000, 0.9000], device='cuda:0')
tensor(0.6181, device='cuda:0')
--------------- 1 ---------------


100%|█████████████████████████████████████| 13829/13829 [03:04<00:00, 75.14it/s]


tensor([0.1080, 0.1608, 0.1041,  ..., 0.1016, 0.1488, 0.1151], device='cuda:0')
tensor([0.9000, 0.9000, 0.9000,  ..., 0.9000, 0.9000, 0.9000], device='cuda:0')
tensor(0.6204, device='cuda:0')
--------------- 2 ---------------


100%|███████████████████████████████████████| 1846/1846 [00:24<00:00, 74.95it/s]


tensor([0.1098, 0.1019, 0.1181,  ..., 0.1021, 0.1030, 0.1130], device='cuda:0')
tensor([0.9000, 0.9000, 0.9000,  ..., 0.9000, 0.9000, 0.9000], device='cuda:0')
tensor(0.6199, device='cuda:0')
--------------- 3 ---------------


100%|█████████████████████████████████████████| 752/752 [00:09<00:00, 75.70it/s]


tensor([0.0999, 0.1017, 0.1323,  ..., 0.1104, 0.1068, 0.1017], device='cuda:0')
tensor([0.9000, 0.9000, 0.9000,  ..., 0.9000, 0.9000, 0.9000], device='cuda:0')
tensor(0.6198, device='cuda:0')
--------------- 4 ---------------


100%|█████████████████████████████████████| 25001/25001 [05:21<00:00, 77.87it/s]

tensor([0.1029, 0.1021, 0.8338,  ..., 0.0974, 0.1012, 0.0991], device='cuda:0')
tensor([0.9000, 0.9000, 0.9000,  ..., 0.9000, 0.9000, 0.9000], device='cuda:0')
tensor(0.6036, device='cuda:0')





# Run Analysis

Calculate the average score and the standard deviation for each rank, then export them to a text file and as a pickled object.

In [19]:
import pickle
class Analysis(object):
    """Container for the score statistics of one evaluation set.

    Attributes are set directly from the constructor arguments:
    ``values`` (the scores), ``average`` (their mean) and ``sd`` (their
    standard deviation). Instances are pickled by the export cell.
    """

    def __init__(self, values, average, sd):
        self.values = values
        self.average = average
        self.sd = sd

    def __enter__(self):
        # Return the instance itself; the previous body called an undefined
        # name and raised NameError on every ``with`` statement.
        return self

    def __exit__(self, type, value, tb):
        # The object owns no resource, so there is nothing to release (the
        # old body closed a ``self.stream`` attribute that is never created).
        # Returning False lets exceptions from the ``with`` body propagate.
        return False

    def __repr__(self):
        return "avg: " + str(self.average) + "\nStandDev: " + str(self.sd) + "\nValues: " + str(self.values)
    


In [15]:
# Copy each rank's model outputs into its own list. ``list(...)`` is linear;
# the previous ``score_list = score_list + [score]`` rebuilt the whole list
# for every element, which is quadratic in the number of scores.
scores = dict()
for rank in outputs_by_rank:
    scores[rank] = list(outputs_by_rank[rank])
    

In [16]:
def average(values):
    """Return the arithmetic mean of ``values``.

    The running total starts at 0.0 and the elements are added in order.
    An empty ``values`` raises ZeroDivisionError.
    """
    running_total = 0.0
    for item in values:
        running_total = running_total + item
    return running_total / len(values)

In [17]:
import math
def standDev(values, averageScore):
    """Return the population standard deviation of ``values``.

    ``averageScore`` is the mean the deviations are measured against. The
    squared absolute deviations are summed in order, divided by
    ``len(values)`` and square-rooted.
    """
    squared_total = 0.0
    for item in values:
        distance = abs(averageScore - item)
        squared_total = squared_total + distance**2
    variance = squared_total / len(values)
    return math.sqrt(variance)

    

In [18]:

def export(values, avg, sd, rank=None):
    """Write the statistics of one evaluation set to a text file.

    The file is ``SentenceEvals/SciBert-<rank>`` and contains the mean, the
    standard deviation and the list of values.

    Args:
        values: scores of the evaluation set.
        avg: mean of ``values``.
        sd: standard deviation of ``values``.
        rank: identifier used in the file name. When omitted, the
            module-level variable ``rank`` is used, which is how existing
            three-argument calls behave.

    Raises:
        NameError: if ``rank`` is omitted and no module-level ``rank`` exists.
    """
    if rank is None:
        # Backward compatibility: older callers rely on the notebook-level
        # loop variable ``rank`` being set when this function runs.
        try:
            rank = globals()["rank"]
        except KeyError:
            raise NameError("name 'rank' is not defined") from None

    # Make sure the target directory exists before opening the file.
    os.makedirs("SentenceEvals", exist_ok=True)
    with open("SentenceEvals/SciBert"+"-"+str(rank), "w") as target:
        target.write("Mean :\n" + str(avg) +"\n")
        target.write("Standard Deviation:\n" + str(sd) + "\n\n")
        target.write("Values:\n" + str(values) + "\n\n\n")


In [20]:
# Create the pickle directory up front so a missing folder does not abort
# the export after the evaluation has already been computed.
os.makedirs("Pickled_Object", exist_ok=True)

for rank in scores:
    values = sorted(scores[rank])
    averageScore = average(values)
    deviation = standDev(values, averageScore)
    result = Analysis(values, averageScore, deviation)
    # Context manager closes the file even if pickling fails; the handle was
    # previously left open for every rank.
    with open("Pickled_Object/SciBert-" + str(rank), "wb") as file:
        pickle.dump(result, file)
    # export() derives its file name from the module-level ``rank``, i.e.
    # this loop variable.
    export(values, averageScore, deviation)
