In [59]:
 # dependencies
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import transformers
from sklearn import metrics
import matplotlib.pyplot as plt
from math import ceil
import re
import wandb
from transformers import AutoModelForSequenceClassification, AutoModel


In [60]:
# Load the pretrained "bert-base-uncased" tokenizer; it is handed to CustomDataset
# further down to turn sentences into model inputs.
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

In [61]:
# Try to load a sequence-classification model from the path "Models/BertFull".
# NOTE(review): the recorded run of this cell raised OSError (path is neither a
# local folder nor a hub model id). A later cell passes the same path to
# torch.load(), so it is presumably a state-dict file rather than a
# from_pretrained directory -- confirm and instantiate the matching model class.
model = AutoModelForSequenceClassification.from_pretrained("Models/BertFull")

OSError: Models/BertFull is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [46]:
class CustomDataset(Dataset):
    """Tokenised view over a dataframe of labelled sentences.

    Args:
        dataframe: pandas DataFrame with a ``text`` column (raw sentence)
            and a ``label`` column (numeric target).
        tokenizer: object exposing ``encode_plus`` (e.g. a BERT tokenizer)
            that returns ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        max_len: length every sequence is truncated / padded to.

    Each item is a dict with the keys ``ids``, ``mask``, ``token_type_ids``
    (long tensors) and ``targets`` (float tensor).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        # The original read `dataf` (a truncated name that raised NameError).
        self.text = dataframe["text"]
        self.targets = dataframe["label"]
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Positional lookup (.iloc) so a dataframe whose index is not 0..n-1
        # (e.g. after filtering or sampling) still works with a DataLoader.
        text = str(self.text.iloc[index])

        # The sentence must be passed as the first argument; the second
        # positional argument is the optional sentence pair (unused here).
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [47]:
# Directory holding every sentence corpus used below; change it here only.
DATASET_DIR = '/media/nvme3n1/proj_scisen/datasets'

# Scientific-sentence corpora; the position in this list is used as the
# integer key ("rank") of the per-file evaluation splits.
# (The annotation used to be `list()`, i.e. an empty list instance, not a type.)
SCI_PATHS: list = [
    f"{DATASET_DIR}/{file_name}"
    for file_name in ('SciSen-as.txt', 'SciSen-a.txt', 'SciSen-b.txt', 'SciSen-c.txt')
]

# Non-scientific corpora (science fiction, Reddit, tweets).
NON_SCI_PATHS: list = [
    f"{DATASET_DIR}/{file_name}"
    for file_name in ('ScifiSen1.txt', 'RedditSen.txt', 'ukraineTweets.txt', 'ScifiSen2.txt')
]

In [48]:
# Read every scientific corpus and split it by position: the first
# KEEP_FRACTION of each file's lines is pooled into `sci_sentences`, the
# remainder is held out in `eval_dict` under the file's position in SCI_PATHS.
KEEP_FRACTION = 0.9

sci_sentences: list = []
# Built from SCI_PATHS instead of four hard-coded None slots, so a shorter or
# longer path list never leaves stale None entries behind.
eval_dict: dict = {}
rank_sentences: list = []

test_size = 500  # NOTE(review): not referenced by any cell visible here

for rank, corpus_path in enumerate(SCI_PATHS):
    with open(corpus_path) as f:
        # Lines keep their trailing newline; it is stripped later in label().
        rank_sentences = f.readlines()
    split_at = int(len(rank_sentences) * KEEP_FRACTION)
    sci_sentences.extend(rank_sentences[:split_at])
    eval_dict[rank] = rank_sentences[split_at:]
    rank_sentences = []

In [49]:

def label(sci_sentences, label_value=0.9):
    """Wrap raw sentences in the record format used to build the evaluation frames.

    Args:
        sci_sentences: iterable of sentence strings; trailing newlines are
            stripped from each one.
        label_value: target stored under ``'label'`` in every record.
            Defaults to 0.9, the value that used to be hard-coded.

    Returns:
        A list of ``{'text': ..., 'label': ...}`` dicts in input order.
    """
    # The stray debug call `print(list)` was removed: it printed the builtin
    # type (``<class 'list'>``), not the data.
    return [
        {'text': sentence.rstrip("\n"), 'label': label_value}
        for sentence in tqdm(sci_sentences)
    ]

In [50]:
# Turn each held-out sentence list into labelled records, keyed by rank.
labeled = {rank: label(eval_dict[rank]) for rank in eval_dict}

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 284763/284763 [00:00<00:00, 789773.65it/s]


<class 'list'>


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55315/55315 [00:00<00:00, 1414466.85it/s]


<class 'list'>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7383/7383 [00:00<00:00, 1472073.89it/s]


<class 'list'>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3007/3007 [00:00<00:00, 1043198.69it/s]

<class 'list'>





In [51]:
# One DataFrame (columns: text, label) per rank, built from the labelled records.
eval_df = {rank: pd.DataFrame(records) for rank, records in labeled.items()}

In [52]:
# Display the dict of per-rank evaluation DataFrames as the cell output.
eval_df

{0:                                                      text  label
 0       If a pixel lies in the visible-region bounding...    0.9
 1       Similarly, a pixel outside this region is a ba...    0.9
 2       This labelling process creates a coarse-level ...    0.9
 3       Importantly, such weakly labelled annotations ...    0.9
 4       Description of MGA branch finishes here and th...    0.9
 ...                                                   ...    ...
 284758  Similarly, <equation> counts triangles whose f...    0.9
 284759        Finally, we return the estimate <equation>.    0.9
 284760  Note that if the first two edges in any triang...    0.9
 284761  Therefore, we divide <equation> by <equation> ...    0.9
 284762  In this section, we verify the runtimes of our...    0.9
 
 [284763 rows x 2 columns],
 1:                                                     text  label
 0      As each MD is a local explanation, we call <eq...    0.9
 1      Framework We focus on the classifi

In [53]:
# Wrap every per-rank evaluation frame in a CustomDataset with max_len 512.
testing_sets = {
    rank: CustomDataset(frame, tokenizer, 512)
    for rank, frame in eval_df.items()
}

In [55]:
# Keyword arguments shared by every evaluation DataLoader.
test_params = dict(batch_size=4, shuffle=True, num_workers=0)

# One DataLoader per rank, over the dataset built for that rank.
testing_loaders = {
    rank: DataLoader(testing_sets[rank], **test_params)
    for rank in eval_df
}
    

In [57]:

# Run the model over every evaluation set and collect sigmoid outputs and
# targets per set.
#
# NOTE(review): this cell relies on names that are not defined in the visible
# cells (validate, validation_loader, WANDB, wandb_project, RESUME, wandb_id,
# EPOCHS, context_width, TRAIN_BATCH_SIZE, LEARNING_RATE, LAMBDAS,
# torch_device). `model` must already be an instantiated network whose
# parameters match the checkpoint; the recorded run stopped here with
# "NameError: name 'model' is not defined".
model.load_state_dict(torch.load("Models/BertFull"))

# Created unconditionally (previously only in the non-validation branch, and
# never filled) so per-set results are always available afterwards.
outputs_by_rank = {}
targets_by_rank = {}

if validate:
    eval_loaders = {"validation set": validation_loader}
else:
    eval_loaders = testing_loaders

for evalset in eval_loaders:
    print(f"--------------- {evalset} ---------------")
    if (WANDB and len(eval_loaders)>1):
        # Keys of testing_loaders are integers, so convert before the regex;
        # the pattern is a raw string to avoid the invalid-escape warning.
        run_suffix = re.sub(r'[\W_]+', '', str(evalset))
        wandb.init(project=wandb_project, resume = RESUME, name = wandb_id+"_"+run_suffix, config={"epochs": EPOCHS, "context_width": context_width, "validation": validate, "batch_size": TRAIN_BATCH_SIZE, "learning_rate": LEARNING_RATE, "lambdas": str(LAMBDAS), "trainingdata":"full", "conferences": run_suffix})
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in tqdm(eval_loaders[evalset]):
            ids = data['ids'].to(torch_device, dtype=torch.long)
            mask = data['mask'].to(torch_device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(torch_device, dtype = torch.long)
            targets = data['targets'].to(torch_device, dtype=torch.float)
            # NOTE(review): torch.sigmoid below requires the model to return a
            # plain tensor -- confirm this matches the model class in use.
            outputs = model(ids, mask,token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

    # Keep the results of every evaluation set, not just the last one.
    outputs_by_rank[evalset] = fin_outputs
    targets_by_rank[evalset] = fin_targets

    # Retained for existing downstream code: results of the most recent set.
    outputs = fin_outputs
    targets = fin_targets

NameError: name 'model' is not defined