In [1]:
!pip install -U transformers accelerate evaluate datasets sentencepiece bitsandbytes trl peft setfit

Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
import numpy as np
import evaluate
import bitsandbytes
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch
import math
from tqdm.auto import tqdm
import torch.nn as nn
from setfit import SetFitModel

try:
    from google.colab import drive
    drive.mount('/content/gdrive')

    DATA_PATH = '/content/gdrive/MyDrive/CSI5137-project/data/'
except:
    DATA_PATH = 'data/'

device = torch.device("mps" if torch.backends.mps.is_built() else "cuda:0" if torch.cuda.is_available() else "cpu")
device

Mounted at /content/gdrive


device(type='cuda', index=0)

## Load Data

In [3]:
# Load PURE dataset
pure = pd.read_csv(DATA_PATH + 'PURE_test.csv')

pure['Req/Not Req'] = pure['Req/Not Req'].apply(lambda x: 1 if x == 'Req' else 0)

pure['text'] = pure['Requirement']
pure['label'] = pure['Req/Not Req']
pure = pure.drop(['Unnamed: 0', 'Name of Doc', 'Requirement', 'Req/Not Req'], axis=1)

pure = pure.sample(frac=1).reset_index(drop=True)

print(pure['label'].value_counts())
pure.head(10)

  and should_run_async(code)


1    1058
0     476
Name: label, dtype: int64


Unnamed: 0,text,label
0,Transmission problems allow rejection of entir...,1
1,The ELSS must provide facilities to transmit d...,1
2,Allows for a definition of the audience for wh...,1
3,The subject header must consist of a character...,1
4,The resolution of the timestamp shall (3.2.2.7...,1
5,Prepared by the NLM Digital Repository Working...,0
6,Display interface is web-accessible.,1
7,The Library will follow the HHS/NIH/OCCS secur...,0
8,NOAA - National Oceanic and Atmospheric Admini...,0
9,Identifiers are the link between Data Manageme...,1


In [4]:
dronology = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_1/test_fold_1.csv')

for i in range(2, 6):
    tmp_test = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_{}/test_fold_{}.csv'.format(i, i))
    dronology = pd.concat([dronology, tmp_test], axis=0)

dronology['text'] = dronology['STR.REQ']
dronology['label'] = dronology['class']
dronology = dronology.drop(['issueid', 'STR.REQ', 'class'], axis=1)

dronology = dronology.drop_duplicates(subset=["text"], keep="first")

print(dronology['label'].value_counts())
dronology.head(10)

  and should_run_async(code)


0    278
1     99
Name: label, dtype: int64


Unnamed: 0,text,label
0,The GCS shall assign a message frequency for a...,1
1,The RealTimeFlightUI shall display all current...,1
2,The RealTimeFlightUI shall display the name an...,1
3,When requested the RouteCreationUI shall send ...,1
4,The ActivityLogger shall log all commands sent...,1
5,When a UAV is deactivated the UAVActivationMan...,1
6,If requested the SingleUAVFlightPlanScheduler ...,1
7,If a flight route is assigned to a UAV which i...,1
8,When given two coordinates the CoordinateSyste...,1
9,The FlightRouteManager shall define flight rou...,1


In [5]:
test = pd.concat([pure, dronology], axis=0)
test['label'].value_counts()

  and should_run_async(code)


1    1157
0     754
Name: label, dtype: int64

In [6]:
X_test = test['text'].to_list()
y_test = test['label'].to_list()

  and should_run_async(code)


## Normalize the F1-score of both models as weights of each model for voting system

In [7]:
llama_macro_f1 = 0.8970
deberta_macro_f1 = 0.9131
few_shot_macro_f1 = 0.7620

deberta_weight = deberta_macro_f1 / (deberta_macro_f1 + llama_macro_f1 + few_shot_macro_f1)
llama_weight = deberta_macro_f1 / (deberta_macro_f1 + llama_macro_f1 + few_shot_macro_f1)
few_shot_weight = few_shot_macro_f1 / (deberta_macro_f1 + llama_macro_f1 + few_shot_macro_f1)

print('Weight of DeBERTa model in the voting system:', deberta_weight)
print('Weight of Llama2 model in the voting system:', llama_weight)
print('Weight of the few-shot model in the voting system:', few_shot_weight)

Weight of DeBERTa model in the voting system: 0.35500174954317487
Weight of Llama2 model in the voting system: 0.35500174954317487
Weight of the few-shot model in the voting system: 0.2962559776058474


## Load Model

In [8]:
deberta_tokenizer = AutoTokenizer.from_pretrained('kwang123/deberta-large-ReqORNot')
deberta_model = AutoModelForSequenceClassification.from_pretrained('kwang123/deberta-large-ReqORNot').to(device)

llama_tokenizer = AutoTokenizer.from_pretrained('kwang123/llama2-7B-ReqORNot')
llama_tokenizer.pad_token = llama_tokenizer.eos_token
# llama_model = Llama_Model().to(device)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# TODO: add your huggingface access token for Llama2 model
llama_model = AutoModelForSequenceClassification.from_pretrained(
    'kwang123/llama2-7B-ReqORNot',
    quantization_config=quantization_config,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    num_labels=2,
    token=''
)
llama_model.config.pad_token_id = llama_model.config.eos_token_id

few_shot_model = SetFitModel.from_pretrained("kwang123/roberta-large-setfit-ReqORNot").to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/920 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/640M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config_setfit.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model_head.pkl:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

## Evaluating the voting system

In [9]:
import torch.nn.functional as F

batch_size = 16

accuracy = []
prec, macro_prec = [], []
rec, macro_rec = [], []
f1, macro_f1 = [], []

data_length = len(X_test)

for i in tqdm(range(math.ceil(data_length/batch_size)), desc=f"Evaluating progress"):
    X = X_test[i*batch_size:i*batch_size+batch_size] if i*batch_size+batch_size <= data_length else X_test[i*batch_size:]

    max_length = max([len(each.split(' ')) for each in X])

    deberta_input = deberta_tokenizer(X, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt').to(device)
    llama_input = llama_tokenizer(X, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt').to(device)

    labels = y_test[i*batch_size:i*batch_size+batch_size] if i*batch_size+batch_size <= data_length else y_test[i*batch_size:]

    with torch.no_grad():
        deberta_logits = F.softmax(deberta_model(**deberta_input).logits, dim=1)
        llama_logits = F.softmax(llama_model(**llama_input).logits, dim=1)
        few_shot_logits = few_shot_model.predict_proba(X).to(device)

    deberta_logits *= deberta_weight
    llama_logits *= llama_weight
    few_shot_logits *= few_shot_weight

    logits = deberta_logits + llama_logits + few_shot_logits

    y_pred = torch.argmax(logits, 1).cpu()

    accuracy.append(accuracy_score(labels, y_pred))

    prec.append(precision_score(labels, y_pred, average='weighted'))
    rec.append(recall_score(labels, y_pred, average='weighted'))
    f1.append(f1_score(labels, y_pred, average='weighted'))

    macro_prec.append(precision_score(labels, y_pred, average='macro'))
    macro_rec.append(recall_score(labels, y_pred, average='macro'))
    macro_f1.append(f1_score(labels, y_pred, average='macro'))

print('Accuracy: %.6f, Weighted precision: %.6f, Weighted Recall: %.6f, Weighted F1: %.6f, Macro precision: %.6f, Macro Recall: %.6f, Macro F1: %.6f'
      % (np.mean(accuracy), np.mean(prec), np.mean(rec), np.mean(f1), np.mean(macro_prec), np.mean(macro_rec), np.mean(macro_f1)))

Evaluating progress:   0%|          | 0/120 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Accuracy: 0.928646, Weighted precision: 0.963730, Weighted Recall: 0.928646, Weighted F1: 0.939845, Macro precision: 0.873558, Macro Recall: 0.886679, Macro F1: 0.870652


  and should_run_async(code)
