In [1]:
!pip install -U transformers accelerate evaluate datasets sentencepiece bitsandbytes trl peft setfit



In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
import numpy as np
import evaluate
import bitsandbytes
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch
import math
from tqdm.auto import tqdm
import torch.nn as nn
from setfit import SetFitModel

try:
    from google.colab import drive
    drive.mount('/content/gdrive')

    DATA_PATH = '/content/gdrive/MyDrive/CSI5137-project/data/'
except:
    DATA_PATH = 'data/'

device = torch.device("mps" if torch.backends.mps.is_built() else "cuda:0" if torch.cuda.is_available() else "cpu")
device

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


device(type='cuda', index=0)

## Load Data

In [3]:
# Load PURE dataset
pure = pd.read_csv(DATA_PATH + 'PURE_test.csv')

pure['Req/Not Req'] = pure['Req/Not Req'].apply(lambda x: 1 if x == 'Req' else 0)

pure['text'] = pure['Requirement']
pure['label'] = pure['Req/Not Req']
pure = pure.drop(['Unnamed: 0', 'Name of Doc', 'Requirement', 'Req/Not Req'], axis=1)

pure = pure.sample(frac=1).reset_index(drop=True)

print(pure['label'].value_counts())
pure.head(10)

1    1058
0     476
Name: label, dtype: int64


  and should_run_async(code)


Unnamed: 0,text,label
0,The Smart House components consist of househol...,0
1,The DigitalHome Software Requirements Specific...,0
2,"If no new NE is observed, the functioning NEs ...",1
3,The FDIR shall (3.3.5.13) be able to discrimin...,1
4,The waveform and amplitude of the shock pulses...,1
5,To support Dynamic Lane control in other cente...,1
6,Maintains configuration management of the syst...,1
7,"For example, consider a monthly crawl of “www....",0
8,The ELSS must provide facilities to transmit d...,1
9,The Center shall support the following informa...,1


In [4]:
dronology = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_1/test_fold_1.csv')

for i in range(2, 6):
    tmp_test = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_{}/test_fold_{}.csv'.format(i, i))
    dronology = pd.concat([dronology, tmp_test], axis=0)

dronology['text'] = dronology['STR.REQ']
dronology['label'] = dronology['class']
dronology = dronology.drop(['issueid', 'STR.REQ', 'class'], axis=1)

dronology = dronology.drop_duplicates(subset=["text"], keep="first")

print(dronology['label'].value_counts())
dronology.head(10)

0    278
1     99
Name: label, dtype: int64


  and should_run_async(code)


Unnamed: 0,text,label
0,The GCS shall assign a message frequency for a...,1
1,The RealTimeFlightUI shall display all current...,1
2,The RealTimeFlightUI shall display the name an...,1
3,When requested the RouteCreationUI shall send ...,1
4,The ActivityLogger shall log all commands sent...,1
5,When a UAV is deactivated the UAVActivationMan...,1
6,If requested the SingleUAVFlightPlanScheduler ...,1
7,If a flight route is assigned to a UAV which i...,1
8,When given two coordinates the CoordinateSyste...,1
9,The FlightRouteManager shall define flight rou...,1


In [5]:
test = pd.concat([pure, dronology], axis=0)
test['label'].value_counts()

  and should_run_async(code)


1    1157
0     754
Name: label, dtype: int64

In [6]:
X_test = test['text'].to_list()
y_test = test['label'].to_list()

  and should_run_async(code)


## Normalize the F1-score of both models as weights of each model for voting system

In [7]:
llama_macro_f1 = 0.942408
deberta_macro_f1 = 0.939089
few_shot_macro_f1 = 0.739408

deberta_weight = deberta_macro_f1 / (deberta_macro_f1 + llama_macro_f1 + few_shot_macro_f1)
llama_weight = deberta_macro_f1 / (deberta_macro_f1 + llama_macro_f1 + few_shot_macro_f1)
few_shot_weight = few_shot_macro_f1 / (deberta_macro_f1 + llama_macro_f1 + few_shot_macro_f1)

print('Weight of DeBERTa model in the voting system:', deberta_weight)
print('Weight of Llama2 model in the voting system:', llama_weight)
print('Weight of the few-shot model in the voting system:', few_shot_weight)

Weight of DeBERTa model in the voting system: 0.35830714962961263
Weight of Llama2 model in the voting system: 0.35830714962961263
Weight of the few-shot model in the voting system: 0.28211934427230284


## Load Model

In [8]:
deberta_tokenizer = AutoTokenizer.from_pretrained('kwang123/deberta-large-ReqORNot')
deberta_model = AutoModelForSequenceClassification.from_pretrained('kwang123/deberta-large-ReqORNot').to(device)

llama_tokenizer = AutoTokenizer.from_pretrained('kwang123/llama2-7B-ReqORNot')
llama_tokenizer.pad_token = llama_tokenizer.eos_token
# llama_model = Llama_Model().to(device)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# TODO: add your huggingface access token for Llama2 model
llama_model = AutoModelForSequenceClassification.from_pretrained(
    'kwang123/llama2-7B-ReqORNot',
    quantization_config=quantization_config,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    num_labels=2,
    token=''
)
llama_model.config.pad_token_id = llama_model.config.eos_token_id

few_shot_model = SetFitModel.from_pretrained("kwang123/roberta-large-setfit-ReqORNot").to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluating the voting system

In [9]:
import torch.nn.functional as F

batch_size = 16

accuracy = []
prec, macro_prec = [], []
rec, macro_rec = [], []
f1, macro_f1 = [], []

data_length = len(X_test)

for i in tqdm(range(math.ceil(data_length/batch_size)), desc=f"Evaluating progress"):
    X = X_test[i*batch_size:i*batch_size+batch_size] if i*batch_size+batch_size <= data_length else X_test[i*batch_size:]

    max_length = max([len(each.split(' ')) for each in X])

    deberta_input = deberta_tokenizer(X, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt').to(device)
    llama_input = llama_tokenizer(X, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt').to(device)

    labels = y_test[i*batch_size:i*batch_size+batch_size] if i*batch_size+batch_size <= data_length else y_test[i*batch_size:]

    with torch.no_grad():
        deberta_logits = F.softmax(deberta_model(**deberta_input).logits, dim=1)
        llama_logits = F.softmax(llama_model(**llama_input).logits, dim=1)
        few_shot_logits = few_shot_model.predict_proba(X).to(device)

    deberta_logits *= deberta_weight
    llama_logits *= llama_weight
    few_shot_logits *= few_shot_weight

    logits = deberta_logits + llama_logits + few_shot_logits

    y_pred = torch.argmax(logits, 1).cpu()

    accuracy.append(accuracy_score(labels, y_pred))

    prec.append(precision_score(labels, y_pred, average='weighted'))
    rec.append(recall_score(labels, y_pred, average='weighted'))
    f1.append(f1_score(labels, y_pred, average='weighted'))

    macro_prec.append(precision_score(labels, y_pred, average='macro'))
    macro_rec.append(recall_score(labels, y_pred, average='macro'))
    macro_f1.append(f1_score(labels, y_pred, average='macro'))

print('Accuracy: %.6f, Weighted precision: %.6f, Weighted Recall: %.6f, Weighted F1: %.6f, Macro precision: %.6f, Macro Recall: %.6f, Macro F1: %.6f'
      % (np.mean(accuracy), np.mean(prec), np.mean(rec), np.mean(f1), np.mean(macro_prec), np.mean(macro_rec), np.mean(macro_f1)))

Evaluating progress:   0%|          | 0/120 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.945312, Weighted precision: 0.956053, Weighted Recall: 0.945312, Weighted F1: 0.946927, Macro precision: 0.919577, Macro Recall: 0.930135, Macro F1: 0.919556


  and should_run_async(code)
