In [1]:
!pip install datasets setfit



In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from setfit import sample_dataset, SetFitModel, Trainer, TrainingArguments
from sentence_transformers.losses import CosineSimilarityLoss
import torch

try:
    from google.colab import drive
    drive.mount('/content/gdrive')

    DATA_PATH = '/content/gdrive/MyDrive/CSI5137-project/data/'
except:
    DATA_PATH = 'data/'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


device(type='cuda', index=0)

## Load data

In [3]:
# Load PURE dataset
pure = pd.read_csv(DATA_PATH + 'PURE_train.csv')
tmp = pd.read_csv(DATA_PATH + 'PURE_test.csv')
pure = pd.concat([pure, tmp], axis=0)
tmp = pd.read_csv(DATA_PATH + 'PURE_valid.csv')
pure = pd.concat([pure, tmp], axis=0)

pure['Req/Not Req'] = pure['Req/Not Req'].apply(lambda x: 1 if x == 'Req' else 0)

pure['text'] = pure['Requirement']
pure['label'] = pure['Req/Not Req']
pure = pure.drop(['Unnamed: 0', 'Name of Doc', 'Requirement', 'Req/Not Req'], axis=1)

pure = pure.sample(frac=1).reset_index(drop=True)

print(pure['label'].value_counts())
pure.head(10)

1    4145
0    3600
Name: label, dtype: int64


  and should_run_async(code)


Unnamed: 0,text,label
0,NPAC SMS shall support a NPA Split History Rep...,1
1,A Patron is a customer of King County Library ...,0
2,This eliminates the need for designing a secon...,1
3,Coordinates the removal of an AIP with the mai...,1
4,"In the relational world, they are columns with...",0
5,A2-3B: System changes current directory.,0
6,The system must guide users throughout the int...,1
7,Policy development will be needed regarding in...,0
8,The sending and receiving end points may live ...,1
9,Prompts a request for resubmission to the Cont...,1


In [4]:
# Load dronology dataset
dronology = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_1/train_fold_1.csv')
tmp = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_1/test_fold_1.csv')
dronology = pd.concat([dronology, tmp], axis=0)

for i in range(2, 6):
    tmp = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_{}/train_fold_{}.csv'.format(i, i))
    dronology = pd.concat([dronology, tmp], axis=0)
    tmp = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_{}/test_fold_{}.csv'.format(i, i))
    dronology = pd.concat([dronology, tmp], axis=0)

dronology['text'] = dronology['STR.REQ']
dronology['label'] = dronology['class']
dronology = dronology.drop(['issueid', 'STR.REQ', 'class'], axis=1)

# dronology = dronology.drop_duplicates(subset=["text"], keep="first")

print(dronology['label'].value_counts())
dronology.head(10)

0    1400
1     495
Name: label, dtype: int64


  and should_run_async(code)


Unnamed: 0,text,label
0,The MapComponent shall support different types...,1
1,The MissionPlanner shall execute flight plans ...,1
2,The GCS shall transmit the UAV s properties to...,1
3,The GCS shall transmit the UAV s current locat...,1
4,The GCS shall report newly connected UAVs to t...,1
5,When the GCS receives a UAV command from the G...,1
6,When the connection to the GCS from the GCSMid...,1
7,The GCSMiddleware shall forward commands sent ...,1
8,The GCSMiddleware shall handle state messages ...,1
9,The GCSMiddleware shall follow a predefined se...,1


In [5]:
# Merge two datasets
data = pd.concat([pure, dronology], axis=0)
data['label'].value_counts()

  and should_run_async(code)


0    5000
1    4640
Name: label, dtype: int64

In [6]:
data = Dataset.from_pandas(data)
data = data.train_test_split(test_size=0.3)
data = data.remove_columns(['__index_level_0__'])
data

  and should_run_async(code)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6748
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2892
    })
})

## Pre-processing Data

In [7]:
train_dataset = sample_dataset(data["train"], num_samples=24)
eval_dataset = data["test"]
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 48
})

## Training

In [8]:
import evaluate
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

accuracy = evaluate.load("accuracy")

def compute_metrics(predictions, labels):
    #predictions = np.argmax(predictions, axis=1)
    matrics = accuracy.compute(predictions=predictions, references=labels)

    matrics['weighted precision'] = precision_score(labels, predictions, average='weighted')
    matrics['weighted recall'] = recall_score(labels, predictions, average='weighted')
    matrics['weighted f1'] = f1_score(labels, predictions, average='weighted')

    matrics['macro precision'] = precision_score(labels, predictions, average='macro')
    matrics['macro recall'] = recall_score(labels, predictions, average='macro')
    matrics['macro f1'] = f1_score(labels, predictions, average='macro')
    return matrics

  and should_run_async(code)


In [9]:
# model = SetFitModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = SetFitModel.from_pretrained('sentence-transformers/all-roberta-large-v1')

args = TrainingArguments(
    batch_size=8,
    num_epochs=10,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric=compute_metrics,
)
trainer.train()

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

***** Running training *****
  Num examples = 150
  Num epochs = 10
  Total optimization steps = 1500
  Total train batch size = 8


Step,Training Loss


In [10]:
trainer.evaluate()

  and should_run_async(code)
***** Running evaluation *****


{'accuracy': 0.7413554633471646,
 'weighted precision': 0.769921403401218,
 'weighted recall': 0.7413554633471646,
 'weighted f1': 0.7379592824627539,
 'macro precision': 0.7638282097658351,
 'macro recall': 0.7500176490334813,
 'macro f1': 0.7394081119226172}

In [12]:
from huggingface_hub import notebook_login
notebook_login()

  and should_run_async(code)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
!apt install git-lfs
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [16]:
trainer.push_to_hub("kwang123/roberta-large-setfit-ReqORNot")

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

model_head.pkl:   0%|          | 0.00/9.04k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/kwang123/roberta-large-setfit-ReqORNot/tree/main/'