In [3]:
# Install the third-party libraries this notebook needs (`datasets` and `setfit`).
# NOTE(review): no versions are pinned, so a fresh run may pull newer releases — consider pinning.
!pip install datasets setfit

# Optional: uncomment the following lines if you want to upload your model to the Hugging Face Hub.
# !apt install git-lfs
# !git config --global user.email "YOUR_EMAIL_ADDRESS"
# !git config --global user.name "YOUR_USER_NAME"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [2]:
from huggingface_hub import notebook_login
# Open the interactive Hugging Face login prompt so that the push_to_hub call
# at the end of this notebook can authenticate.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict
from setfit import sample_dataset, SetFitModel, Trainer, TrainingArguments
from sentence_transformers.losses import CosineSimilarityLoss
import torch

# Choose the data directory depending on where the notebook is running.
# Only a missing `google.colab` package means "not on Colab". A bare `except:`
# would also swallow Drive-mount failures and KeyboardInterrupt and silently
# point DATA_PATH at a local directory that may not exist.
try:
    from google.colab import drive
except ImportError:
    # Local run: read the CSV files from the relative data directory.
    DATA_PATH = 'data/'
else:
    # Colab run: mount Google Drive and read the project data from there.
    # A failure to mount is raised here instead of being hidden.
    drive.mount('/content/gdrive')
    DATA_PATH = '/content/gdrive/MyDrive/CSI5137-project/data/'

# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

Mounted at /content/gdrive


device(type='cuda', index=0)

## Load data

In [5]:
# Load PURE dataset
# The train/test/valid CSV files are pooled into a single frame in one concat
# (same order as before: train, test, valid).
pure = pd.concat(
    [pd.read_csv(DATA_PATH + 'PURE_{}.csv'.format(split)) for split in ('train', 'test', 'valid')],
    axis=0,
)

# Keep only what is used downstream: the requirement sentence as `text` and a
# binary `label` (1 when the original annotation is 'Req', 0 otherwise).
pure = (
    pure
    .assign(
        text=pure['Requirement'],
        label=(pure['Req/Not Req'] == 'Req').astype('int64'),
    )
    .drop(columns=['Unnamed: 0', 'Name of Doc', 'Requirement', 'Req/Not Req'])
)

# Shuffle the rows. The fixed random_state makes the order (and everything
# derived from it) reproducible across kernel restarts.
pure = pure.sample(frac=1, random_state=42).reset_index(drop=True)

print(pure['label'].value_counts())
pure.head(10)

  and should_run_async(code)


1    4145
0    3600
Name: label, dtype: int64


Unnamed: 0,text,label
0,NPAC SMS shall suppress the broadcast to a Loc...,1
1,The system should ensure high standards of sec...,1
2,"NOTE: If a single LNP Type is selected, then o...",1
3,"Holds are subtotaled by type, e.g. active, fro...",1
4,NPAC SMS shall update the Block Failed SP List...,1
5,User can click on [ view detail ] for more inf...,1
6,The DigitalHome security system consists of co...,1
7,The system should be developed on Open Standards,1
8,"OE-4: If web-browser based, System Administrat...",0
9,The list is by no mean exhaustive.,0


In [6]:
# Load dronology dataset
# Each fold directory holds a train and a test CSV. Collect the paths in the
# order train_1, test_1, train_2, test_2, ... so that `keep="first"` below
# picks the same rows as before.
fold_files = []
for i in range(1, 6):
    fold_dir = DATA_PATH + 'dronology_five_folds/fold_{}/'.format(i)
    fold_files.append(fold_dir + 'train_fold_{}.csv'.format(i))
    fold_files.append(fold_dir + 'test_fold_{}.csv'.format(i))

# Read every file once and concatenate in a single call instead of growing
# the frame with a pd.concat per file.
dronology = pd.concat([pd.read_csv(path) for path in fold_files], axis=0)

# Rename the columns used downstream to `text` / `label`, drop the rest, and
# remove rows whose text is repeated across files (presumably because the
# folds re-split the same requirements — verify against the data).
dronology = (
    dronology
    .assign(text=dronology['STR.REQ'], label=dronology['class'])
    .drop(columns=['issueid', 'STR.REQ', 'class'])
    .drop_duplicates(subset=["text"], keep="first")
)

print(dronology['label'].value_counts())
dronology.head(10)

  and should_run_async(code)


0    280
1     99
Name: label, dtype: int64


Unnamed: 0,text,label
0,The MapComponent shall support different types...,1
1,The MissionPlanner shall execute flight plans ...,1
2,The GCS shall transmit the UAV s properties to...,1
3,The GCS shall transmit the UAV s current locat...,1
4,The GCS shall report newly connected UAVs to t...,1
5,When the GCS receives a UAV command from the G...,1
6,When the connection to the GCS from the GCSMid...,1
7,The GCSMiddleware shall forward commands sent ...,1
8,The GCSMiddleware shall handle state messages ...,1
9,The GCSMiddleware shall follow a predefined se...,1


In [7]:
# Stack the PURE and Dronology frames row-wise into one labelled corpus
# (each frame keeps its own index values).
frames = [pure, dronology]
data = pd.concat(frames, axis=0)
data['label'].value_counts()

  and should_run_async(code)


1    4244
0    3880
Name: label, dtype: int64

In [8]:
# Convert the merged frame into a Hugging Face Dataset. `preserve_index=False`
# keeps the pandas index out of the dataset, so no '__index_level_0__' column
# is created and nothing has to be removed afterwards (the explicit removal
# fails whenever that column is absent).
data = Dataset.from_pandas(data, preserve_index=False)
# 70/30 train/test split; the fixed seed makes the split reproducible.
data = data.train_test_split(test_size=0.3, seed=42)
data

  and should_run_async(code)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5686
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2438
    })
})

## Pre-processing Data

In [9]:
# The full held-out split is used for evaluation.
eval_dataset = data["test"]
# Few-shot training set drawn from the training split with num_samples=24
# (the cell output shows 48 rows for the two labels).
train_dataset = sample_dataset(data["train"], num_samples=24)
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 48
})

## Training

In [10]:
import evaluate
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

accuracy = evaluate.load("accuracy")

def compute_metrics(predictions, labels):
    """Return accuracy plus weighted- and macro-averaged precision, recall and F1.

    Args:
        predictions: predicted class labels. No argmax is applied here, so
            these are assumed to be labels already, not scores
            (NOTE(review): confirm against what the Trainer passes in).
        labels: reference class labels.

    Returns:
        dict containing 'accuracy' followed by 'weighted precision',
        'weighted recall', 'weighted f1', 'macro precision', 'macro recall'
        and 'macro f1'.
    """
    metrics = accuracy.compute(predictions=predictions, references=labels)

    # Result key -> (scikit-learn scorer, averaging mode); insertion order
    # defines the order of the keys in the returned dict.
    scorers = {
        'weighted precision': (precision_score, 'weighted'),
        'weighted recall': (recall_score, 'weighted'),
        'weighted f1': (f1_score, 'weighted'),
        'macro precision': (precision_score, 'macro'),
        'macro recall': (recall_score, 'macro'),
        'macro f1': (f1_score, 'macro'),
    }
    for key, (scorer, averaging) in scorers.items():
        metrics[key] = scorer(labels, predictions, average=averaging)
    return metrics

  and should_run_async(code)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [11]:
# Pretrained sentence-transformer checkpoint loaded into the SetFit model.
# Alternative checkpoint (left disabled): 'sentence-transformers/all-mpnet-base-v2'
checkpoint = 'sentence-transformers/all-roberta-large-v1'
model = SetFitModel.from_pretrained(checkpoint)

# Training hyper-parameters.
hyperparams = dict(batch_size=8, num_epochs=10)
args = TrainingArguments(**hyperparams)

# Train on the few-shot sample; the held-out split and compute_metrics are
# registered here for the later evaluation step.
trainer = Trainer(
    model=model,
    args=args,
    metric=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 1200
  Batch size = 8
  Num epochs = 10
  Total optimization steps = 1500


Step,Training Loss


In [12]:
# Score the trained model on the eval_dataset given to the Trainer, using compute_metrics.
trainer.evaluate()

  and should_run_async(code)
***** Running evaluation *****


{'accuracy': 0.7621000820344545,
 'weighted precision': 0.7627752679232598,
 'weighted recall': 0.7621000820344545,
 'weighted f1': 0.7621663772102192,
 'macro precision': 0.7621734718049769,
 'macro recall': 0.7624659767698817,
 'macro f1': 0.7620481988534211}

In [13]:
# Upload the trained model to the Hugging Face Hub under this repository name.
trainer.push_to_hub("roberta-large-setfit-ReqORNot")

  and should_run_async(code)


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model_head.pkl:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kwang123/roberta-large-setfit-ReqORNot/commit/858eab2259760064ec1b5247d493d46a53c5c5f6', commit_message='Add SetFit model', commit_description='', oid='858eab2259760064ec1b5247d493d46a53c5c5f6', pr_url=None, pr_revision=None, pr_num=None)