In [1]:
!pip install -U transformers datasets evaluate accelerate sentencepiece

# Uncomment the following code if you want to upload your model to huggingface
# !apt install git-lfs
# !git config --global user.email "YOUR_EMAIL_ADDRESS"
# !git config --global user.name "YOUR_USER_NAME"

Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
import torch
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
import evaluate
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

try:
    from google.colab import drive
    drive.mount('/content/gdrive')

    DATA_PATH = '/content/gdrive/MyDrive/CSI5137-project/data/'
except:
    DATA_PATH = 'data/'

model_name = 'microsoft/deberta-v3-large'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


device(type='cuda', index=0)

## Load data

In [2]:
# Load PURE dataset
pure = pd.read_csv(DATA_PATH + 'PURE_train.csv')
tmp = pd.read_csv(DATA_PATH + 'PURE_test.csv')
pure = pd.concat([pure, tmp], axis=0)
tmp = pd.read_csv(DATA_PATH + 'PURE_valid.csv')
pure = pd.concat([pure, tmp], axis=0)

pure['Req/Not Req'] = pure['Req/Not Req'].apply(lambda x: 1 if x == 'Req' else 0)

pure['text'] = pure['Requirement']
pure['label'] = pure['Req/Not Req']
pure = pure.drop(['Unnamed: 0', 'Name of Doc', 'Requirement', 'Req/Not Req'], axis=1)

pure = pure.sample(frac=1).reset_index(drop=True)

print(pure['label'].value_counts())
pure.head(10)

1    4145
0    3600
Name: label, dtype: int64


Unnamed: 0,text,label
0,A2-4B: User chooses to save application (ALT 3).,0
1,The FTSS shall [SRS295] notify the application...,1
2,Each level (department or category) in the bre...,1
3,The system shall allow user to select the supp...,1
4,NPAC SMS shall create a new Subscription Versi...,1
5,The NE shall (3.2.2.3.2) start transmitting a ...,1
6,When a user selects the File/Save As menu item...,1
7,The minimum of a broadband connection is recom...,1
8,Generate xml & schema documents.,0
9,The Clarus system shall be able to receive roa...,1


In [3]:
# Load dronology dataset
dronology = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_1/train_fold_1.csv')
tmp = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_1/test_fold_1.csv')
dronology = pd.concat([dronology, tmp], axis=0)

for i in range(2, 6):
    tmp = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_{}/train_fold_{}.csv'.format(i, i))
    dronology = pd.concat([dronology, tmp], axis=0)
    tmp = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_{}/test_fold_{}.csv'.format(i, i))
    dronology = pd.concat([dronology, tmp], axis=0)

dronology['text'] = dronology['STR.REQ']
dronology['label'] = dronology['class']
dronology = dronology.drop(['issueid', 'STR.REQ', 'class'], axis=1)

dronology = dronology.drop_duplicates(subset=["text"], keep="first")

print(dronology['label'].value_counts())
dronology.head(10)

0    280
1     99
Name: label, dtype: int64


Unnamed: 0,text,label
0,The MapComponent shall support different types...,1
1,The MissionPlanner shall execute flight plans ...,1
2,The GCS shall transmit the UAV s properties to...,1
3,The GCS shall transmit the UAV s current locat...,1
4,The GCS shall report newly connected UAVs to t...,1
5,When the GCS receives a UAV command from the G...,1
6,When the connection to the GCS from the GCSMid...,1
7,The GCSMiddleware shall forward commands sent ...,1
8,The GCSMiddleware shall handle state messages ...,1
9,The GCSMiddleware shall follow a predefined se...,1


In [4]:
# Merge two datasets
data = pd.concat([pure, dronology], axis=0)
data['label'].value_counts()

1    4244
0    3880
Name: label, dtype: int64

In [5]:
data = Dataset.from_pandas(data)
data = data.train_test_split(test_size=0.3)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 5686
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2438
    })
})

## Pre-processing

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocessing_func(examples):
    return tokenizer(examples['text'], truncation=True)

tokenized_data = data.map(preprocessing_func, batched=True, remove_columns=['text', '__index_level_0__'])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/5686 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2438 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5686
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2438
    })
})

## Training

In [7]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    matrics = accuracy.compute(predictions=predictions, references=labels)

    matrics['weighted precision'] = precision_score(labels, predictions, average='weighted')
    matrics['weighted recall'] = recall_score(labels, predictions, average='weighted')
    matrics['weighted f1'] = f1_score(labels, predictions, average='weighted')

    matrics['macro precision'] = precision_score(labels, predictions, average='macro')
    matrics['macro recall'] = recall_score(labels, predictions, average='macro')
    matrics['macro f1'] = f1_score(labels, predictions, average='macro')
    return matrics

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir='deberta-large-ReqORNot',
    learning_rate=2e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,0.4826,0.428592,0.901969,0.901951,0.901969,0.901942,0.901842,0.901398,0.901601
2,0.3429,0.427395,0.907711,0.909149,0.907711,0.907785,0.907623,0.908929,0.907628
3,0.1299,0.529694,0.913454,0.913456,0.913454,0.91342,0.913467,0.912826,0.913111


Checkpoint destination directory deberta-large-ReqORNot/checkpoint-1896 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory deberta-large-ReqORNot/checkpoint-3792 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory deberta-large-ReqORNot/checkpoint-5688 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=5688, training_loss=0.33325380816238337, metrics={'train_runtime': 1244.2176, 'train_samples_per_second': 13.71, 'train_steps_per_second': 4.572, 'total_flos': 1393818333791928.0, 'train_loss': 0.33325380816238337, 'epoch': 3.0})

In [None]:
# trainer.save_model('deberta-large')
# !mkdir /content/gdrive/MyDrive/CSI5137-project/deberta-large/
# !cp -r deberta-large/ /content/gdrive/MyDrive/CSI5137-project/

In [9]:
# Push fine-tuned model to huggingface_hub
trainer.push_to_hub("deberta-large-ReqORNot")

events.out.tfevents.1709608228.a9253d894a9f.6510.0:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Upload 8 LFS files:   0%|          | 0/8 [00:00<?, ?it/s]

events.out.tfevents.1709606998.a9253d894a9f.695.0:   0%|          | 0.00/9.04k [00:00<?, ?B/s]

events.out.tfevents.1709609478.a9253d894a9f.11767.0:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

events.out.tfevents.1709611575.a9253d894a9f.20454.0:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

events.out.tfevents.1709615340.a9253d894a9f.36168.0:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kwang123/deberta-large-ReqORNot/commit/d215a5a21089771bc6234bf09b20a3741bd865f9', commit_message='kwang123/deberta-large-ReqORNot', commit_description='', oid='d215a5a21089771bc6234bf09b20a3741bd865f9', pr_url=None, pr_revision=None, pr_num=None)