In [12]:
!pip install -U transformers datasets evaluate accelerate trl peft bitsandbytes

# Uncomment the following code if you want to upload your model to huggingface
# !apt install git-lfs
# !git config --global user.email "YOUR_EMAIL_ADDRESS"
# !git config --global user.name "YOUR_USERNAME"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [13]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate
import numpy as np
import bitsandbytes
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

tqdm.pandas()
try:
    from google.colab import drive
    drive.mount('/content/gdrive')

    DATA_PATH = '/content/gdrive/MyDrive/CSI5137-project/data/'
except:
    DATA_PATH = 'data/'

# Huggingface assess token for llama2 model
model_name = 'meta-llama/Llama-2-7b-hf'
# TODO: add your huggingface access token for Llama2 model
token = ''

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


device(type='cuda', index=0)

## Load data

In [3]:
# Load PURE dataset
pure = pd.read_csv(DATA_PATH + 'PURE_train.csv')
tmp = pd.read_csv(DATA_PATH + 'PURE_test.csv')
pure = pd.concat([pure, tmp], axis=0)
tmp = pd.read_csv(DATA_PATH + 'PURE_valid.csv')
pure = pd.concat([pure, tmp], axis=0)

pure['Req/Not Req'] = pure['Req/Not Req'].apply(lambda x: 1 if x == 'Req' else 0)

pure['text'] = pure['Requirement']
pure['label'] = pure['Req/Not Req']
pure = pure.drop(['Unnamed: 0', 'Name of Doc', 'Requirement', 'Req/Not Req'], axis=1)

pure = pure.sample(frac=1).reset_index(drop=True)

print(pure['label'].value_counts())
pure.head(10)

1    4145
0    3600
Name: label, dtype: int64


Unnamed: 0,text,label
0,Relevancy ranking should be manipulable via sy...,1
1,ELSS must retain all logbook reports and any c...,1
2,NPAC SMS shall accept both the old and new NPA...,1
3,The X-38 NE shall (3.2.3.2.1) be able to commu...,1
4,An application shall have associated with it a...,1
5,It is recommended that current system performs...,1
6,Making user interfaces robust: User interfaces...,1
7,Even though memory scrubbing is performed loca...,1
8,NPA SMS shall report an error and reject the N...,1
9,"The name ICCP, which stands for Inter-control ...",0


In [4]:
# Load dronology dataset
dronology = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_1/train_fold_1.csv')
tmp = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_1/test_fold_1.csv')
dronology = pd.concat([dronology, tmp], axis=0)

for i in range(2, 6):
    tmp = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_{}/train_fold_{}.csv'.format(i, i))
    dronology = pd.concat([dronology, tmp], axis=0)
    tmp = pd.read_csv(DATA_PATH + 'dronology_five_folds/fold_{}/test_fold_{}.csv'.format(i, i))
    dronology = pd.concat([dronology, tmp], axis=0)

dronology['text'] = dronology['STR.REQ']
dronology['label'] = dronology['class']
dronology = dronology.drop(['issueid', 'STR.REQ', 'class'], axis=1)

dronology = dronology.drop_duplicates(subset=["text"], keep="first")

print(dronology['label'].value_counts())
dronology.head(10)

0    280
1     99
Name: label, dtype: int64


Unnamed: 0,text,label
0,The MapComponent shall support different types...,1
1,The MissionPlanner shall execute flight plans ...,1
2,The GCS shall transmit the UAV s properties to...,1
3,The GCS shall transmit the UAV s current locat...,1
4,The GCS shall report newly connected UAVs to t...,1
5,When the GCS receives a UAV command from the G...,1
6,When the connection to the GCS from the GCSMid...,1
7,The GCSMiddleware shall forward commands sent ...,1
8,The GCSMiddleware shall handle state messages ...,1
9,The GCSMiddleware shall follow a predefined se...,1


In [5]:
# Merge two datasets
data = pd.concat([pure, dronology], axis=0)
data['label'].value_counts()

1    4244
0    3880
Name: label, dtype: int64

In [6]:
data = Dataset.from_pandas(data)
data = data.train_test_split(test_size=0.3)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 5686
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2438
    })
})

## Pre-processing data

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
tokenizer.pad_token = tokenizer.eos_token

def preprocessing_func(examples):
    return tokenizer(examples['text'], max_length=4096, truncation=True)

tokenized_data = data.map(preprocessing_func, batched=True, remove_columns=['text', '__index_level_0__'])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_data

Map:   0%|          | 0/5686 [00:00<?, ? examples/s]

Map:   0%|          | 0/2438 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 5686
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2438
    })
})

## Load Model

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [9]:
def find_all_linear_names(model):
    """
    Find modules to apply LoRA to.

    :param model: PEFT model
    """

    cls = bitsandbytes.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    print(f"LoRA module names: {list(lora_module_names)}")
    return list(lora_module_names)

In [10]:
def load_model(model_name):
    quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map='auto',
        torch_dtype=torch.bfloat16,
        token=token,
        num_labels=2,
    )

    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=find_all_linear_names(model),
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )

    model = get_peft_model(model, config)
    print_trainable_parameters(model)

    return model

## Training

In [11]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    matrics = accuracy.compute(predictions=predictions, references=labels)

    matrics['weighted precision'] = precision_score(labels, predictions, average='weighted')
    matrics['weighted recall'] = recall_score(labels, predictions, average='weighted')
    matrics['weighted f1'] = f1_score(labels, predictions, average='weighted')

    matrics['macro precision'] = precision_score(labels, predictions, average='macro')
    matrics['macro recall'] = recall_score(labels, predictions, average='macro')
    matrics['macro f1'] = f1_score(labels, predictions, average='macro')
    return matrics

In [12]:
model = load_model(model_name)

model.config.pad_token_id = model.config.eos_token_id

training_args = TrainingArguments(
    output_dir="llama2-7B-ReqORNot",
    learning_rate=2e-5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

model.config.use_cache = False

print('Start Training')
trainer.train()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA module names: ['o_proj', 'gate_proj', 'v_proj', 'q_proj', 'up_proj', 'down_proj', 'k_proj']
trainable params: 159916032 || all params: 3529265152 || trainable%: 4.531142464866295
Start Training




Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.480733,0.789582,0.789542,0.789582,0.789523,0.789447,0.789126,0.789248
2,No log,0.316705,0.860541,0.860541,0.860541,0.860541,0.860391,0.860391,0.860391
3,0.510800,0.270947,0.885972,0.886895,0.885972,0.885999,0.886211,0.886577,0.885959
4,0.510800,0.270359,0.888023,0.888876,0.888023,0.887853,0.889417,0.887053,0.887636
5,0.182900,0.259688,0.897047,0.897093,0.897047,0.897058,0.896886,0.897057,0.896959


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


TrainOutput(global_step=1185, training_loss=0.31211050170383375, metrics={'train_runtime': 8654.055, 'train_samples_per_second': 3.285, 'train_steps_per_second': 0.137, 'total_flos': 1.1746512589160448e+17, 'train_loss': 0.31211050170383375, 'epoch': 5.0})

In [13]:
trainer.evaluate()

{'eval_loss': 0.259687602519989,
 'eval_accuracy': 0.8970467596390485,
 'eval_weighted precision': 0.8970933750963284,
 'eval_weighted recall': 0.8970467596390485,
 'eval_weighted f1': 0.8970578371441602,
 'eval_macro precision': 0.8968861952861953,
 'eval_macro recall': 0.8970570501380729,
 'eval_macro f1': 0.8969593704320551,
 'eval_runtime': 217.7675,
 'eval_samples_per_second': 11.195,
 'eval_steps_per_second': 0.468,
 'epoch': 5.0}

In [15]:
# trainer.save_model('llama2')
# !cp -r llama2/ /content/gdrive/MyDrive/CSI5137-project/

In [14]:
trainer.push_to_hub('llama2-7B-ReqORNot')

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

events.out.tfevents.1709692803.1be7571c5275.79128.1:   0%|          | 0.00/754 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/640M [00:00<?, ?B/s]

events.out.tfevents.1709682936.1be7571c5275.79128.0:   0%|          | 0.00/9.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kwang123/llama2-7B-ReqORNot/commit/af2eca2743c7523b8c081dcb999239fa96f3c93c', commit_message='kwang123/llama2-7B-ReqORNot', commit_description='', oid='af2eca2743c7523b8c081dcb999239fa96f3c93c', pr_url=None, pr_revision=None, pr_num=None)