In [None]:
!pip install -U transformers datasets evaluate accelerate trl peft bitsandbytes

Collecting transformers
  Downloading transformers-4.36.1-py3-none-any.whl (8.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.7.4-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m19.0 MB/s[0m eta [36m

In [None]:
import os

import bitsandbytes
import evaluate
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorWithPadding

tqdm.pandas()

# On Google Colab the data is read from the mounted Drive; anywhere else it is read from a
# local ./data directory. Only a missing `google.colab` package triggers the fallback, so a
# genuine mount failure on Colab is reported instead of silently switching to 'data/'.
try:
    from google.colab import drive
except ImportError:
    DATA_PATH = 'data/'
else:
    drive.mount('/content/gdrive')
    DATA_PATH = '/content/gdrive/MyDrive/CSI5137-project/data/'

# Hugging Face id of the Llama 2 checkpoint to fine-tune
model_name = 'meta-llama/Llama-2-7b-hf'
# Hugging Face access token for the Llama 2 model.
# TODO: export HF_TOKEN in the environment instead of pasting the secret into the notebook.
token = os.environ.get('HF_TOKEN', '')

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


device(type='cuda', index=0)

## Load data

In [None]:
# Load PURE dataset
# The train/test/valid CSVs are pooled (in that order) into a single frame.
pure = pd.concat(
    [pd.read_csv(DATA_PATH + 'PURE_{}.csv'.format(split)) for split in ('train', 'test', 'valid')],
    axis=0,
)

# Keep only the model inputs: the requirement text and a binary label
# (1 when the row is marked 'Req', 0 for anything else).
pure['text'] = pure['Requirement']
pure['label'] = (pure['Req/Not Req'] == 'Req').astype(int)
pure = pure.drop(['Unnamed: 0', 'Name of Doc', 'Requirement', 'Req/Not Req'], axis=1)

# Shuffle the rows; the fixed random_state makes the order reproducible across kernel restarts.
pure = pure.sample(frac=1, random_state=42).reset_index(drop=True)

print(pure['label'].value_counts())
pure.head(10)

1    4145
0    3600
Name: label, dtype: int64


Unnamed: 0,text,label
0,NPAC SMS shall support messages containing pri...,1
1,Several requirements deal with “regional” need...,0
2,Security: Users must have antivirus software i...,1
3,Allows NLM staff to display and perform manual...,1
4,− Translators: People of all age groups with v...,0
5,A3-3D: Use case continues with step 4.,0
6,"Given a node N, the well-formed SDT strings of...",0
7,"NPAC SMS shall, upon successful completion of ...",1
8,Use Case Summary: The user wants to perform so...,0
9,NPAC SMS shall default the long Final Concurre...,1


In [None]:
# Load dronology dataset
# Every fold's train and test CSV is read in fold order (train before test) and stacked
# with a single concat instead of growing the frame one file at a time.
fold_paths = [
    DATA_PATH + 'dronology_five_folds/fold_{0}/{1}_fold_{0}.csv'.format(fold, split)
    for fold in range(1, 6)
    for split in ('train', 'test')
]
dronology = pd.concat([pd.read_csv(path) for path in fold_paths], axis=0)

# Keep only the model inputs: the requirement text and its class label.
dronology['text'] = dronology['STR.REQ']
dronology['label'] = dronology['class']
dronology = dronology.drop(['issueid', 'STR.REQ', 'class'], axis=1)

# NOTE(review): if the five folds are cross-validation splits of one requirement set, every
# requirement is loaded five times here, and a later random train/test split would put copies
# of the same text on both sides. Confirm, and if so de-duplicate with:
#   dronology = dronology.drop_duplicates(subset=["text"], keep="first")

print(dronology['label'].value_counts())
dronology.head(10)

0    1400
1     495
Name: label, dtype: int64


Unnamed: 0,text,label
0,The MapComponent shall support different types...,1
1,The MissionPlanner shall execute flight plans ...,1
2,The GCS shall transmit the UAV s properties to...,1
3,The GCS shall transmit the UAV s current locat...,1
4,The GCS shall report newly connected UAVs to t...,1
5,When the GCS receives a UAV command from the G...,1
6,When the connection to the GCS from the GCSMid...,1
7,The GCSMiddleware shall forward commands sent ...,1
8,The GCSMiddleware shall handle state messages ...,1
9,The GCSMiddleware shall follow a predefined se...,1


In [None]:
# Stack the PURE and dronology frames row-wise into one corpus, then show the class balance.
frames = [pure, dronology]
data = pd.concat(frames, axis=0)
data['label'].value_counts()

0    5000
1    4640
Name: label, dtype: int64

In [None]:
# Convert the merged frame to a Hugging Face Dataset and hold out 30% of the rows.
# The fixed seed keeps the train/test partition identical across kernel restarts.
data = Dataset.from_pandas(data)
data = data.train_test_split(test_size=0.3, seed=42)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 6748
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2892
    })
})

## Pre-processing data

In [None]:
# Load the tokenizer for the chosen checkpoint and reuse its end-of-sequence token for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
tokenizer.pad_token = tokenizer.eos_token


def preprocessing_func(examples):
    """Tokenize the ``text`` field of a batch, truncating to at most 4096 tokens."""
    texts = examples['text']
    return tokenizer(texts, truncation=True, max_length=4096)


# Tokenize both splits in batches; the raw text and the pandas index column are dropped.
tokenized_data = data.map(
    preprocessing_func,
    batched=True,
    remove_columns=['text', '__index_level_0__'],
)

# Collator built from the same tokenizer; it is handed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_data

Map:   0%|          | 0/6748 [00:00<?, ? examples/s]

Map:   0%|          | 0/2892 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 6748
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2892
    })
})

## Load Model

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Parameters whose class is named ``Params4bit`` are counted twice their ``numel()``,
    because a packed 4-bit tensor holds two weights per stored element (this assumes the
    default one-byte quantization storage -- TODO confirm if a different storage dtype is
    configured). Without this correction a 4-bit model reports roughly half of its real size.

    :param model: any object exposing ``named_parameters()``
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        if param.__class__.__name__ == "Params4bit":
            num_params *= 2
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    # A model without parameters reports 0.0% instead of raising ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
def find_all_linear_names(model):
    """
    Find modules to apply LoRA to.

    Collects the last dotted component of the name of every ``bitsandbytes.nn.Linear4bit``
    module, drops ``lm_head`` if present, prints the result and returns it.

    :param model: model whose ``named_modules()`` are scanned; in this notebook it is the
        quantized base model, before it is wrapped by PEFT
    :return: sorted list of unique module names (sorted so the order is stable between runs)
    """

    linear_cls = bitsandbytes.nn.Linear4bit
    lora_module_names = {
        name.rsplit('.', 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # The output head is never used as a LoRA target.
    lora_module_names.discard('lm_head')

    module_names = sorted(lora_module_names)
    print(f"LoRA module names: {module_names}")
    return module_names

In [None]:
def load_model(model_name):
    """
    Load a 4-bit quantized two-label sequence classifier and wrap it with LoRA adapters.

    Reads the module-level ``token`` for the download and prints the LoRA target modules
    and the trainable-parameter summary along the way.

    :param model_name: checkpoint identifier passed to ``from_pretrained``
    :return: the model returned by ``get_peft_model``
    """
    # 4-bit NF4 weights with double quantization; compute dtype is bfloat16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        token=token,
        torch_dtype=torch.bfloat16,
        device_map='auto',
        quantization_config=bnb_config,
    )

    base_model.gradient_checkpointing_enable()
    base_model = prepare_model_for_kbit_training(base_model)

    # Adapters are attached to every module name reported by find_all_linear_names.
    lora_targets = find_all_linear_names(base_model)
    lora_config = LoraConfig(
        task_type="SEQ_CLS",
        r=64,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        target_modules=lora_targets,
    )

    peft_model = get_peft_model(base_model, lora_config)
    print_trainable_parameters(peft_model)

    return peft_model

## Training

In [None]:
# Accuracy metric object; compute_metrics calls its .compute(predictions=..., references=...).
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Score one round of evaluation outputs.

    :param eval_pred: pair ``(predictions, labels)``; the predicted class of each row is the
        argmax of ``predictions`` along axis 1
    :return: dict with the accuracy result plus weighted and macro precision, recall and f1
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)
    results = accuracy.compute(predictions=predicted, references=labels)

    # (result key, scoring function, averaging mode), in the order the keys are inserted.
    score_table = (
        ('weighted precision', precision_score, 'weighted'),
        ('weighted recall', recall_score, 'weighted'),
        ('weighted f1', f1_score, 'weighted'),
        ('macro precision', precision_score, 'macro'),
        ('macro recall', recall_score, 'macro'),
        ('macro f1', f1_score, 'macro'),
    )
    for key, scorer, averaging in score_table:
        results[key] = scorer(labels, predicted, average=averaging)
    return results

In [None]:
# Build the quantized LoRA classifier and make its padding id equal to its end-of-sequence id
# (the tokenizer uses the end-of-sequence token for padding as well).
model = load_model(model_name)
model.config.pad_token_id = model.config.eos_token_id

# Five epochs; evaluation and checkpointing happen once per epoch and the best checkpoint
# is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="model-output",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=30,
    per_device_eval_batch_size=30,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): the "test" split is also the evaluation set that drives best-checkpoint
# selection, so the reported scores are not from untouched data -- confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
)

# Turn the model's cache setting off before training starts.
model.config.use_cache = False

print('Start Training')
trainer.train()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA module names: ['k_proj', 'gate_proj', 'o_proj', 'down_proj', 'up_proj', 'v_proj', 'q_proj']
trainable params: 159916032 || all params: 3529265152 || trainable%: 4.531142464866295
Start Training


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.433645,0.7787,0.800248,0.7787,0.776039,0.797236,0.783045,0.776783
2,No log,0.280652,0.877593,0.879861,0.877593,0.877198,0.880843,0.875941,0.876855
3,0.440200,0.246894,0.903181,0.90335,0.903181,0.903203,0.902984,0.903355,0.903107
4,0.440200,0.234971,0.911826,0.911837,0.911826,0.911807,0.911877,0.911525,0.911676
5,0.136100,0.238215,0.911134,0.911404,0.911134,0.911067,0.91173,0.910484,0.910904


Checkpoint destination directory model-output/checkpoint-225 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory model-output/checkpoint-450 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory model-output/checkpoint-675 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1125, training_loss=0.26546480475531686, metrics={'train_runtime': 11492.5872, 'train_samples_per_second': 2.936, 'train_steps_per_second': 0.098, 'total_flos': 1.6260004392689664e+17, 'train_loss': 0.26546480475531686, 'epoch': 5.0})

In [None]:
# Write the trainer's model to the local 'llama2' directory.
trainer.save_model('llama2')

In [None]:
# Copy the saved 'llama2' directory into the project folder on the mounted Google Drive.
# NOTE(review): this path only exists when Drive is mounted at /content/gdrive (Colab).
!cp -r llama2/ /content/gdrive/MyDrive/CSI5137-project/