# Cross-Validation

Run five-fold cross-validation for each model (DeBERTa, BERT, and Llama) on the public Dronology dataset.

In [None]:
# Install (or upgrade to the latest release of) the third-party packages used in this notebook.
# NOTE(review): versions are not pinned. The recorded run resolved transformers 4.38.2,
# datasets 2.18.0, evaluate 0.4.1, accelerate 0.27.2 and sentencepiece 0.2.0 -- consider
# pinning these for reproducibility.
!pip install -U transformers datasets evaluate accelerate sentencepiece setfit trl peft bitsandbytes

# Optional: uncomment the lines below to upload a fine-tuned model to the Hugging Face Hub.
# They install git-lfs, set the git identity, and open the Hub login prompt.
# !apt install git-lfs
# !git config --global user.email "YOUR_EMAIL_ADDRESS"
# !git config --global user.name "YOUR_USER_NAME"
# from huggingface_hub import notebook_login
# notebook_login()

Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import shutil

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict

# Decide where the fold CSVs are read from. On Google Colab the project's
# Drive folder is mounted and used; anywhere else the local 'data/' directory
# is used instead.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so we are not on Colab: use local data.
    DATA_PATH = 'data/'
else:
    # Only the missing-module case falls back to 'data/'. A failed mount
    # (or a KeyboardInterrupt) now surfaces instead of being swallowed and
    # silently pointing the notebook at a different data directory.
    drive.mount('/content/gdrive')

    DATA_PATH = '/content/gdrive/MyDrive/Automated-Requirements-Classification/data/'

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

Mounted at /content/gdrive


device(type='cuda', index=0)

In [None]:
# Copy models.py from the mounted Drive project folder into /content/ --
# presumably so the later `from models import ...` cells can find it (confirm
# that /content/ is the kernel's working directory). Colab-only path.
!cp /content/gdrive/MyDrive/Automated-Requirements-Classification/models.py /content/

## Load data

In [None]:
def read_data(train_path: str, test_path: str) -> pd.DataFrame:
    train = pd.read_csv(DATA_PATH + train_path)
    test = pd.read_csv(DATA_PATH + test_path)

    train['text'] = train['STR.REQ']
    train['label'] = train['class']
    train = train.drop(['issueid', 'STR.REQ', 'class'], axis=1)

    test['text'] = test['STR.REQ']
    test['label'] = test['class']
    test = test.drop(['issueid', 'STR.REQ', 'class'], axis=1)

    return train, test

## DeBERTa

In [None]:
# Per-fold DeBERTa scores: one (initially empty) list per metric. The fold
# loop appends a value to each list whose key appears in a fold's evaluation dict.
evl_results = {
    metric: []
    for metric in (
        'eval_accuracy',
        'eval_weighted precision',
        'eval_weighted recall',
        'eval_weighted f1',
        'eval_macro precision',
        'eval_macro recall',
        'eval_macro f1',
    )
}

In [None]:
from models import DeBERTaModel

# Five-fold cross-validation of DeBERTa: for every fold, load the split,
# fine-tune a fresh model, and collect the tracked metrics in evl_results.
model_name = 'microsoft/deberta-v3-large'  # same checkpoint for every fold

for i in range(1, 6):
    print('\n------------------------------')
    print(f'Fine-tuning DeBERTa on fold {i}')

    # The recorded output shows checkpoints being written to this directory.
    # Remove leftovers from an earlier (e.g. interrupted) run first, so that
    # training does not save into a non-empty checkpoint directory.
    output_dir = f'deberta-ReqORNot-fold{i}'
    shutil.rmtree(output_dir, ignore_errors=True)

    # Read data for each fold
    train_fold, test_fold = read_data(f'dronology_five_folds/fold_{i}/train_fold_{i}.csv', f'dronology_five_folds/fold_{i}/test_fold_{i}.csv')
    train = Dataset.from_pandas(train_fold)
    test = Dataset.from_pandas(test_fold)
    data = DatasetDict({'train': train, 'test': test})

    # Build model and fine-tune it on each fold
    deberta_model = DeBERTaModel(model_name, device)
    evl_result = deberta_model.train(data, f'fold{i}', epochs=10, push_to_hub=False)

    # Keep only the metrics tracked in evl_results (loss, runtime, etc. are skipped)
    for k, v in evl_result.items():
        if k in evl_results:
            evl_results[k].append(v)

    print(f'Evaluation result of fold {i}')
    print(evl_result)
    print('------------------------------\n')

    # Free disk space: the fold's checkpoints are not needed once it is scored.
    # shutil.rmtree replaces the shell `rm -r`, so no shell is spawned and a
    # missing directory is ignored quietly.
    shutil.rmtree(output_dir, ignore_errors=True)


------------------------------
Fine-tuning DeBERTa on fold 1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/303 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.377208,0.828947,0.881758,0.828947,0.838079,0.795341,0.867857,0.808193
2,No log,0.590296,0.736842,0.542936,0.736842,0.625199,0.368421,0.5,0.424242
3,No log,0.458776,0.855263,0.854251,0.855263,0.844169,0.851648,0.757143,0.787115
4,No log,0.374633,0.894737,0.892619,0.894737,0.89289,0.873563,0.848214,0.859649
5,No log,0.514467,0.881579,0.879963,0.881579,0.880585,0.850877,0.839286,0.844792
6,No log,0.66354,0.855263,0.850659,0.855263,0.851283,0.823031,0.789286,0.803525
7,0.325600,0.689223,0.881579,0.879963,0.881579,0.880585,0.850877,0.839286,0.844792
8,0.325600,0.67844,0.868421,0.868421,0.868421,0.868421,0.830357,0.830357,0.830357
9,0.325600,0.723754,0.881579,0.879963,0.881579,0.880585,0.850877,0.839286,0.844792
10,0.325600,0.727567,0.881579,0.879963,0.881579,0.880585,0.850877,0.839286,0.844792


Checkpoint destination directory deberta-ReqORNot-fold1/checkpoint-76 already exists and is non-empty. Saving will proceed but saved results may be invalid.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory deberta-ReqORNot-fold1/checkpoint-152 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory deberta-ReqORNot-fold1/checkpoint-228 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory deberta-ReqORNot-fold1/checkpoint-304 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory deberta-ReqORNot-fold1/checkpoint-380 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory deberta-ReqORNot-fold1/checkpoint-456 already exists and is non-empt

Evaluation result of fold 1
{'eval_loss': 0.3746330142021179, 'eval_accuracy': 0.8947368421052632, 'eval_weighted precision': 0.8926194797338173, 'eval_weighted recall': 0.8947368421052632, 'eval_weighted f1': 0.8928901200369344, 'eval_macro precision': 0.8735632183908046, 'eval_macro recall': 0.8482142857142857, 'eval_macro f1': 0.8596491228070174, 'eval_runtime': 1.5261, 'eval_samples_per_second': 49.8, 'eval_steps_per_second': 12.45, 'epoch': 10.0}
------------------------------


------------------------------
Fine-tuning DeBERTa on fold 2


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/303 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.585381,0.736842,0.542936,0.736842,0.625199,0.368421,0.5,0.424242
2,No log,1.105983,0.671053,0.686887,0.671053,0.678001,0.591879,0.6,0.594623
3,No log,0.97621,0.644737,0.650399,0.644737,0.647469,0.548485,0.55,0.54911
4,No log,0.869129,0.763158,0.738777,0.763158,0.723866,0.702206,0.598214,0.605991
5,No log,1.195996,0.723684,0.728184,0.723684,0.725809,0.647186,0.651786,0.649308
6,No log,1.161293,0.802632,0.799631,0.802632,0.800975,0.745614,0.7375,0.741321
7,0.353700,1.102415,0.828947,0.82111,0.828947,0.820384,0.792896,0.739286,0.75873
8,0.353700,1.153988,0.828947,0.82269,0.828947,0.824244,0.785145,0.755357,0.767803
9,0.353700,1.168158,0.828947,0.82111,0.828947,0.820384,0.792896,0.739286,0.75873
10,0.353700,1.171344,0.828947,0.82111,0.828947,0.820384,0.792896,0.739286,0.75873


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation result of fold 2
{'eval_loss': 1.1539884805679321, 'eval_accuracy': 0.8289473684210527, 'eval_weighted precision': 0.8226898252610589, 'eval_weighted recall': 0.8289473684210527, 'eval_weighted f1': 0.8242439235574247, 'eval_macro precision': 0.7851445663010967, 'eval_macro recall': 0.7553571428571428, 'eval_macro f1': 0.7678025851938894, 'eval_runtime': 1.4409, 'eval_samples_per_second': 52.745, 'eval_steps_per_second': 13.186, 'epoch': 10.0}
------------------------------


------------------------------
Fine-tuning DeBERTa on fold 3


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.374747,0.76,0.819178,0.76,0.678693,0.876712,0.55,0.520597
2,No log,0.35494,0.866667,0.888017,0.866667,0.871516,0.825746,0.877273,0.843227
3,No log,0.551666,0.84,0.839153,0.84,0.825424,0.837302,0.731818,0.761653
4,No log,0.616566,0.84,0.83466,0.84,0.83002,0.819087,0.747727,0.771805
5,No log,0.72166,0.866667,0.866667,0.866667,0.866667,0.829545,0.829545,0.829545
6,No log,0.682944,0.88,0.882187,0.88,0.880913,0.843915,0.854545,0.848959
7,0.344500,0.828689,0.853333,0.851253,0.853333,0.852114,0.81485,0.804545,0.809425
8,0.344500,0.895478,0.84,0.84,0.84,0.84,0.795455,0.795455,0.795455
9,0.344500,0.958115,0.84,0.84,0.84,0.84,0.795455,0.795455,0.795455
10,0.344500,0.965418,0.84,0.84,0.84,0.84,0.795455,0.795455,0.795455


Evaluation result of fold 3
{'eval_loss': 0.6829436421394348, 'eval_accuracy': 0.88, 'eval_weighted precision': 0.8821869488536155, 'eval_weighted recall': 0.88, 'eval_weighted f1': 0.8809129559185499, 'eval_macro precision': 0.843915343915344, 'eval_macro recall': 0.8545454545454545, 'eval_macro f1': 0.8489594987692997, 'eval_runtime': 1.8027, 'eval_samples_per_second': 41.604, 'eval_steps_per_second': 10.54, 'epoch': 10.0}
------------------------------


------------------------------
Fine-tuning DeBERTa on fold 4


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.489366,0.733333,0.675799,0.733333,0.642992,0.619863,0.515909,0.46733
2,No log,0.418801,0.773333,0.779048,0.773333,0.718933,0.785714,0.590909,0.592
3,No log,0.381251,0.84,0.885926,0.84,0.848,0.805556,0.875,0.82
4,No log,0.307649,0.946667,0.950282,0.946667,0.944639,0.966102,0.9,0.926901
5,No log,0.563571,0.84,0.885926,0.84,0.848,0.805556,0.875,0.82
6,No log,0.321468,0.92,0.918908,0.92,0.918609,0.909357,0.881818,0.894267
7,0.415400,0.422417,0.906667,0.905514,0.906667,0.905891,0.885338,0.872727,0.878725
8,0.415400,0.456581,0.893333,0.897885,0.893333,0.894885,0.858062,0.879545,0.867725
9,0.415400,0.456657,0.92,0.92,0.92,0.92,0.897727,0.897727,0.897727
10,0.415400,0.443409,0.933333,0.932644,0.933333,0.932779,0.920583,0.906818,0.913375


Evaluation result of fold 4
{'eval_loss': 0.30764856934547424, 'eval_accuracy': 0.9466666666666667, 'eval_weighted precision': 0.9502824858757062, 'eval_weighted recall': 0.9466666666666667, 'eval_weighted f1': 0.9446393762183235, 'eval_macro precision': 0.9661016949152542, 'eval_macro recall': 0.9, 'eval_macro f1': 0.9269005847953217, 'eval_runtime': 1.4933, 'eval_samples_per_second': 50.223, 'eval_steps_per_second': 12.723, 'epoch': 10.0}
------------------------------


------------------------------
Fine-tuning DeBERTa on fold 5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.465754,0.746667,0.557511,0.746667,0.638372,0.373333,0.5,0.427481
2,No log,0.816731,0.773333,0.756784,0.773333,0.761005,0.69555,0.656955,0.669775
3,No log,0.803583,0.773333,0.769513,0.773333,0.771273,0.69883,0.691729,0.695049
4,No log,1.101079,0.786667,0.767879,0.786667,0.765333,0.724432,0.648496,0.666667
5,No log,1.48279,0.76,0.778328,0.76,0.766984,0.693562,0.717575,0.702381
6,No log,1.374564,0.8,0.791102,0.8,0.794037,0.736229,0.709586,0.720497
7,0.300200,1.564739,0.786667,0.794638,0.786667,0.790061,0.720899,0.735432,0.727273
8,0.300200,1.565123,0.8,0.796686,0.8,0.798182,0.73538,0.726974,0.730926
9,0.300200,1.557975,0.8,0.796686,0.8,0.798182,0.73538,0.726974,0.730926
10,0.300200,1.585325,0.8,0.803636,0.8,0.801663,0.736364,0.744361,0.740125


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation result of fold 5
{'eval_loss': 1.5853251218795776, 'eval_accuracy': 0.8, 'eval_weighted precision': 0.8036363636363636, 'eval_weighted recall': 0.8, 'eval_weighted f1': 0.8016632016632016, 'eval_macro precision': 0.7363636363636363, 'eval_macro recall': 0.744360902255639, 'eval_macro f1': 0.7401247401247402, 'eval_runtime': 1.8289, 'eval_samples_per_second': 41.008, 'eval_steps_per_second': 10.389, 'epoch': 10.0}
------------------------------



In [None]:
# Print the mean of every tracked metric over the folds collected so far.
for metric, fold_scores in evl_results.items():
    print(metric, np.mean(fold_scores))

eval_accuracy 0.8700701754385965
eval_weighted precision 0.8702830206721123
eval_weighted recall 0.8700701754385965
eval_weighted f1 0.8688699154788868
eval_macro precision 0.8410176919772272
eval_macro recall 0.8204955570745044
eval_macro f1 0.8286873063380538


## BERT-base-uncased

In [None]:
# Reset the per-fold score lists before the BERT run: one fresh, empty list
# per tracked metric, in the same key order as before.
evl_results = dict(
    (metric, [])
    for metric in [
        'eval_accuracy',
        'eval_weighted precision',
        'eval_weighted recall',
        'eval_weighted f1',
        'eval_macro precision',
        'eval_macro recall',
        'eval_macro f1',
    ]
)

In [None]:
from models import DeBERTaModel

# Five-fold cross-validation of bert-base-uncased. The DeBERTaModel wrapper is
# reused with a BERT checkpoint (the recorded output shows
# BertForSequenceClassification being loaded), so the cleanup below still
# targets the 'deberta-ReqORNot-fold{i}' directory named in the original cell.
model_name = 'bert-base-uncased'  # same checkpoint for every fold

for i in range(1, 6):
    print('\n------------------------------')
    # The copied message used to say "DeBERTa" although BERT is being trained.
    print(f'Fine-tuning BERT on fold {i}')

    # Clear leftovers from a previous run (including the DeBERTa section,
    # which uses the same directory name) before training this fold.
    output_dir = f'deberta-ReqORNot-fold{i}'
    shutil.rmtree(output_dir, ignore_errors=True)

    # Read data for each fold
    train_fold, test_fold = read_data(f'dronology_five_folds/fold_{i}/train_fold_{i}.csv', f'dronology_five_folds/fold_{i}/test_fold_{i}.csv')
    train = Dataset.from_pandas(train_fold)
    test = Dataset.from_pandas(test_fold)
    data = DatasetDict({'train': train, 'test': test})

    # Build model and fine-tune it on each fold
    bert_model = DeBERTaModel(model_name, device)
    evl_result = bert_model.train(data, f'fold{i}', epochs=10, push_to_hub=False)

    # Keep only the metrics tracked in evl_results (loss, runtime, etc. are skipped)
    for k, v in evl_result.items():
        if k in evl_results:
            evl_results[k].append(v)

    print(f'Evaluation result of fold {i}')
    print(evl_result)
    print('------------------------------\n')

    # Free disk space without spawning a shell; a missing directory is ignored.
    shutil.rmtree(output_dir, ignore_errors=True)


------------------------------
Fine-tuning DeBERTa on fold 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/303 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.477674,0.736842,0.542936,0.736842,0.625199,0.368421,0.5,0.424242
2,No log,0.237425,0.921053,0.939271,0.921053,0.923967,0.884615,0.946429,0.906481
3,No log,0.294164,0.881579,0.91833,0.881579,0.887281,0.844828,0.919643,0.864474
4,No log,0.545217,0.881579,0.878627,0.881579,0.878323,0.860917,0.823214,0.839248
5,No log,0.411541,0.907895,0.931774,0.907895,0.911683,0.87037,0.9375,0.892199
6,No log,0.633445,0.868421,0.865295,0.868421,0.866113,0.837165,0.814286,0.824561
7,0.309000,0.67148,0.881579,0.883755,0.881579,0.88249,0.844589,0.855357,0.849703
8,0.309000,0.568861,0.894737,0.914413,0.894737,0.898623,0.855385,0.9125,0.875308
9,0.309000,0.584437,0.894737,0.899256,0.894737,0.896286,0.858586,0.880357,0.868398
10,0.309000,0.607834,0.894737,0.899256,0.894737,0.896286,0.858586,0.880357,0.868398


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation result of fold 1
{'eval_loss': 0.23742510378360748, 'eval_accuracy': 0.9210526315789473, 'eval_weighted precision': 0.9392712550607287, 'eval_weighted recall': 0.9210526315789473, 'eval_weighted f1': 0.923967013514097, 'eval_macro precision': 0.8846153846153846, 'eval_macro recall': 0.9464285714285714, 'eval_macro f1': 0.9064807219031994, 'eval_runtime': 1.2381, 'eval_samples_per_second': 61.383, 'eval_steps_per_second': 15.346, 'epoch': 10.0}
------------------------------


------------------------------
Fine-tuning DeBERTa on fold 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/303 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.509723,0.75,0.718076,0.75,0.713765,0.665837,0.589286,0.595178
2,No log,0.678442,0.776316,0.772853,0.776316,0.774438,0.710526,0.703571,0.70683
3,No log,1.111636,0.802632,0.790373,0.802632,0.787504,0.758852,0.689286,0.709702
4,No log,1.095195,0.802632,0.791487,0.802632,0.792751,0.751366,0.705357,0.721612
5,No log,1.226181,0.789474,0.796828,0.789474,0.792572,0.73064,0.744643,0.736797
6,No log,1.263972,0.815789,0.810647,0.815789,0.812558,0.764368,0.746429,0.754386
7,0.185300,1.394526,0.815789,0.805967,0.815789,0.804219,0.776498,0.714286,0.734796
8,0.185300,1.327161,0.802632,0.799631,0.802632,0.800975,0.745614,0.7375,0.741321
9,0.185300,1.347233,0.802632,0.799631,0.802632,0.800975,0.745614,0.7375,0.741321
10,0.185300,1.35257,0.828947,0.82269,0.828947,0.824244,0.785145,0.755357,0.767803


Evaluation result of fold 2
{'eval_loss': 1.3525704145431519, 'eval_accuracy': 0.8289473684210527, 'eval_weighted precision': 0.8226898252610589, 'eval_weighted recall': 0.8289473684210527, 'eval_weighted f1': 0.8242439235574247, 'eval_macro precision': 0.7851445663010967, 'eval_macro recall': 0.7553571428571428, 'eval_macro f1': 0.7678025851938894, 'eval_runtime': 1.0654, 'eval_samples_per_second': 71.337, 'eval_steps_per_second': 17.834, 'epoch': 10.0}
------------------------------


------------------------------
Fine-tuning DeBERTa on fold 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.360116,0.853333,0.861174,0.853333,0.837589,0.87642,0.740909,0.776362
2,No log,0.349793,0.84,0.885926,0.84,0.848,0.805556,0.875,0.82
3,No log,0.481971,0.88,0.896,0.88,0.88381,0.84,0.886364,0.857143
4,No log,0.624643,0.893333,0.896799,0.893333,0.88668,0.906909,0.815909,0.84787
5,No log,0.754849,0.866667,0.879085,0.866667,0.870212,0.824755,0.861364,0.839194
6,No log,0.807629,0.853333,0.86243,0.853333,0.856394,0.809365,0.836364,0.820691
7,0.238100,0.858917,0.866667,0.871927,0.866667,0.868607,0.825901,0.845455,0.834656
8,0.238100,0.855407,0.866667,0.871927,0.866667,0.868607,0.825901,0.845455,0.834656
9,0.238100,0.890375,0.866667,0.871927,0.866667,0.868607,0.825901,0.845455,0.834656
10,0.238100,0.921656,0.866667,0.871927,0.866667,0.868607,0.825901,0.845455,0.834656


Evaluation result of fold 3
{'eval_loss': 0.48197123408317566, 'eval_accuracy': 0.88, 'eval_weighted precision': 0.8959999999999998, 'eval_weighted recall': 0.88, 'eval_weighted f1': 0.8838095238095239, 'eval_macro precision': 0.84, 'eval_macro recall': 0.8863636363636364, 'eval_macro f1': 0.8571428571428572, 'eval_runtime': 1.142, 'eval_samples_per_second': 65.673, 'eval_steps_per_second': 16.637, 'epoch': 10.0}
------------------------------


------------------------------
Fine-tuning DeBERTa on fold 4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.385856,0.813333,0.874612,0.813333,0.823443,0.785247,0.856818,0.793956
2,No log,0.283788,0.866667,0.888017,0.866667,0.871516,0.825746,0.877273,0.843227
3,No log,0.980911,0.773333,0.843867,0.773333,0.785943,0.748918,0.813636,0.751994
4,No log,0.764302,0.84,0.873556,0.84,0.847059,0.800152,0.859091,0.816176
5,No log,0.90665,0.84,0.873556,0.84,0.847059,0.800152,0.859091,0.816176
6,No log,0.805988,0.866667,0.888017,0.866667,0.871516,0.825746,0.877273,0.843227
7,0.247500,0.881374,0.866667,0.888017,0.866667,0.871516,0.825746,0.877273,0.843227
8,0.247500,0.962126,0.853333,0.880556,0.853333,0.859272,0.8125,0.868182,0.829581
9,0.247500,0.951765,0.853333,0.880556,0.853333,0.859272,0.8125,0.868182,0.829581
10,0.247500,0.862646,0.853333,0.880556,0.853333,0.859272,0.8125,0.868182,0.829581


Evaluation result of fold 4
{'eval_loss': 0.2837880849838257, 'eval_accuracy': 0.8666666666666667, 'eval_weighted precision': 0.8880167451596022, 'eval_weighted recall': 0.8666666666666667, 'eval_weighted f1': 0.8715161649944256, 'eval_macro precision': 0.82574568288854, 'eval_macro recall': 0.8772727272727272, 'eval_macro f1': 0.8432274247491638, 'eval_runtime': 1.4145, 'eval_samples_per_second': 53.024, 'eval_steps_per_second': 13.433, 'epoch': 10.0}
------------------------------


------------------------------
Fine-tuning DeBERTa on fold 5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.568182,0.72,0.705636,0.72,0.711652,0.617055,0.603853,0.608696
2,No log,0.614589,0.746667,0.793901,0.746667,0.759854,0.696809,0.743421,0.705639
3,No log,1.310577,0.733333,0.801369,0.733333,0.749333,0.696481,0.75188,0.7
4,No log,1.350996,0.76,0.778328,0.76,0.766984,0.693562,0.717575,0.702381
5,No log,1.446009,0.746667,0.751152,0.746667,0.748773,0.668182,0.673872,0.670825
6,No log,1.369638,0.773333,0.762613,0.773333,0.766576,0.696504,0.674342,0.68323
7,0.247900,1.571131,0.746667,0.742339,0.746667,0.744364,0.662281,0.656485,0.659172
8,0.247900,1.690531,0.706667,0.738667,0.706667,0.718365,0.64,0.664474,0.646226
9,0.247900,1.616306,0.76,0.76,0.76,0.76,0.682801,0.682801,0.682801
10,0.247900,1.643727,0.76,0.768748,0.76,0.763818,0.687831,0.700188,0.693182


Evaluation result of fold 5
{'eval_loss': 0.6145890355110168, 'eval_accuracy': 0.7466666666666667, 'eval_weighted precision': 0.7939007092198581, 'eval_weighted recall': 0.7466666666666667, 'eval_weighted f1': 0.7598540246505543, 'eval_macro precision': 0.6968085106382979, 'eval_macro recall': 0.743421052631579, 'eval_macro f1': 0.7056393307167941, 'eval_runtime': 1.2705, 'eval_samples_per_second': 59.031, 'eval_steps_per_second': 14.954, 'epoch': 10.0}
------------------------------



In [None]:
# Print the mean of every tracked metric over the folds collected so far.
for metric, fold_scores in evl_results.items():
    print(metric, np.mean(fold_scores))

eval_accuracy 0.8486666666666667
eval_weighted precision 0.8679757069402495
eval_weighted recall 0.8486666666666667
eval_weighted f1 0.8526781301052051
eval_macro precision 0.8064628288886638
eval_macro recall 0.8417686261107313
eval_macro f1 0.8160585839411807


  and should_run_async(code)


## Llama

In [None]:
# Reset the per-fold score lists before the Llama run: each tracked metric
# maps to its own new, empty list.
evl_results = {
    metric: list()
    for metric in [
        'eval_accuracy',
        'eval_weighted precision',
        'eval_weighted recall',
        'eval_weighted f1',
        'eval_macro precision',
        'eval_macro recall',
        'eval_macro f1',
    ]
}

In [None]:
import shutil

from models import LlamaModel

# Take the Hugging Face access token from the HF_TOKEN environment variable so
# the secret never has to be pasted into (and saved with) the notebook. The
# original placeholder is kept as the fallback, so nothing changes when the
# variable is not set.
hf_token = os.environ.get('HF_TOKEN', 'YOUR_HF_ACCESS_TOKEN')
model_name = 'meta-llama/Llama-2-7b-hf'

# Fine-tune and evaluate Llama once per cross-validation fold (folds 1..5),
# appending each tracked metric to the matching list in evl_results.
for i in range(1, 6):
    print('\n------------------------------')
    print(f'Fine-tuning Llama on fold {i}')

    # Read data for each fold
    train_fold, test_fold = read_data(f'dronology_five_folds/fold_{i}/train_fold_{i}.csv', f'dronology_five_folds/fold_{i}/test_fold_{i}.csv')
    train = Dataset.from_pandas(train_fold)
    test = Dataset.from_pandas(test_fold)
    data = DatasetDict({'train': train, 'test': test})

    # Build model and fine-tune it on each fold; a new LlamaModel instance is
    # constructed for every fold.
    llama_model = LlamaModel(model_name, hf_token)
    evl_result = llama_model.train(data, f'fold{i}', epochs=7, push_to_hub=False)

    # Save evaluation results; keys that evl_results does not track
    # (e.g. 'eval_loss', 'eval_runtime') are ignored.
    for k, v in evl_result.items():
        if k in evl_results.keys():
            evl_results[k].append(v)

    print(f'Evaluation result of fold {i}')
    print(evl_result)
    print('------------------------------\n')

    # Remove the 'llama-output' directory before the next fold (presumably the
    # checkpoint directory written by LlamaModel.train -- confirm in models.py).
    # shutil.rmtree avoids spawning a shell, is portable, and with
    # ignore_errors=True stays best-effort when the directory is missing, just
    # like the unchecked `rm -r` call it replaces.
    shutil.rmtree('llama-output', ignore_errors=True)


------------------------------
Fine-tuning Llama on fold 1


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA module names: ['v_proj', 'k_proj', 'down_proj', 'gate_proj', 'o_proj', 'up_proj', 'q_proj']




Map:   0%|          | 0/303 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.832142,0.657895,0.657895,0.657895,0.657895,0.558929,0.558929,0.558929
2,No log,0.774715,0.710526,0.692325,0.710526,0.699435,0.610417,0.594643,0.599617
3,No log,0.7174,0.710526,0.701351,0.710526,0.705448,0.618774,0.610714,0.614035
4,No log,0.685633,0.697368,0.692521,0.697368,0.694828,0.605263,0.601786,0.603358
5,No log,0.650898,0.710526,0.701351,0.710526,0.705448,0.618774,0.610714,0.614035
6,No log,0.629237,0.710526,0.701351,0.710526,0.705448,0.618774,0.610714,0.614035
7,No log,0.621607,0.710526,0.701351,0.710526,0.705448,0.618774,0.610714,0.614035


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for 

Evaluation result of fold 1
{'eval_loss': 0.7173997163772583, 'eval_accuracy': 0.7105263157894737, 'eval_weighted precision': 0.7013510788465417, 'eval_weighted recall': 0.7105263157894737, 'eval_weighted f1': 0.7054478301015698, 'eval_macro precision': 0.6187739463601533, 'eval_macro recall': 0.6107142857142858, 'eval_macro f1': 0.6140350877192983, 'eval_runtime': 6.3654, 'eval_samples_per_second': 11.939, 'eval_steps_per_second': 0.471, 'epoch': 7.0}
------------------------------


------------------------------
Fine-tuning Llama on fold 2


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA module names: ['v_proj', 'k_proj', 'down_proj', 'gate_proj', 'o_proj', 'up_proj', 'q_proj']


Map:   0%|          | 0/303 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.98094,0.552632,0.638803,0.552632,0.578669,0.528011,0.535714,0.511716
2,No log,0.74677,0.644737,0.613747,0.644737,0.626952,0.502186,0.501786,0.498901
3,No log,0.655764,0.697368,0.672994,0.697368,0.682218,0.585246,0.569643,0.573138
4,No log,0.622487,0.710526,0.710526,0.710526,0.710526,0.626786,0.626786,0.626786
5,No log,0.605129,0.697368,0.692521,0.697368,0.694828,0.605263,0.601786,0.603358
6,No log,0.596778,0.723684,0.719298,0.723684,0.721364,0.640351,0.635714,0.637849
7,No log,0.59546,0.723684,0.719298,0.723684,0.721364,0.640351,0.635714,0.637849



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must 

Evaluation result of fold 2
{'eval_loss': 0.5967783331871033, 'eval_accuracy': 0.7236842105263158, 'eval_weighted precision': 0.7192982456140351, 'eval_weighted recall': 0.7236842105263158, 'eval_weighted f1': 0.7213643366414675, 'eval_macro precision': 0.6403508771929824, 'eval_macro recall': 0.6357142857142857, 'eval_macro f1': 0.6378488767869299, 'eval_runtime': 8.6681, 'eval_samples_per_second': 8.768, 'eval_steps_per_second': 0.346, 'epoch': 7.0}
------------------------------


------------------------------
Fine-tuning Llama on fold 3


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA module names: ['v_proj', 'k_proj', 'down_proj', 'gate_proj', 'o_proj', 'up_proj', 'q_proj']


Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,0.991697,0.586667,0.64073,0.586667,0.60621,0.535607,0.543182,0.530208
2,No log,0.79124,0.666667,0.638889,0.666667,0.650104,0.541667,0.534091,0.534161
3,No log,0.713528,0.72,0.690819,0.72,0.698679,0.617866,0.586364,0.592075
4,No log,0.666969,0.706667,0.688418,0.706667,0.695517,0.608581,0.593182,0.597953
5,No log,0.644043,0.706667,0.688418,0.706667,0.695517,0.608581,0.593182,0.597953
6,No log,0.630019,0.706667,0.688418,0.706667,0.695517,0.608581,0.593182,0.597953
7,No log,0.625362,0.72,0.698889,0.72,0.706087,0.625,0.602273,0.608696



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must 

Evaluation result of fold 3
{'eval_loss': 0.6253615617752075, 'eval_accuracy': 0.72, 'eval_weighted precision': 0.698888888888889, 'eval_weighted recall': 0.72, 'eval_weighted f1': 0.706086956521739, 'eval_macro precision': 0.625, 'eval_macro recall': 0.6022727272727273, 'eval_macro f1': 0.6086956521739131, 'eval_runtime': 9.4277, 'eval_samples_per_second': 7.955, 'eval_steps_per_second': 0.318, 'epoch': 7.0}
------------------------------


------------------------------
Fine-tuning Llama on fold 4


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA module names: ['v_proj', 'k_proj', 'down_proj', 'gate_proj', 'o_proj', 'up_proj', 'q_proj']


Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,1.143322,0.493333,0.569961,0.493333,0.520774,0.458212,0.447727,0.440738
2,No log,0.916709,0.626667,0.586105,0.626667,0.603381,0.467799,0.475,0.467546
3,No log,0.781989,0.653333,0.602116,0.653333,0.621751,0.490079,0.493182,0.483581
4,No log,0.696289,0.64,0.608889,0.64,0.622112,0.5,0.5,0.496894
5,No log,0.651163,0.68,0.635979,0.68,0.650847,0.539683,0.527273,0.523305
6,No log,0.627791,0.68,0.635979,0.68,0.650847,0.539683,0.527273,0.523305
7,No log,0.620163,0.666667,0.626137,0.666667,0.641285,0.524814,0.518182,0.514375



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must 

Evaluation result of fold 4
{'eval_loss': 0.6511630415916443, 'eval_accuracy': 0.68, 'eval_weighted precision': 0.635978835978836, 'eval_weighted recall': 0.68, 'eval_weighted f1': 0.6508474576271186, 'eval_macro precision': 0.5396825396825397, 'eval_macro recall': 0.5272727272727272, 'eval_macro f1': 0.5233050847457628, 'eval_runtime': 4.8421, 'eval_samples_per_second': 15.489, 'eval_steps_per_second': 0.62, 'epoch': 7.0}
------------------------------


------------------------------
Fine-tuning Llama on fold 5


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA module names: ['v_proj', 'k_proj', 'down_proj', 'gate_proj', 'o_proj', 'up_proj', 'q_proj']


Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Weighted precision,Weighted recall,Weighted f1,Macro precision,Macro recall,Macro f1
1,No log,1.106347,0.52,0.638293,0.52,0.551185,0.516714,0.522086,0.487082
2,No log,0.801118,0.68,0.656889,0.68,0.666694,0.55,0.542293,0.543611
3,No log,0.687267,0.68,0.656889,0.68,0.666694,0.55,0.542293,0.543611
4,No log,0.651171,0.693333,0.709423,0.693333,0.700333,0.610206,0.620771,0.614008
5,No log,0.608773,0.72,0.715166,0.72,0.717455,0.625731,0.621241,0.623296
6,No log,0.594181,0.72,0.715166,0.72,0.717455,0.625731,0.621241,0.623296
7,No log,0.590194,0.72,0.715166,0.72,0.717455,0.625731,0.621241,0.623296



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must 

Evaluation result of fold 5
{'eval_loss': 0.608773410320282, 'eval_accuracy': 0.72, 'eval_weighted precision': 0.7151656920077972, 'eval_weighted recall': 0.72, 'eval_weighted f1': 0.7174551542693135, 'eval_macro precision': 0.6257309941520468, 'eval_macro recall': 0.6212406015037594, 'eval_macro f1': 0.6232958622339153, 'eval_runtime': 6.1031, 'eval_samples_per_second': 12.289, 'eval_steps_per_second': 0.492, 'epoch': 7.0}
------------------------------



In [None]:
# Report the mean of every Llama metric accumulated in evl_results
# (one value per cross-validation fold).
for metric, fold_scores in evl_results.items():
    print(metric, np.mean(fold_scores))

eval_accuracy 0.7108421052631579
eval_weighted precision 0.6941365482672197
eval_weighted recall 0.7108421052631579
eval_weighted f1 0.7002403470322417
eval_macro precision 0.6099076714775444
eval_macro recall 0.5994429254955571
eval_macro f1 0.6014361127319638


  and should_run_async(code)


## Few-Shot Learning

In [None]:
# Per-metric accumulators for the few-shot runs: the keys here carry no
# 'eval_' prefix, and each value is a fresh list that receives one score
# per fold.
evl_results = {
    metric: []
    for metric in (
        'accuracy',
        'weighted precision',
        'weighted recall',
        'weighted f1',
        'macro precision',
        'macro recall',
        'macro f1',
    )
}

In [None]:
from models import FewShotModel
import gc
from accelerate import Accelerator

# Accelerator handle; its free_memory() is called at the end of every fold.
accelerator = Accelerator()

# Train and evaluate the few-shot model once per cross-validation fold
# (folds 1..5), appending each tracked metric to evl_results.
for i in range(1, 6):
    print('\n------------------------------', f'Fine-tuning Few-Shot Model on fold {i}', sep='\n')

    # Load this fold's train/test CSVs and wrap them in a DatasetDict
    train_csv = f'dronology_five_folds/fold_{i}/train_fold_{i}.csv'
    test_csv = f'dronology_five_folds/fold_{i}/test_fold_{i}.csv'
    train_fold, test_fold = read_data(train_csv, test_csv)
    train, test = (Dataset.from_pandas(frame) for frame in (train_fold, test_fold))
    data = DatasetDict({'train': train, 'test': test})

    # A new FewShotModel instance is constructed and trained for every fold
    model_name = 'sentence-transformers/all-roberta-large-v1'
    few_shot_model = FewShotModel(model_name, device)
    evl_result = few_shot_model.train(data, f'fold{i}', push_to_hub=False)

    # Keep only the metrics that evl_results has a list for
    for k, v in evl_result.items():
        if k not in evl_results:
            continue
        evl_results[k].append(v)

    print(f'Evaluation result of fold {i}', evl_result, '------------------------------\n', sep='\n')

    # Best-effort memory cleanup before the next fold.
    # NOTE(review): few_shot_model still references this fold's model at this
    # point, so these calls may release little -- confirm whether the reference
    # should be dropped first.
    gc.collect()
    torch.cuda.empty_cache()
    accelerator.free_memory()


------------------------------
Fine-tuning Few-Shot Model on fold 1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 1200
  Batch size = 8
  Num epochs = 10
  Total optimization steps = 1500


Step,Training Loss


***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Evaluation result of fold 1
{'accuracy': 0.6973684210526315, 'weighted precision': 0.8377406931964057, 'weighted recall': 0.6973684210526315, 'weighted f1': 0.714541713678901, 'macro precision': 0.7174216027874565, 'macro recall': 0.7785714285714285, 'macro f1': 0.6851017834624392}
------------------------------


------------------------------
Fine-tuning Few-Shot Model on fold 2


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 1200
  Batch size = 8
  Num epochs = 10
  Total optimization steps = 1500


Step,Training Loss


***** Running evaluation *****


Evaluation result of fold 2
{'accuracy': 0.5394736842105263, 'weighted precision': 0.6323124885384193, 'weighted recall': 0.5394736842105263, 'weighted f1': 0.5666650224436443, 'macro precision': 0.5209059233449478, 'macro recall': 0.5267857142857143, 'macro f1': 0.5014058106841612}
------------------------------


------------------------------
Fine-tuning Few-Shot Model on fold 3


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 1200
  Batch size = 8
  Num epochs = 10
  Total optimization steps = 1500


Step,Training Loss


***** Running evaluation *****


Evaluation result of fold 3
{'accuracy': 0.6533333333333333, 'weighted precision': 0.8492753623188406, 'weighted recall': 0.6533333333333333, 'weighted f1': 0.6679653679653679, 'macro precision': 0.717391304347826, 'macro recall': 0.7636363636363637, 'macro f1': 0.6482683982683982}
------------------------------


------------------------------
Fine-tuning Few-Shot Model on fold 4


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 1200
  Batch size = 8
  Num epochs = 10
  Total optimization steps = 1500


Step,Training Loss


***** Running evaluation *****


Evaluation result of fold 4
{'accuracy': 0.76, 'weighted precision': 0.8238372093023255, 'weighted recall': 0.76, 'weighted f1': 0.7729984301412872, 'macro precision': 0.7307412790697674, 'macro recall': 0.7886363636363636, 'macro f1': 0.7350863422291993}
------------------------------


------------------------------
Fine-tuning Few-Shot Model on fold 5


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 1200
  Batch size = 8
  Num epochs = 10
  Total optimization steps = 1500


Step,Training Loss


***** Running evaluation *****


Evaluation result of fold 5
{'accuracy': 0.7466666666666667, 'weighted precision': 0.7708496732026143, 'weighted recall': 0.7466666666666667, 'weighted f1': 0.7554763457219446, 'macro precision': 0.6813725490196079, 'macro recall': 0.7086466165413534, 'macro f1': 0.6902847207128886}
------------------------------



In [None]:
# Report the mean of every few-shot metric accumulated in evl_results
# (one value per cross-validation fold).
for metric, fold_scores in evl_results.items():
    print(metric, np.mean(fold_scores))

accuracy 0.6793684210526316
weighted precision 0.782803085311721
weighted recall 0.6793684210526316
weighted f1 0.695529375990229
macro precision 0.6735665317139212
macro recall 0.7132552973342448
macro f1 0.6520294110714173


  and should_run_async(code)
