<a href="https://colab.research.google.com/github/MNderi/verityVault/blob/main/SquenceTrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the CSV datasets and model folders under
# /content/drive/MyDrive are reachable from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

In [None]:
pip install accelerate -U




In [None]:
pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
# Import necessary libraries
import argparse
from torch import nn
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Features, Value, ClassLabel, load_metric
import numpy as np
import os
import json
from sklearn.metrics import classification_report, confusion_matrix
import glob, shutil

# Function to load and tokenize the data
def load_data(path, class_list, tokenizer_name="", num_classes=13):
    """Read one labelled CSV file and return it tokenized.

    Args:
        path: CSV file to load; it is registered as the ``"test"`` split.
        class_list: Label names handed to ``ClassLabel``.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        num_classes: Number of classes handed to ``ClassLabel``.

    Returns:
        The tokenized ``"test"`` split.
    """
    schema = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes, class_list),
    })
    raw_splits = load_dataset(
        'csv',
        data_files={"test": path},
        delimiter=',',
        column_names=['text', 'label'],
        skiprows=1,
        features=schema,
        keep_in_memory=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Only "nli-bert" tokenizers get an explicit 512 cap; otherwise max_length stays None.
    length_cap = 512 if "nli-bert" in tokenizer_name else None

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=length_cap)

    return raw_splits.map(tokenize_function, batched=True)["test"]

# Main function to train the model
def main(model_name, directory, tokenizer_name, dataset_train, dataset_val, dataset_test, num_classes, class_list=None, problem_type=None, epochs=10, run_num=1):
    """Fine-tune a sequence-classification model on CSV splits and log test metrics.

    Args:
        model_name: Checkpoint handed to ``AutoModelForSequenceClassification.from_pretrained``.
        directory: ``output_dir`` of the ``Trainer``; the metrics log is written here too.
        tokenizer_name: Checkpoint handed to ``AutoTokenizer.from_pretrained``; it also
            selects the batch size and forms the log file name.
        dataset_train: CSV path for the ``"train"`` split.
        dataset_val: CSV path for the ``"val"`` split.
        dataset_test: CSV path for the ``"test"`` split.
        num_classes: Number of labels for ``ClassLabel`` and for the model head.
        class_list: Label names handed to ``ClassLabel``.
        problem_type: Forwarded to ``from_pretrained``.
        epochs: Value used for ``num_train_epochs``.
        run_num: Appended to the log file name.
    """
    # --- Data -----------------------------------------------------------
    schema = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes, class_list),
    })
    csv_splits = {"train": dataset_train, "val": dataset_val, "test": dataset_test}
    raw_splits = load_dataset(
        'csv',
        data_files=csv_splits,
        delimiter=',',
        column_names=['text', 'label'],
        skiprows=1,
        features=schema,
        keep_in_memory=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Only "nli-bert" tokenizers get an explicit 512 cap; otherwise max_length stays None.
    length_cap = 512 if "nli-bert" in tokenizer_name else None

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=length_cap)

    tokenized = raw_splits.map(tokenize_function, batched=True)
    train_dataset = tokenized["train"].shuffle(seed=42)
    eval_dataset = tokenized["val"]
    test_dataset = tokenized["test"]

    # --- Model ----------------------------------------------------------
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_classes,
        ignore_mismatched_sizes=True,
        problem_type=problem_type,
    )

    # --- Metrics --------------------------------------------------------
    metric = load_metric("accuracy")
    precision = load_metric("precision")
    recall = load_metric("recall")
    f1 = load_metric("f1")

    def compute_metrics(eval_pred):
        """Turn (logits, labels) into accuracy / precision / recall / F1 scores."""
        logits, labels = eval_pred
        scored = {"predictions": np.argmax(logits, axis=-1), "references": labels}
        return {
            "accuracy": metric.compute(**scored)["accuracy"],
            "precision": precision.compute(average="weighted", **scored)["precision"],
            "recall": recall.compute(average="weighted", **scored)["recall"],
            "f1_weighted": f1.compute(average="weighted", **scored)["f1"],
            "f1_micro": f1.compute(average="micro", **scored)["f1"],
        }

    # --- Trainer --------------------------------------------------------
    # DeBERTa / ELECTRA tokenizers run with half the batch size of everything else.
    batch_size = 16 if ("deberta" in tokenizer_name or "electra" in tokenizer_name) else 32
    training_args = TrainingArguments(
        output_dir=directory,
        do_train=True,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=5e-5,
        logging_strategy="epoch",
        num_train_epochs=epochs,
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1_weighted",
        lr_scheduler_type="cosine",
    )

    class CustomTrainer(Trainer):
        """Keyword-only pass-through to ``Trainer``; adds no behavior of its own."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Clear checkpoints left in the output directory by earlier runs.
    for stale_checkpoint in glob.glob(directory + "/checkpoint*"):
        shutil.rmtree(stale_checkpoint)

    trainer.train()

    # Write the test-split metrics as JSON next to the checkpoints.
    log_path = os.path.join(directory, f"{tokenizer_name.replace('/', '_') + str(run_num)}.log")
    with open(log_path, "w+") as log_file:
        log_file.write(json.dumps(trainer.predict(test_dataset).metrics))

# Run configuration: values are hard-coded for this notebook (nothing is parsed
# from the command line).
model_name = "bert-base-uncased"
tokenizer_name = "bert-base-uncased"
directory = "./big_bench_model"

dataset_train = "/content/drive/MyDrive/Trainer/GeneralClassification/train.csv"
dataset_val = "/content/drive/MyDrive/Trainer/GeneralClassification/dev.csv"
dataset_test = "/content/drive/MyDrive/Trainer/GeneralClassification/test.csv"

class_list = ["negative", "positive"]
num_classes = 2
epochs = 5

# Launch training with the configuration above.
main(
    model_name,
    directory,
    tokenizer_name,
    dataset_train,
    dataset_val,
    dataset_test,
    num_classes=num_classes,
    class_list=class_list,
    problem_type="single_label_classification",
    epochs=epochs,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1960 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("accuracy")


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Weighted,F1 Micro
1,0.6019,0.430838,0.780952,0.782141,0.780952,0.779745,0.780952
2,0.3652,0.299087,0.82381,0.866758,0.82381,0.820843,0.82381
3,0.2942,0.29761,0.82619,0.868087,0.82619,0.823376,0.82619
4,0.2454,0.295385,0.82381,0.843939,0.82381,0.822848,0.82381
5,0.2177,0.296495,0.819048,0.823094,0.819048,0.819212,0.819048


In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset, Features, Value, ClassLabel, load_metric
import numpy as np
import os
import json
import shutil
import glob

# Function to load and tokenize the data
def load_data(path, class_list, tokenizer_name="", num_classes=2):
    """Load a single CSV file as a tokenized evaluation split.

    Args:
        path: CSV file to load; it is registered as the ``"test"`` split.
        class_list: Label names handed to ``ClassLabel``.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        num_classes: Number of classes handed to ``ClassLabel``.

    Returns:
        The tokenized ``"test"`` split.
    """
    label_feature = ClassLabel(num_classes, class_list)
    column_schema = Features({'text': Value('string'), 'label': label_feature})

    csv_dataset = load_dataset(
        'csv',
        data_files={"test": path},
        delimiter=',',
        column_names=['text', 'label'],
        skiprows=1,
        features=column_schema,
        keep_in_memory=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def tokenize_function(examples):
        # Explicit 512 cap for "nli-bert" tokenizers only; otherwise max_length is None.
        if "nli-bert" in tokenizer_name:
            max_len = 512
        else:
            max_len = None
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_len)

    encoded = csv_dataset.map(tokenize_function, batched=True)
    return encoded["test"]

# Main function to train the model
def main(model_name, directory, tokenizer_name, dataset_train, dataset_val, dataset_test, num_classes, class_list=None, problem_type=None, epochs=5,
         save_dir="/content/drive/MyDrive/Trainer/GeneralClassification/broadmodel"):
    """Fine-tune a sequence classifier, save the best model and log test metrics.

    Args:
        model_name: Checkpoint handed to ``AutoModelForSequenceClassification.from_pretrained``.
        directory: ``output_dir`` of the ``Trainer``; the metrics log is written here too.
        tokenizer_name: Checkpoint handed to ``AutoTokenizer.from_pretrained``; it also
            selects the batch size and forms the log file name.
        dataset_train: CSV path for the ``"train"`` split.
        dataset_val: CSV path for the ``"val"`` split.
        dataset_test: CSV path for the ``"test"`` split.
        num_classes: Number of labels for ``ClassLabel`` and for the model head.
        class_list: Label names handed to ``ClassLabel``.
        problem_type: Forwarded to ``from_pretrained``.
        epochs: Value used for ``num_train_epochs``.
        save_dir: Folder that receives the fine-tuned model and tokenizer after
            training. Pass ``None`` (or an empty string) to skip saving.
    """
    features = Features({'text': Value('string'), 'label': ClassLabel(num_classes, class_list)})
    dataset = load_dataset('csv', data_files={"train": dataset_train,
                                            "val": dataset_val,
                                            "test": dataset_test},
                        delimiter=',', column_names=['text', 'label'],
                        skiprows=1, features=features,
                        keep_in_memory=True)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512 if "nli-bert" in tokenizer_name else None)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    train_dataset = tokenized_datasets["train"].shuffle(seed=42)
    eval_dataset = tokenized_datasets["val"]
    test_dataset = tokenized_datasets["test"]

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes, problem_type=problem_type)

    metric = load_metric("accuracy")
    precision = load_metric("precision")
    recall = load_metric("recall")
    f1 = load_metric("f1")

    def compute_metrics(eval_pred):
        """Turn (logits, labels) into accuracy / precision / recall / F1 scores."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        acc = metric.compute(predictions=predictions, references=labels)["accuracy"]
        prec = precision.compute(predictions=predictions, references=labels, average="weighted")["precision"]
        rec = recall.compute(predictions=predictions, references=labels, average="weighted")["recall"]
        f1w = f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
        f1micro = f1.compute(predictions=predictions, references=labels, average="micro")["f1"]
        return {"accuracy": acc, "precision": prec, "recall": rec, "f1_weighted": f1w, "f1_micro": f1micro}

    # DeBERTa / ELECTRA tokenizers run with half the batch size of everything else.
    batch_size = 16 if ("deberta" in tokenizer_name or "electra" in tokenizer_name) else 32

    training_args = TrainingArguments(output_dir=directory,
                                    do_train=True,
                                    do_eval=True,
                                    evaluation_strategy="epoch",
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    learning_rate=5e-5,
                                    logging_strategy="epoch",
                                    num_train_epochs=epochs,
                                    save_strategy="epoch",
                                    save_total_limit=1,
                                    load_best_model_at_end=True,
                                    metric_for_best_model="eval_f1_weighted",
                                    lr_scheduler_type="cosine")

    class CustomTrainer(Trainer):
        """Keyword-only pass-through to ``Trainer``; adds no behavior of its own."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Persist the fine-tuned weights. The previous version compared the final
    # global_step with a "fourth epoch" step count, which can never match once
    # training has run to completion, so nothing was ever written. Because
    # load_best_model_at_end=True, the model held by the trainer at this point
    # is the checkpoint with the best eval_f1_weighted.
    if save_dir:
        trainer.save_model(save_dir)
        # Keep the tokenizer files alongside the weights so the folder can be
        # loaded on its own later.
        tokenizer.save_pretrained(save_dir)

    with open(os.path.join(directory, f"{tokenizer_name.replace('/', '_')}.log"), "w+") as f:
        f.write(json.dumps(trainer.predict(test_dataset).metrics))

# Run configuration: values are hard-coded for this notebook (nothing is parsed
# from the command line).
model_name = "bert-base-uncased"
tokenizer_name = "bert-base-uncased"
directory = "./big_bench_model"

dataset_train = "/content/drive/MyDrive/Trainer/GeneralClassification/train.csv"
dataset_val = "/content/drive/MyDrive/Trainer/GeneralClassification/dev.csv"
dataset_test = "/content/drive/MyDrive/Trainer/GeneralClassification/test.csv"

class_list = ["negative", "positive"]
num_classes = 2
epochs = 5

# Launch training with the configuration above.
main(
    model_name,
    directory,
    tokenizer_name,
    dataset_train,
    dataset_val,
    dataset_test,
    num_classes=num_classes,
    class_list=class_list,
    problem_type="single_label_classification",
    epochs=epochs,
)


Map:   0%|          | 0/1960 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Weighted,F1 Micro
1,0.6641,0.570266,0.67381,0.740694,0.67381,0.658339,0.67381
2,0.4498,0.338645,0.804762,0.850963,0.804762,0.800949,0.804762
3,0.2949,0.310576,0.82381,0.869377,0.82381,0.82061,0.82381
4,0.2361,0.323382,0.82381,0.864239,0.82381,0.821068,0.82381
5,0.2144,0.310939,0.82619,0.865601,0.82619,0.823593,0.82619


In [None]:
import glob
import json
import os

from transformers import AutoModelForSequenceClassification

# Export the FINE-TUNED classifier. Loading "bert-base-uncased" here would give
# the base encoder with a freshly initialised (untrained) classification head,
# so the model is loaded from the Trainer's checkpoints instead.

# Folder the Trainer wrote its checkpoints to; must match the `directory`
# value used by the training cell above.
checkpoint_root = "./big_bench_model"

checkpoint_dirs = [
    candidate
    for candidate in glob.glob(os.path.join(checkpoint_root, "checkpoint-*"))
    if os.path.isdir(candidate)
]
if not checkpoint_dirs:
    raise FileNotFoundError(
        f"No checkpoints found under {checkpoint_root!r}; run the training cell first."
    )

# The Trainer stores its state (including the path of the best checkpoint) in
# trainer_state.json inside each checkpoint; read it from the newest one and
# fall back to that newest checkpoint if no usable best path is recorded.
latest_checkpoint = max(checkpoint_dirs, key=os.path.getmtime)
best_checkpoint = latest_checkpoint
state_file = os.path.join(latest_checkpoint, "trainer_state.json")
if os.path.isfile(state_file):
    with open(state_file) as state_handle:
        recorded_best = json.load(state_handle).get("best_model_checkpoint")
    if recorded_best and os.path.isdir(recorded_best):
        best_checkpoint = recorded_best

model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)

# Define path to save the model
save_path = "/content/drive/MyDrive/Trainer/GeneralClassification/model"

# Save the model
model.save_pretrained(save_path)

# Optionally, save the tokenizer too if needed
# tokenizer.save_pretrained(save_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install onnxruntime-tools



Collecting onnxruntime-tools
  Downloading onnxruntime_tools-1.7.0-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.7/212.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnx (from onnxruntime-tools)
  Downloading onnx-1.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting coloredlogs (from onnxruntime-tools)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting py3nvml (from onnxruntime-tools)
  Downloading py3nvml-0.2.7-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime-tools)
  Downl

In [None]:
from transformers import convert_graph_to_onnx
from pathlib import Path

# Specify tokenizer and output path
tokenizer_name = "bert-base-uncased"
output_path = Path("/content/drive/MyDrive/Trainer/GeneralClassification/modelonx/model.onnx")

# Convert the saved model (`save_path` comes from the save cell above) to ONNX
# with opset 14.
# pipeline_name must be given explicitly: convert() defaults to
# "feature-extraction", which exports only the encoder (hidden states and
# pooled output) and drops the classification head. "sentiment-analysis" loads
# the checkpoint as a sequence classifier so the graph outputs class logits.
# NOTE(review): convert() is expected to refuse a non-empty output folder, so
# the target directory may need to be emptied before re-running - confirm.
convert_graph_to_onnx.convert(
    framework="pt",
    model=save_path,
    output=output_path,
    opset=14,
    tokenizer=tokenizer_name,
    pipeline_name="sentiment-analysis",
)



ONNX opset version set to: 14
Loading pipeline (model: /content/drive/MyDrive/Trainer/GeneralClassification/model, tokenizer: bert-base-uncased)
Using framework PyTorch: 2.3.0+cu121
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Found output output_1 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']


In [None]:
from torch import nn
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Features, Value, ClassLabel, load_metric
import numpy as np
import os
import json
from sklearn.metrics import classification_report, confusion_matrix
import glob
import shutil

def load_data(path, class_list, tokenizer_name="", num_classes=13):
    """Tokenize the CSV at ``path`` and return it as a single evaluation split.

    Args:
        path: CSV file to load; it is registered as the ``"test"`` split.
        class_list: Label names handed to ``ClassLabel``.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        num_classes: Number of classes handed to ``ClassLabel``.

    Returns:
        The tokenized ``"test"`` split.
    """
    schema = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes, class_list),
    })
    loader_options = dict(
        data_files={"test": path},
        delimiter=',',
        column_names=['text', 'label'],
        skiprows=1,
        features=schema,
        keep_in_memory=True,
    )
    raw_splits = load_dataset('csv', **loader_options)

    # Load tokenizer and tokenize
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Explicit 512 cap for "nli-bert" tokenizers only; otherwise max_length is None.
    length_cap = 512 if "nli-bert" in tokenizer_name else None

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=length_cap)

    tokenized_splits = raw_splits.map(tokenize_function, batched=True)
    return tokenized_splits["test"]

def main(model_name, directory, tokenizer_name, num_classes, class_list=None, use_label_encoder=True, problem_type=None, epochs=10, weights=None, run_num=1):
    """Fine-tune a (optionally class-weighted) sequence classifier and report results.

    The train / val / test CSV paths are fixed inside the function. After
    training, the test-split metrics are written as JSON into ``directory``;
    when ``num_classes == 13`` a classification report and confusion matrix
    for the fine-grained test CSV are printed as well.

    Args:
        model_name: Checkpoint handed to ``AutoModelForSequenceClassification.from_pretrained``.
        directory: ``output_dir`` of the ``Trainer``; existing ``checkpoint*``
            folders in it are removed before training.
        tokenizer_name: Checkpoint handed to ``AutoTokenizer.from_pretrained``; it also
            selects the batch size and forms the log file name.
        num_classes: Number of labels for ``ClassLabel`` and for the model head.
        class_list: Label names handed to ``ClassLabel``.
        use_label_encoder: Not used by this function; kept so existing calls keep working.
        problem_type: Forwarded to ``from_pretrained``.
        epochs: Value used for ``num_train_epochs``.
        weights: Optional per-class weights for the cross-entropy loss.
        run_num: Appended to the log file name.
    """
    # Define datasets paths directly here
    big_bench_train_dataset = "/content/drive/MyDrive/Trainer/GeneralClassification/train.csv"
    big_bench_val_dataset = "/content/drive/MyDrive/Trainer/GeneralClassification/dev.csv"
    big_bench_test_dataset = "/content/drive/MyDrive/Trainer/GeneralClassification/test.csv"
    fine_test_dataset = "/content/drive/MyDrive/Trainer/FineClassData/test_dataset.csv"

    features = Features({'text': Value('string'), 'label': ClassLabel(num_classes, class_list)})
    dataset = load_dataset('csv', data_files={"train": big_bench_train_dataset,
                                              "val": big_bench_val_dataset,
                                              "test": big_bench_test_dataset},
                           delimiter=',', column_names=['text', 'label'],
                           skiprows=1, features=features,
                           keep_in_memory=True)

    # Load tokenizer and tokenize
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512 if "nli-bert" in tokenizer_name else None)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Extract splits
    train_dataset = tokenized_datasets["train"].shuffle(seed=42)
    eval_dataset = tokenized_datasets["val"]
    test_dataset = tokenized_datasets["test"]

    # Load model with new head
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes, ignore_mismatched_sizes=True, problem_type=problem_type)

    # Define metrics
    metric = load_metric("accuracy")
    precision = load_metric("precision")
    recall = load_metric("recall")
    f1 = load_metric("f1")

    def compute_metrics(eval_pred):
        """Turn (logits, labels) into accuracy / precision / recall / F1 scores."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        acc = metric.compute(predictions=predictions, references=labels)["accuracy"]
        prec = precision.compute(predictions=predictions, references=labels, average="weighted")["precision"]
        rec = recall.compute(predictions=predictions, references=labels, average="weighted")["recall"]
        f1w = f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
        f1micro = f1.compute(predictions=predictions, references=labels, average="micro")["f1"]
        return {"accuracy": acc, "precision": prec, "recall": rec, "f1_weighted": f1w, "f1_micro": f1micro}

    # DeBERTa / ELECTRA tokenizers run with half the batch size of everything else.
    batch_size = 16 if ("deberta" in tokenizer_name or "electra" in tokenizer_name) else 32

    # Define training arguments for trainer
    training_args = TrainingArguments(output_dir=directory,
                                      do_train=True,
                                      do_eval=True,
                                      evaluation_strategy="epoch",
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size,
                                      learning_rate=5e-5,
                                      logging_strategy="epoch",
                                      num_train_epochs=epochs,
                                      save_strategy="epoch",
                                      save_total_limit=1,
                                      load_best_model_at_end=True,
                                      metric_for_best_model="eval_f1_weighted",
                                      lr_scheduler_type="cosine")

    class CustomTrainer(Trainer):
        """``Trainer`` whose loss is cross-entropy with optional class weights."""

        def __init__(self, weights, **kwargs):
            super().__init__(**kwargs)
            self.weights = weights

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # **kwargs absorbs extra keyword arguments (e.g. num_items_in_batch)
            # that newer Trainer versions pass to compute_loss.
            labels = inputs.get("labels")
            # forward pass
            outputs = model(**inputs)
            logits = outputs.get('logits')
            # Build the weight tensor on the same device/dtype as the logits
            # instead of assuming CUDA, so CPU (or other) runs work as well.
            # len() is used rather than truthiness so array/tensor weights are
            # accepted too.
            class_weights = None
            if self.weights is not None and len(self.weights) > 0:
                class_weights = torch.as_tensor(self.weights, dtype=logits.dtype, device=logits.device)
            # compute custom loss
            loss_fct = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    # Define trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        weights=weights
    )

    # Clear checkpoints left in the output directory by earlier runs.
    checkpoints = glob.glob(directory + "/checkpoint*")
    for c in checkpoints:
        shutil.rmtree(c)

    # Train
    trainer.train()
    with open(os.path.join(directory, f"{tokenizer_name.replace('/', '_') + str(run_num)}.log"), "w+") as f:
        f.write(json.dumps(trainer.predict(test_dataset).metrics))

    # Extra per-class evaluation, only for the 13-class setup.
    if num_classes == 13:
        class_list = ['faulty generalization', 'false causality', 'circular reasoning', 'ad populum', 'ad hominem', 'fallacy of logic', 'appeal to emotion', 'false dilemma', 'equivocation', 'fallacy of extension', 'fallacy of relevance', 'fallacy of credibility', 'intentional']
        test_dataset = load_data(fine_test_dataset, class_list, tokenizer_name=tokenizer_name, num_classes=len(class_list))
        # Same arg-max decoding as compute_metrics.
        y_pred = np.argmax(trainer.predict(test_dataset).predictions, axis=-1)
        print("Classification report:\n\n", classification_report(test_dataset["label"], y_pred, target_names=class_list, digits=4))
        print(confusion_matrix(test_dataset["label"], y_pred, labels=list(range(0, len(class_list), 1))))


if __name__ == "__main__":
    # Binary (negative / positive) run with unweighted loss.
    run_config = {
        "model_name": "bert-base-uncased",
        "directory": "output_directory",
        "tokenizer_name": "bert-base-uncased",
        "num_classes": 2,
        "class_list": ["negative", "positive"],
        "problem_type": "single_label_classification",
        "epochs": 5,
        "weights": None,
        "run_num": 1,
    }
    main(**run_config)


Map:   0%|          | 0/1960 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Weighted,F1 Micro
1,0.6536,0.486203,0.728571,0.815812,0.728571,0.714776,0.728571
2,0.3972,0.310918,0.828571,0.828909,0.828571,0.828665,0.828571
3,0.28,0.288554,0.838095,0.854062,0.838095,0.837555,0.838095
4,0.2367,0.290093,0.830952,0.855732,0.830952,0.829678,0.830952
5,0.2094,0.289764,0.816667,0.830048,0.816667,0.81622,0.816667


In [None]:
import glob
import json
import os

from transformers import AutoModelForSequenceClassification

# Export the FINE-TUNED classifier. Loading "bert-base-uncased" here would give
# the base encoder with a freshly initialised (untrained) classification head,
# so the model is loaded from the Trainer's checkpoints instead.

# Folder the Trainer wrote its checkpoints to; must match the `directory`
# value used by the training cell above.
checkpoint_root = "output_directory"

checkpoint_dirs = [
    candidate
    for candidate in glob.glob(os.path.join(checkpoint_root, "checkpoint-*"))
    if os.path.isdir(candidate)
]
if not checkpoint_dirs:
    raise FileNotFoundError(
        f"No checkpoints found under {checkpoint_root!r}; run the training cell first."
    )

# The Trainer stores its state (including the path of the best checkpoint) in
# trainer_state.json inside each checkpoint; read it from the newest one and
# fall back to that newest checkpoint if no usable best path is recorded.
latest_checkpoint = max(checkpoint_dirs, key=os.path.getmtime)
best_checkpoint = latest_checkpoint
state_file = os.path.join(latest_checkpoint, "trainer_state.json")
if os.path.isfile(state_file):
    with open(state_file) as state_handle:
        recorded_best = json.load(state_handle).get("best_model_checkpoint")
    if recorded_best and os.path.isdir(recorded_best):
        best_checkpoint = recorded_best

model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)

# Define path to save the model
save_path = "/content/drive/MyDrive/Trainer/GeneralClassification/model"

# Save the model
model.save_pretrained(save_path)

# Optionally, save the tokenizer too if needed
# tokenizer.save_pretrained(save_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
