In [None]:
# Star import kept first so that every explicit import below wins any name clash.
from datasets import *

import dagshub
import evaluate
import matplotlib.pyplot as plt
import mlflow
import numpy as np
from mlflow import log_metric, log_param
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainerCallback

# Experiment-tracking configuration: DagsHub repository, MLflow endpoint, experiment name.
DAGSHUB_OWNER = 'matealukiccc'
DAGSHUB_REPO = 'MLOps-For-NLP'
TRACKING_URI = "https://dagshub.com/MateaLukiccc/MLOps-For-NLP.mlflow"
# NOTE(review): the experiment is called "ELECTRA" but the checkpoints loaded in this
# notebook are "distilbert-base-uncased" — confirm the experiment name is intended.
EXPERIMENT_NAME = "ELECTRA"

dagshub.init(repo_owner=DAGSHUB_OWNER, repo_name=DAGSHUB_REPO, mlflow=True)
mlflow.set_tracking_uri(uri=TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
# Interactive Hugging Face Hub authentication for this notebook session.
# NOTE(review): no cell visible here pushes to the Hub or loads a private asset —
# confirm this login step is still required.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# The tokenizer has to exist before the collator that wraps it; the previous order
# raised NameError on a fresh kernel (Restart & Run All).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Pads each batch to the length of its longest member at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples and attach the classification targets.

    Args:
        examples: Batch mapping with a "text" entry (the raw strings) and a
            "label" entry (one target per string).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds the values of ``examples["label"]``.
    """
    # No padding here: the notebook's DataCollatorWithPadding pads every training /
    # evaluation batch dynamically, so padding inside `map` would only inflate each
    # map batch to its longest row and store the padding on disk.
    tokenized_inputs = tokenizer(examples["text"], truncation=True, max_length=512)
    tokenized_inputs["labels"] = examples["label"]
    return tokenized_inputs

In [None]:
# Fixed seed so the random splits below are identical on every Restart & Run All.
SPLIT_SEED = 42

# Each CSV is loaded as a DatasetDict whose only split is named 'train'.
ds = load_dataset("csv", data_files="data/preprocessed_train.csv")
ds2 = load_dataset("csv", data_files="data/preprocessed_test.csv")

# Hold out 20% of the training CSV.
train_testvalid = ds['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# NOTE(review): this 50/50 split of the held-out part is not used by any cell visible
# in this notebook — kept only in case something else reads `test_valid`.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

# 'test'  -> the 20% held out from the training CSV
# 'valid' -> the separate preprocessed_test.csv file
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': train_testvalid['test'],
    'valid': ds2['train']})

tokenized_ds = train_test_valid_dataset.map(preprocess_function, batched=True)
print(tokenized_ds["train"][0])

In [None]:
# Metric objects used by compute_metrics, loaded in a fixed order.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class is taken
            as the argmax over axis 1 of ``predictions`` — assumes per-class
            scores shaped (n_samples, n_classes); TODO confirm.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1"; the last three
        are computed with ``average='macro'``.
    """
    scores, references = eval_pred
    print(f"Predictions shape: {scores.shape}, Labels shape: {references.shape}")
    predicted_classes = np.argmax(scores, axis=1)
    shared = {"predictions": predicted_classes, "references": references}

    # Run every metric first, then pull each value out under its own name.
    results = [("accuracy", accuracy.compute(**shared))]
    results += [
        (name, metric.compute(average='macro', **shared))
        for name, metric in (("precision", precision), ("recall", recall), ("f1", f1))
    ]
    return {name: result[name] for name, result in results}

def compute_confusion_matrix(trainer, eval_dataset):
    """Build the confusion matrix of ``trainer``'s predictions on ``eval_dataset``.

    Args:
        trainer: Object whose ``predict(eval_dataset)`` result exposes a
            ``predictions`` attribute; the class is the argmax over axis 1.
        eval_dataset: Dataset passed to ``predict``; its 'label' column
            supplies the true classes.

    Returns:
        Whatever ``confusion_matrix(true_labels, predicted_labels)`` returns.
    """
    # Predict on eval_dataset and reduce the scores to class ids.
    output = trainer.predict(eval_dataset)
    y_pred = np.argmax(output.predictions, axis=1)
    y_true = np.array(eval_dataset['label'])

    # Compute the confusion matrix (true labels first, predictions second).
    return confusion_matrix(y_true, y_pred)

In [None]:
def make_model_contiguous(model):
    """Replace the data of every non-contiguous parameter with a contiguous copy.

    Args:
        model: Object whose ``parameters()`` yields items exposing
            ``is_contiguous()`` and a writable ``data`` attribute.

    Returns:
        None. Parameters are updated in place; contiguous ones are untouched.
    """
    fragmented = (p for p in model.parameters() if not p.is_contiguous())
    for p in fragmented:
        p.data = p.data.contiguous()

class ContiguousParamsCallback(TrainerCallback):
    """Trainer callback that re-packs non-contiguous parameters after every epoch."""

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        """Make the model's parameters contiguous when an epoch finishes.

        The Trainer's callback handler passes the model being trained as the
        ``model`` keyword; nothing is done if it is missing. Returns None so
        the training control flow is left unchanged.
        """
        if model is not None:
            make_model_contiguous(model)


class CustomTrainer(Trainer):
    """Trainer that keeps model parameters contiguous between epochs.

    ``on_epoch_end`` is a ``TrainerCallback`` event, not a ``Trainer`` method,
    so overriding it on the Trainer subclass was never invoked (and its
    ``super().on_epoch_end`` call had no parent implementation). The behaviour
    is therefore registered as a callback. Constructor arguments are forwarded
    to ``Trainer`` unchanged.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_callback(ContiguousParamsCallback())

# Number of target classes for the classification head.
NUM_LABELS = 4

# Pretrained checkpoint loaded with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=NUM_LABELS,
)

In [None]:
training_args = TrainingArguments(
    # Where the Trainer writes its outputs.
    output_dir="my_awesome_model",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate after every epoch; never write intermediate checkpoints.
    eval_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
)

trainer = CustomTrainer(
    # What to train and how.
    model=model,
    args=training_args,
    # Data: the 'train' split for fitting, the 'test' split for per-epoch evaluation.
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    # Batch assembly.
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Scoring of each evaluation pass.
    compute_metrics=compute_metrics,
)

In [None]:
# Train inside an MLflow run. `run` is kept so later cells can refer to this run's id.
with mlflow.start_run() as run:
    mlflow.log_params({
        # NOTE(review): placeholders — no value is recorded for these two keys; confirm
        # whether they should be filled in or dropped.
        "embedding_dim": None,
        "hidden_size": None,
        "optimizer": "AdamW",
        # Read from the actual training configuration instead of repeating the literal,
        # so the logged value cannot drift from what the Trainer uses.
        "learning_rate": training_args.learning_rate,
    })
    trainer.train()

In [None]:
# Re-pack parameters before saving, using the shared helper instead of a copy of its loop.
# NOTE(review): presumably required by the serialisation format used by save_model — confirm.
make_model_contiguous(trainer.model)

trainer.save_model("trained_model")

# Alternative: mlflow.transformers.save_model(
#     transformers_model={"model": trainer.model, "tokenizer": trainer.tokenizer},
#     path="trained_model_2", task="text-classification")

# The training run was closed when its `with` block ended; logging through the fluent API
# now would silently start a second run. Close any stray active run (no-op if there is
# none) and re-open the training run so the artifacts are attached to it.
mlflow.end_run()
with mlflow.start_run(run_id=run.info.run_id):
    mlflow.log_artifacts("trained_model")

In [None]:
# The training run was closed when its `with` block ended; logging through the fluent API
# now would silently start a second run. Close any stray active run (no-op if there is
# none) and re-open the training run so the metrics sit next to its parameters.
mlflow.end_run()
with mlflow.start_run(run_id=run.info.run_id):
    test_metrics = trainer.evaluate(eval_dataset=tokenized_ds["test"])
    mlflow.log_metric("accuracy", test_metrics['eval_accuracy'])
    mlflow.log_metric("f1_score", test_metrics['eval_f1'])
    mlflow.log_metric("recall", test_metrics['eval_recall'])
    # compute_metrics also returns precision; log it alongside the other three.
    mlflow.log_metric("precision", test_metrics['eval_precision'])

# embedding_dim / hidden_size / optimizer / learning_rate were already logged on this run
# when training started, so they are not logged a second time here.

In [None]:
# `tokenized_arxiv` is not defined anywhere in this notebook (NameError on a fresh kernel);
# the tokenized splits live in `tokenized_ds`.
cm = compute_confusion_matrix(trainer, tokenized_ds["test"])

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='viridis')
plt.savefig("confusion_matrix.png")

# Attach the figure to the training run: close any stray active run (no-op if there is
# none) and re-open the run created during training instead of starting a new one.
mlflow.end_run()
with mlflow.start_run(run_id=run.info.run_id):
    mlflow.log_artifact("confusion_matrix.png")