In [4]:
from dotenv import load_dotenv
load_dotenv()
import os
import huggingface_hub
from datasets import load_dataset
from transformers import DebertaV2Tokenizer, DebertaForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
import torch
import wandb
import numpy as np
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
import optuna

# Function to calculate QWE score (Quadratic Weighted Kappa)
def qwe_score(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")

# Login to Hugging Face and W&B
print("Logging in to Hugging Face Hub and W&B...")
huggingface_hub.login(token=os.getenv('HUGGINGFACE_TOKEN'))
wandb.login(key=os.getenv('WANDB_API_TOKEN'))
print("Login successful.")

# Set up W&B project
wandb_project = 'HSLU-AICOMP-LearningAgencyLab'
wandb_entity = 'jannine-meier'
os.environ["WANDB_PROJECT"] = wandb_project

# Load dataset from Hugging Face hub
print("Loading dataset from Hugging Face...")
huggingface_username = 'HSLU-AICOMP-LearningAgencyLab'
competition = 'learning-agency-lab-automated-essay-scoring-2'
dataset = load_dataset(f"{huggingface_username}/{competition}")
print("Dataset loaded successfully.")

# Initialize tokenizer for DeBERTa
print("Initializing DeBERTa tokenizer...")
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-small')


# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['full_text'], truncation=True, padding='max_length', max_length=256)

# Tokenize the entire dataset
print("Tokenizing datasets...")
dataset = dataset.map(tokenize_function, batched=True)

# Ensure that labels are in the correct format (e.g., float for regression tasks)
dataset = dataset.map(lambda x: {"labels": float(x["score"])})

# Prepare for stratified 5-fold cross-validation based on the 'score' column
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the objective function for Optuna
def objective(trial):
    # Initialize W&B for this trial
    run = wandb.init(
        project=wandb_project,
        entity=wandb_entity,
        config={
            "trial": trial.number,  # Log trial number
        },
        reinit=True  # Allows multiple runs in a single script
    )

    # Hyperparameter tuning with Optuna
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
    batch_size = trial.suggest_categorical('batch_size', [4, 8, 16])
    weight_decay = trial.suggest_uniform('weight_decay', 0.0, 0.3)

    # Log hyperparameters to W&B
    wandb.config.update({
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "weight_decay": weight_decay
    })

    fold_qwe_scores = []
    
    for fold, (train_indices, eval_indices) in enumerate(kfold.split(dataset['train'], dataset['train']['score'])):
        print(f"Starting fold {fold + 1}...")
        wandb.log({"fold": fold + 1})

        # Create train and eval subsets for this fold
        train_dataset_sample = dataset['train'].select(train_indices)
        eval_dataset_sample = dataset['train'].select(eval_indices)

        # Define the model
        # Define the model with ignore_mismatched_sizes=True to handle size mismatch
        model = DebertaForSequenceClassification.from_pretrained(
            'microsoft/deberta-v3-small',
            num_labels=1,
            ignore_mismatched_sizes=True
        )


        # Training arguments
        training_args = TrainingArguments(
            output_dir=f'./results_fold_{fold + 1}',
            evaluation_strategy='epoch',
            logging_strategy='epoch',
            save_strategy='epoch',
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=2,
            weight_decay=weight_decay,
            load_best_model_at_end=True,
            metric_for_best_model='qwe_score',
            report_to='wandb',
            fp16=True
        )

        # Define the evaluation function including QWE score
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = logits.squeeze()
            qwe = qwe_score(labels, predictions.round())
            return {"qwe_score": qwe}

        # Initialize Trainer with Early Stopping
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset_sample,
            eval_dataset=eval_dataset_sample,
            compute_metrics=compute_metrics,
            data_collator=data_collator,
            tokenizer=tokenizer,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
        )

        # Train the model
        print(f"Starting training for fold {fold + 1}...")
        trainer.train()

        # Evaluate the model and log final QWE score to W&B
        print(f"Evaluating model for fold {fold + 1}...")
        metrics = trainer.evaluate()
        qwe_score_value = metrics.get('eval_qwe_score', None)
        wandb.log({f"fold_{fold + 1}_qwe_score": qwe_score_value})

        # Store the QWE score for averaging later
        if qwe_score_value is not None:
            print(f"Fold {fold + 1} QWE Score: {qwe_score_value}")
            fold_qwe_scores.append(qwe_score_value)
        else:
            print(f"Fold {fold + 1}: QWE score could not be retrieved.")

    # Calculate the average QWE score for this trial
    avg_qwe_score = np.mean(fold_qwe_scores) if fold_qwe_scores else 0
    print(f"Average QWE Score for this trial: {avg_qwe_score}")
    wandb.log({"avg_qwe_score": avg_qwe_score})
    
    # Finish the W&B run
    run.finish()
    
    return avg_qwe_score

# Create a study and run trials
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=3)  # Adjust the number of trials for faster tuning

# Log the best trial results to W&B
best_trial = study.best_trial
print(f"Best trial: Learning Rate={best_trial.params['learning_rate']}, Batch Size={best_trial.params['batch_size']}, Weight Decay={best_trial.params['weight_decay']}")
print(f"Best trial QWE score: {best_trial.value}")
wandb.log({
    "best_trial_learning_rate": best_trial.params['learning_rate'],
    "best_trial_batch_size": best_trial.params['batch_size'],
    "best_trial_weight_decay": best_trial.params['weight_decay'],
    "best_trial_qwe_score": best_trial.value
})

print("Hyperparameter tuning and 5-fold cross-validation complete. Best QWE score logged to W&B.")




Logging in to Hugging Face Hub and W&B...
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\janni\.cache\huggingface\token
Login successful
Login successful.
Loading dataset from Hugging Face...
Dataset loaded successfully.
Initializing DeBERTa tokenizer...
Tokenizing datasets...


[I 2024-10-25 14:50:18,252] A new study created in memory with name: no-name-83d0fb2a-4bf6-4b66-b7c1-ae7a4816c71a


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
fold,▁

0,1
fold,1


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_uniform('weight_decay', 0.0, 0.3)


Starting fold 1...


You are using a model of type deberta-v2 to instantiate a model of type deberta. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.0.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.

Starting training for fold 1...




  0%|          | 0/2770 [00:00<?, ?it/s]

{'loss': 0.9378, 'grad_norm': 17.615222930908203, 'learning_rate': 3.045661313851176e-05, 'epoch': 1.0}


  0%|          | 0/347 [00:00<?, ?it/s]

{'eval_loss': 0.7419288158416748, 'eval_qwe_score': 0.46234782513124684, 'eval_runtime': 419.1003, 'eval_samples_per_second': 6.607, 'eval_steps_per_second': 0.828, 'epoch': 1.0}
{'loss': 0.6608, 'grad_norm': 8.296608924865723, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/347 [00:00<?, ?it/s]

{'eval_loss': 0.7028602361679077, 'eval_qwe_score': 0.5691108267494362, 'eval_runtime': 432.6253, 'eval_samples_per_second': 6.4, 'eval_steps_per_second': 0.802, 'epoch': 2.0}
{'train_runtime': 14653.2093, 'train_samples_per_second': 1.512, 'train_steps_per_second': 0.189, 'train_loss': 0.7992724917855935, 'epoch': 2.0}
Evaluating model for fold 1...


  0%|          | 0/347 [00:00<?, ?it/s]

Fold 1 QWE Score: 0.5691108267494362
Starting fold 2...


You are using a model of type deberta-v2 to instantiate a model of type deberta. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.0.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.

Starting training for fold 2...


  0%|          | 0/2770 [00:00<?, ?it/s]

{'loss': 0.9465, 'grad_norm': 5.844042778015137, 'learning_rate': 3.045661313851176e-05, 'epoch': 1.0}


  0%|          | 0/347 [00:00<?, ?it/s]

{'eval_loss': 0.8135122060775757, 'eval_qwe_score': 0.5237167105887746, 'eval_runtime': 415.4503, 'eval_samples_per_second': 6.665, 'eval_steps_per_second': 0.835, 'epoch': 1.0}
{'loss': 0.6623, 'grad_norm': 20.140031814575195, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/347 [00:00<?, ?it/s]

{'eval_loss': 0.7047645449638367, 'eval_qwe_score': 0.5633515139272327, 'eval_runtime': 399.2084, 'eval_samples_per_second': 6.936, 'eval_steps_per_second': 0.869, 'epoch': 2.0}
{'train_runtime': 15262.638, 'train_samples_per_second': 1.451, 'train_steps_per_second': 0.181, 'train_loss': 0.8043969770631205, 'epoch': 2.0}
Evaluating model for fold 2...


  0%|          | 0/347 [00:00<?, ?it/s]

Fold 2 QWE Score: 0.5633515139272327
Starting fold 3...


You are using a model of type deberta-v2 to instantiate a model of type deberta. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.0.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.

Starting training for fold 3...


  0%|          | 0/2770 [00:00<?, ?it/s]

[W 2024-10-25 23:46:30,607] Trial 0 failed with parameters: {'learning_rate': 6.091322627702352e-05, 'batch_size': 8, 'weight_decay': 0.268466303137943} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\janni\anaconda3\envs\aicomp2\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\janni\AppData\Local\Temp\ipykernel_22304\783718901.py", line 139, in objective
    trainer.train()
  File "c:\Users\janni\anaconda3\envs\aicomp2\Lib\site-packages\transformers\trainer.py", line 2052, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\janni\anaconda3\envs\aicomp2\Lib\site-packages\transformers\trainer.py", line 2452, in _inner_training_loop
    self.optimizer.step()
  File "c:\Users\janni\anaconda3\envs\aicomp2\Lib\site-packages\accelerate\optimizer.py", line 171, in step
    self.optimizer.step(cl

KeyboardInterrupt: 

In [None]:
#pip install sentencepiece