In [1]:

from dotenv import load_dotenv
load_dotenv()
import os
import huggingface_hub
from datasets import load_dataset, Dataset
from transformers import EarlyStoppingCallback, DebertaTokenizer, DebertaForSequenceClassification, TrainingArguments, Trainer
import torch
import torch.nn as nn
import wandb
import pandas as pd

import subprocess
import sys
import numpy as np


# Function to uninstall and install packages
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", "numpy"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install the compatible version of NumPy
install("numpy<2.0")


# Print the installed NumPy version for verification
print("NumPy version after installation:", np.__version__)

# Set environment variables
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

# Load dataset from Hugging Face hub
huggingface_username = 'HSLU-AICOMP-LearningAgencyLab'
competition = 'learning-agency-lab-automated-essay-scoring-2'

wandb_project = 'HSLU-AICOMP-LearningAgencyLab'
wandb_entity = 'jannine-meier'

# Login to Hugging Face and W&B
print("Logging in to Hugging Face Hub and W&B...")
huggingface_hub.login(token=os.getenv('HUGGINGFACE_TOKEN'))
wandb.login(key=os.getenv('WANDB_API_TOKEN'))
print("Login successful.")

# Set up W&B project
os.environ["WANDB_PROJECT"] = wandb_project

# Define the sample size for testing
sample_size = 100  # Total number of examples in the sample
train_sample_size = int(0.8 * sample_size)  # 80% for training
eval_sample_size = sample_size - train_sample_size  # 20% for evaluation

# Load the dataset from Hugging Face
print("Loading dataset from Hugging Face...")
dataset = load_dataset(f"{huggingface_username}/{competition}")
print("Dataset loaded successfully.")

# Create smaller samples for testing
# Create smaller samples for testing
print(f"Creating a smaller sample of {sample_size} examples for training and evaluation...")
train_dataset_sample = dataset['train'].select(range(train_sample_size))
eval_dataset_sample = dataset['train'].select(range(train_sample_size, train_sample_size + eval_sample_size))
print(f"Sample size: {len(train_dataset_sample)} training examples and {len(eval_dataset_sample)} evaluation examples.")





NumPy version after installation: 1.26.4
Logging in to Hugging Face Hub and W&B...
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


Token is valid (permission: write).
Your token has been saved to C:\Users\janni\.cache\huggingface\token
Login successful


[34m[1mwandb[0m: Currently logged in as: [33mjannine-meier[0m ([33mnlp_janninemeier[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\janni\_netrc


Login successful.
Loading dataset from Hugging Face...
Dataset loaded successfully.
Creating a smaller sample of 100 examples for training and evaluation...
Sample size: 80 training examples and 20 evaluation examples.


In [2]:
print("Columns in the training dataset sample:", train_dataset_sample.column_names)


Columns in the training dataset sample: ['essay_id', 'full_text', 'score', 'unique_mistakes', 'repeated_mistakes_count', 'max_repeated_mistake', 'word_count', 'flesch_reading_ease', 'flesch_kincaid_grade']


In [3]:
# Initialize tokenizer for DeBERTa
print("Initializing DeBERTa tokenizer...")
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

# Define tokenization function
def tokenize_function(examples):
    return tokenizer(examples['full_text'], truncation=True, padding='max_length', max_length=512)

Initializing DeBERTa tokenizer...


In [4]:
# Tokenize the datasets using the 'full_text' column
print("Tokenizing training dataset sample...")
train_dataset_sample = train_dataset_sample.map(tokenize_function, batched=True)
print("Training dataset tokenized successfully.")

print("Tokenizing evaluation dataset sample...")
eval_dataset_sample = eval_dataset_sample.map(tokenize_function, batched=True)
print("Evaluation dataset tokenized successfully.")

# Convert labels to float using a different method
print("Converting labels to float using DataFrame transformation...")
train_dataset_sample = train_dataset_sample.to_pandas()
train_dataset_sample['score'] = train_dataset_sample['score'].astype(float)
train_dataset_sample = Dataset.from_pandas(train_dataset_sample)

eval_dataset_sample = eval_dataset_sample.to_pandas()
eval_dataset_sample['score'] = eval_dataset_sample['score'].astype(float)
eval_dataset_sample = Dataset.from_pandas(eval_dataset_sample)
print("Labels converted to float.")

# Rename the 'score' column to 'labels' for training
train_dataset_sample = train_dataset_sample.rename_column("score", "labels")
eval_dataset_sample = eval_dataset_sample.rename_column("score", "labels")

# Format datasets for PyTorch
print("Formatting datasets for PyTorch...")
train_dataset_sample.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
eval_dataset_sample.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
print("Datasets formatted successfully.")


Tokenizing training dataset sample...
Training dataset tokenized successfully.
Tokenizing evaluation dataset sample...
Evaluation dataset tokenized successfully.
Converting labels to float using DataFrame transformation...
Labels converted to float.
Formatting datasets for PyTorch...
Datasets formatted successfully.


In [5]:
# Define DeBERTa model for regression
print("Loading DeBERTa model for sequence classification...")
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=1)
print("Model loaded successfully.")

Loading DeBERTa model for sequence classification...


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully.


In [6]:
# Custom MSE Loss
class MSELoss(nn.Module):
    def forward(self, logits, labels):
        loss = nn.MSELoss()
        return loss(logits.view(-1), labels.view(-1))

In [7]:
# Training arguments with logging to W&B
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    report_to='wandb',
    save_strategy='epoch',
    metric_for_best_model='eval_loss'
)
print("Training arguments set.")

Setting up training arguments...
Training arguments set.




In [8]:
# Define evaluation metric (MSE)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.flatten()
    mse = ((predictions - labels) ** 2).mean()
    return {"mse": mse}

In [9]:
# Initialize the Trainer
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_sample,
    eval_dataset=eval_dataset_sample,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
print("Trainer initialized successfully.")

# Train the model
print("Starting model training...")
trainer.train()
print("Training completed.")



# # Evaluate the model
# print("Evaluating the model...")
# evaluation_results = trainer.evaluate()
# print("\nEvaluation Results:")
# print(evaluation_results)

Initializing Trainer...
Trainer initialized successfully.
Starting model training...




  0%|          | 0/50 [00:00<?, ?it/s]

{'loss': 5.2195, 'grad_norm': 30.267826080322266, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.0042355060577393, 'eval_mse': 1.0042356252670288, 'eval_runtime': 18.7595, 'eval_samples_per_second': 1.066, 'eval_steps_per_second': 0.16, 'epoch': 1.0}
{'loss': 0.9815, 'grad_norm': 26.720014572143555, 'learning_rate': 1.2e-05, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.9802132844924927, 'eval_mse': 0.9802131652832031, 'eval_runtime': 18.2407, 'eval_samples_per_second': 1.096, 'eval_steps_per_second': 0.164, 'epoch': 2.0}
{'loss': 0.8606, 'grad_norm': 7.429288864135742, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.7756630182266235, 'eval_mse': 0.7756629586219788, 'eval_runtime': 17.8185, 'eval_samples_per_second': 1.122, 'eval_steps_per_second': 0.168, 'epoch': 3.0}
{'loss': 0.569, 'grad_norm': 19.62665367126465, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6897901296615601, 'eval_mse': 0.6897901296615601, 'eval_runtime': 27.325, 'eval_samples_per_second': 0.732, 'eval_steps_per_second': 0.11, 'epoch': 4.0}
{'loss': 0.464, 'grad_norm': 7.6206889152526855, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6521454453468323, 'eval_mse': 0.6521453857421875, 'eval_runtime': 29.9953, 'eval_samples_per_second': 0.667, 'eval_steps_per_second': 0.1, 'epoch': 5.0}
{'train_runtime': 1229.058, 'train_samples_per_second': 0.325, 'train_steps_per_second': 0.041, 'train_loss': 1.6189266967773437, 'epoch': 5.0}
Training completed.


In [10]:
import numpy as np
from sklearn.metrics import accuracy_score

# Define a function to convert continuous scores into grades (1 to 6)
def bin_to_grades(scores):
    # Use np.digitize to categorize scores into grades 1 through 6
    # Bins define the cutoff points for each grade, e.g., 0.5 to 1.5 is grade 1, etc.
    bins = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5]  # The edges for binning
    grades = np.digitize(scores, bins, right=True)
    return grades

# Get the model predictions and labels from the evaluation dataset
print("Evaluating the model...")
predictions, labels = trainer.predict(eval_dataset_sample)
predictions = predictions.flatten()
labels = labels.flatten()

# Convert both predictions and labels into grade categories
predicted_grades = bin_to_grades(predictions)
true_grades = bin_to_grades(labels)

# Calculate the accuracy of the predicted grades
accuracy = accuracy_score(true_grades, predicted_grades)
print(f"Grade-based Accuracy: {accuracy * 100:.2f}%")


Evaluating the model...


  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)

In [10]:
# Make predictions on the test set
print("Generating predictions on the test set...")
predictions = trainer.predict(eval_dataset_sample`)
predicted_scores = predictions.predictions.flatten()
print("Predictions generated successfully.")

# Create a DataFrame for submission
# print("Creating submission DataFrame...")
# test_data_df = pd.DataFrame(test_dataset['id'])
# test_data_df['predicted_score'] = predicted_scores
# submission_path = 'submission.csv'
# test_data_df[['id', 'predicted_score']].to_csv(submission_path, index=False)
# print(f"Submission file saved to {submission_path}.")

# Finalize W&B run
print("Finalizing W&B run...")
wandb.finish()
print("All done.")

SyntaxError: invalid syntax (2910020108.py, line 3)