In [69]:
# Required libraries
from dotenv import load_dotenv
load_dotenv()
import os
import huggingface_hub
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
import wandb
import pandas as pd
import numpy as np

# Set environment variables
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

# Hugging Face and Weights & Biases setup
huggingface_username = 'HSLU-AICOMP-LearningAgencyLab'
competition = 'learning-agency-lab-automated-essay-scoring-2'

wandb_project = 'HSLU-AICOMP-LearningAgencyLab'
wandb_entity = 'jannine-meier'

# Login to Hugging Face and W&B
print("Logging in to Hugging Face Hub and W&B...")
huggingface_hub.login(token=os.getenv('HUGGINGFACE_TOKEN'))
wandb.login(key=os.getenv('WANDB_API_TOKEN'))
print("Login successful.")

# Initialize a W&B run
print("Initializing a W&B run...")
wandb.init(project=wandb_project, entity=wandb_entity, config={
    "max_iter": 1000, #set it to whatever above 1000 - converges early anywy
    "C": 10, # 0.1 strong - 1 moderate - 10 weak (overfitting potential)
    "cv_folds": 10 # the higher the more training data (10 = 90%, 5 = 20%)
})
print("W&B run initialized.")

# Load the entire dataset from Hugging Face
print("Loading the entire dataset from Hugging Face...")
dataset = load_dataset(f"{huggingface_username}/{competition}")
print("Dataset loaded successfully.")

# Split the dataset into training and evaluation sets
train_df = dataset['train'].to_pandas()
eval_df = dataset['eval'].to_pandas()
print(f"Dataset split into {len(train_df)} training examples and {len(eval_df)} evaluation examples.")


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\janni\_netrc


Logging in to Hugging Face Hub and W&B...
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\janni\.cache\huggingface\token
Login successful
Login successful.
Initializing a W&B run...


W&B run initialized.
Loading the entire dataset from Hugging Face...
Dataset loaded successfully.
Dataset split into 13845 training examples and 3462 evaluation examples.


In [70]:
# Preprocess the text data using TF-IDF
print("Applying TF-IDF vectorization to the text data...")
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(train_df['full_text'])
X_eval = vectorizer.transform(eval_df['full_text'])
print("TF-IDF vectorization completed.")

# Target labels
y_train = train_df['score']
y_eval = eval_df['score']
print("Extracted target labels for training and evaluation.")

# Define the model with regularization and max iterations
model = LogisticRegression(max_iter=wandb.config.max_iter, C=wandb.config.C)

Applying TF-IDF vectorization to the text data...
TF-IDF vectorization completed.
Extracted target labels for training and evaluation.


In [71]:
# Set up cross-validation
cv_folds = wandb.config.cv_folds
skf = StratifiedKFold(n_splits=cv_folds)
print(f"Starting {cv_folds}-fold cross-validation...")

# Perform cross-validation and log metrics
cv_scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Starting fold {fold + 1}...")
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Train the model on the current fold
    model.fit(X_train_fold, y_train_fold)
    
    # Evaluate on the validation fold
    y_val_pred = model.predict(X_val_fold)
    qwk = cohen_kappa_score(y_val_fold, y_val_pred, weights='quadratic')
    
    # Log metrics for the current fold
    print(f"Fold {fold + 1}, QWK: {qwk}")
    wandb.log({f"fold_{fold+1}_qwk": qwk})
    
    # Store the score for averaging later
    cv_scores.append(qwk)

Starting 10-fold cross-validation...
Starting fold 1...
Fold 1, QWK: 0.6320197283977413
Starting fold 2...
Fold 2, QWK: 0.6220322023105613
Starting fold 3...
Fold 3, QWK: 0.6246417847977237
Starting fold 4...
Fold 4, QWK: 0.6384844553674451
Starting fold 5...
Fold 5, QWK: 0.6633189612014373
Starting fold 6...
Fold 6, QWK: 0.6208431588045114
Starting fold 7...
Fold 7, QWK: 0.6220610289190712
Starting fold 8...
Fold 8, QWK: 0.6427582464670303
Starting fold 9...
Fold 9, QWK: 0.6552256987039596
Starting fold 10...
Fold 10, QWK: 0.6640211862625585


In [72]:
# Calculate and log the average metrics across all folds
avg_qwk = np.mean(cv_scores)
print(f"Average QWK across {cv_folds} folds: {avg_qwk}")
wandb.log({"avg_qwk": avg_qwk})

# Evaluate the model on the evaluation set
print("Evaluating on the test set with the entire training data...")
model.fit(X_train, y_train)
y_pred_eval = model.predict(X_eval)
qwk_eval = cohen_kappa_score(y_eval, y_pred_eval, weights='quadratic')
print(f"Evaluation - QWK: {qwk_eval}")

# Log final evaluation results to W&B
wandb.log({"eval_qwk": qwk_eval})

# Finish the W&B run
wandb.finish()
print("W&B run finished.")

Average QWK across 10 folds: 0.6385406451232039
Evaluating on the test set with the entire training data...
Evaluation - QWK: 0.6522857466435952


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_qwk,▁
eval_qwk,▁
fold_10_qwk,▁
fold_1_qwk,▁
fold_2_qwk,▁
fold_3_qwk,▁
fold_4_qwk,▁
fold_5_qwk,▁
fold_6_qwk,▁
fold_7_qwk,▁

0,1
avg_qwk,0.63854
eval_qwk,0.65229
fold_10_qwk,0.66402
fold_1_qwk,0.63202
fold_2_qwk,0.62203
fold_3_qwk,0.62464
fold_4_qwk,0.63848
fold_5_qwk,0.66332
fold_6_qwk,0.62084
fold_7_qwk,0.62206


W&B run finished.
