In [None]:
import os
from datasets import Dataset, load_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import SoftmaxLoss
from transformers import EarlyStoppingCallback
from sentence_transformers.evaluation import SequentialEvaluator
import pandas as pd

# Set environment variables to optimize CPU usage
os.environ["OMP_NUM_THREADS"] = "4"  # Set to the number of CPU cores available
os.environ["MKL_NUM_THREADS"] = "4"

# Define the dataset and model
dataset_name = "HSLU-AICOMP-LearningAgencyLab/featuers_labels_combined"
base_model_username = "HSLU-AICOMP-LearningAgencyLab"
base_model_name = "automated-essay-scoring-setfit"

# Load the model
model = SentenceTransformer(f"{base_model_username}/{base_model_name}")
print(f"Loaded model: {base_model_username}/{base_model_name} (CPU-only)")

# Load and preprocess the dataset
dataset = load_dataset(dataset_name)
print(f"Loaded dataset: {dataset_name}")
df = pd.DataFrame(dataset['train'])
print(f"Dataset shape: {df.shape}")

# Filter out rows where 'In_Persuade_Corpus' is True
filtered_df = df[df["In_Persuade_Corpus"] == False]
print(f"Filtered dataset shape (excluding 'In_Persuade_Corpus'): {filtered_df.shape}")

# Group by 'score' and sample 5 instances per score for training (smaller sample size)
train_df = filtered_df.groupby("score").apply(lambda x: x.sample(5, random_state=42))
train_df = train_df.droplevel(0).reset_index(drop=True)
print(f"Training dataset shape: {train_df.shape}")

# Create a mask for rows in the training set
train_mask = filtered_df.index.isin(train_df.index)

# Use the remaining rows as the evaluation set
eval_df = filtered_df[~train_mask]
print(f"Evaluation dataset shape: {eval_df.shape}")

# Adjust scores to start from 0
train_df["score"] = train_df["score"] - 1
eval_df["score"] = eval_df["score"] - 1
print("Adjusted scores in training dataset:", train_df["score"].unique())
print("Adjusted scores in evaluation dataset:", eval_df["score"].unique())

# Prepare the dataset for SoftmaxLoss
# Duplicate `full_text` as sentence1 and sentence2 for compatibility with SoftmaxLoss
train_df["sentence1"] = train_df["full_text"]
train_df["sentence2"] = train_df["full_text"]

eval_df["sentence1"] = eval_df["full_text"]
eval_df["sentence2"] = eval_df["full_text"]

# Convert to Dataset format
train_dataset = Dataset.from_pandas(train_df[["sentence1", "sentence2", "score"]])
eval_dataset = Dataset.from_pandas(eval_df[["sentence1", "sentence2", "score"]])

# Prepare the loss function
embedding_dim = model.get_sentence_embedding_dimension()
num_labels = train_df["score"].nunique()
print(f"Number of labels (scores): {num_labels}")

softmax_loss = SoftmaxLoss(
    model, sentence_embedding_dimension=embedding_dim, num_labels=num_labels
)

# Define evaluation metrics
evaluators = []
evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1] if scores else 0)

# Define training arguments
from sentence_transformers import SentenceTransformerTrainingArguments

# Reduce batch size and number of epochs for CPU efficiency
training_args = SentenceTransformerTrainingArguments(
    output_dir=f"./{base_model_name}-fine-tuned",
    overwrite_output_dir=True,
    eval_strategy="steps",  # Updated deprecated argument
    eval_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_sequential_score",
    greater_is_better=True,
    per_device_train_batch_size=2,  # Reduced batch size
    per_device_eval_batch_size=2,
    num_train_epochs=3,  # Fewer epochs
    warmup_steps=100,  # Adjusted warmup steps
    logging_dir="./logs",
    logging_steps=100,
    save_steps=1000,
)

# Define callbacks
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01,
)

# Train the model
trainer = SentenceTransformerTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=softmax_loss,
    args=training_args,
    evaluator=evaluator,
    callbacks=[early_stopping],
)

print("Starting CPU-optimized training...")
trainer.train()

# Evaluate the model
print("Evaluating the model...")
eval_results = trainer.evaluate(eval_dataset)
print("Evaluation Results:", eval_results)

# Save the model locally
#trainer.save_model("./fine_tuned_model")
print("Model saved locally!")


Loaded model: HSLU-AICOMP-LearningAgencyLab/automated-essay-scoring-setfit (CPU-only)
Loaded dataset: HSLU-AICOMP-LearningAgencyLab/featuers_labels_combined
Dataset shape: (13845, 39)
Filtered dataset shape (excluding 'In_Persuade_Corpus'): (3485, 39)
Training dataset shape: (30, 39)
Evaluation dataset shape: (3477, 39)
Adjusted scores in training dataset: [0 1 2 3 4 5]
Adjusted scores in evaluation dataset: [3 1 2 4 0 5]


  train_df = filtered_df.groupby("score").apply(lambda x: x.sample(5, random_state=42))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df["score"] = eval_df["score"] - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df["sentence1"] = eval_df["full_text"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df["sentence2"] = eval_df["f

Number of labels (scores): 6
Starting CPU-optimized training...


  0%|          | 0/45 [00:00<?, ?it/s]

In [27]:
print("Unique scores in training dataset:", train_df["score"].unique())
print("Unique scores in evaluation dataset:", eval_df["score"].unique())


Unique scores in training dataset: [1 2 3 4 5 6]
Unique scores in evaluation dataset: [4 2 3 5 1 6]
