# Load Data
- Preprocessing Functions
    - Fixing encoding and decoding errors
    - Normalization
    - Normalize whitespace and adjusts punctuation formatting, preparing the text for model input.
- Split into train and validation sets

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
import warnings
warnings.filterwarnings("ignore")
import pandas as pd 
from sklearn.model_selection import train_test_split
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
import re
import torch
from transformers import Trainer
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup, EarlyStoppingCallback
from sklearn.metrics import cohen_kappa_score
import numpy as np

# Load the dataset
df = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv")
df['label'] = df['score'] - 1  # Offset scores by 1 (0 to 5) 

# Preprocessing Functions
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register encoding handlers
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve encoding problems and normalize abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

df['full_text'] = df['full_text'].apply(lambda x : resolve_encodings_and_normalize(x))

def preprocess_essay_text(text: str) -> str:
    """
    Prepares essay text for scoring by cleaning non-essential issues without altering quality indicators.
    - Resolves encoding issues
    - Normalizes whitespace
    - Preserves original spelling, grammar, and casing
    """
    text = resolve_encodings_and_normalize(text)
    text = re.sub(r'\s+', ' ', text.strip())  # Normalize whitespace
    text = re.sub(r'\s+([?.!,"])', r'\1', text)  # Remove spaces before punctuation
    text = re.sub(r',([^\s])', r', \1', text)    # Add space after commas
    return text

df['full_text'] = df['full_text'].apply(preprocess_essay_text)

In [2]:
# Split 
train, valid = train_test_split(df, test_size=0.15, stratify=df['score'], random_state=42)

In [3]:
!pip install transformers datasets evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# Load the Tokenizer
- Create train and test sets for model training

In [4]:
# Tokenizer and Model Setup
model_id = 'HuggingFaceTB/SmolLM2-360M-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_id)

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['full_text']
        label = self.dataframe.iloc[idx]['label']
        
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        label_tensor = torch.tensor(label)

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': label_tensor}

# Prepare train and validation datasets
train_dataset = CustomDataset(train, tokenizer)
valid_dataset = CustomDataset(valid, tokenizer)

tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

#### Optimal Padding 

In [5]:
# Data collator for padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation Fn

In [6]:
# Evaluation metric (Quadratic Weighted Kappa)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"quadratic_weighted_kappa": cohen_kappa_score(labels, predictions, weights='quadratic')}


# Load the model

In [7]:
# Model Setup
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=6)

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/724M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-360M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Weights Calculation for each classes

In [8]:
# Compute class weights for imbalanced data
class_counts = train['label'].value_counts()
total_samples = len(train)
class_weights = torch.tensor([total_samples / count for count in class_counts])
print(class_weights)

tensor([  2.7557,   3.6647,   4.4082,  13.8252,  17.8519, 110.6015])


# Loss fn 

In [9]:
# Define loss function with weights
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Training
- Training Args
- Optimizer step up:AdamW
- Learning rate Scheduler
- Early stopping to avoid overfitting 

In [10]:
# Training Arguments with Gradient Clipping and Mixed Precision

training_args = TrainingArguments(
    output_dir="/kaggle/working/Smollm2_360M",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="quadratic_weighted_kappa",
    fp16=True,  # Mixed precision training
    max_grad_norm=1.0,  # Gradient clipping
)

# Optimizer and Scheduler Setup
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
total_steps = len(train_dataset) * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_steps
)

# Initialize Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset, 
    eval_dataset=valid_dataset,  
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
# Training 
trainer.train()

Epoch,Training Loss,Validation Loss,Quadratic Weighted Kappa
1,0.8893,0.875714,0.770981
2,0.7021,0.857909,0.79274
3,0.3151,1.380414,0.779519


TrainOutput(global_step=5517, training_loss=0.6690441616985576, metrics={'train_runtime': 12181.01, 'train_samples_per_second': 3.623, 'train_steps_per_second': 0.453, 'total_flos': 4.26550442950656e+16, 'train_loss': 0.6690441616985576, 'epoch': 3.0})

In [12]:
# Save model and tokenizer
model.save_pretrained("/kaggle/working/saved_model")
tokenizer.save_pretrained("/kaggle/working/saved_model")

print("Model and tokenizer saved to '/kaggle/working/saved_model'")

Model and tokenizer saved to '/kaggle/working/saved_model'


# Script for making predictions on test sets

In [13]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Custom Dataset for Prediction
class PredictionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['full_text']
        
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        
        return {'input_ids': input_ids, 'attention_mask': attention_mask}

# Load the saved model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

# Load test data
test_data = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv")

# Prepare test dataset
test_dataset = PredictionDataset(test_data, tokenizer)

# DataLoader for test set
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Perform inference
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predicted_labels = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        predicted_labels.extend(predictions)

# Shift back to 1-6 score
predicted_labels = np.array(predicted_labels) + 1

# Save predictions
test_data['score'] = predicted_labels
test_data[['essay_id', 'score']].to_csv('submission.csv', index=False)

print("Predictions saved to 'submission.csv'")

Predictions saved to 'submission.csv'
