In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For data visualization

import matplotlib.pyplot as plt
import matplotlib as mpl

# ! if  ! ( pip freeze | grep -E "datasets" ) ; then pip install datasets; fi
# %pip install bitsandbytes
# import bitsandbytes

# from datasets import Dataset

import torch 
from torch import nn, Tensor
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

from tqdm import tqdm #prog bar

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Upper bound handed to the tokenizer as `max_length` in `tokenize`, which is
# called with truncation enabled.
MAX_LENGTH = 700

# Set Device

In [None]:
# Use the GPU when PyTorch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

# Read data

In [None]:
# Directory of the competition files; the trailing slash lets file names be appended directly.
DATA_DIR = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
df, test_df = (pd.read_csv(DATA_DIR + csv_name) for csv_name in ("train.csv", "test.csv"))

# Report the number of training rows and show the first two of them.
print("# Train Data: {:,}".format(df.shape[0]))
print("# Sample:")
display(df.iloc[:2])

# Data split

In [None]:
from sklearn.model_selection import train_test_split  # Import package

# Hold out 20% of the rows for validation, stratified on the score column so
# both parts keep the same score proportions; the seed makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["score"],
)

# EDA

In [None]:
# Colormap shared by both plots. `plt.get_cmap` is used because
# `mpl.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('coolwarm')

# Bar chart of how many essays received each score.
# `value_counts()` orders by frequency, so sort by the score itself to get a
# readable x-axis, and draw one evenly spaced colour per distinct score.
score_counts = df.score.value_counts().sort_index()
plt.figure(figsize=(8, 4))
score_counts.plot.bar(color=[cmap(x) for x in np.linspace(0.0, 1.0, len(score_counts))])
plt.xlabel("Score")
plt.ylabel("Count")
plt.title("Score distribution for Train Data")
plt.show()

# Histogram of essay length, where length is `len()` of each `full_text` entry.
# NOTE: this adds an `essay_length` column to `df` in place.
plt.figure(figsize=(8, 4))
df['essay_length'] = df.full_text.map(len)
df.essay_length.plot.hist(logy=False, color=cmap(0.9))
plt.xlabel("Essay Length")
plt.ylabel("Count")
plt.title("Essay Length distribution for Train Data")
plt.show()

# Load DebertaV3 Tokenizer

In [None]:
# Load the tokenizer published under the "microsoft/deberta-v3-base" name
# (the same name used later to load the classification model).
deberta_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Dataset

In [None]:
# print(Dataset.from_pandas(df).remove_columns(['essay_id', 'full_text']))

In [None]:
def tokenize(sample, tokenizer):
    """Encode `sample` with `tokenizer`, padding and truncating to MAX_LENGTH.

    Args:
        sample: Text input forwarded unchanged to the tokenizer (callers in
            this notebook pass a list of essay strings).
        tokenizer: Callable tokenizer accepting `max_length`, `padding` and
            `truncation` keyword arguments.

    Returns:
        Whatever the tokenizer call returns.
    """
    encoding = tokenizer(
        sample,
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
    )
    return encoding

## Set up Dataset for each model

In [None]:
# Preview the first rows of the training frame (it now also carries the
# `essay_length` column added in the EDA cell).
df.head()

In [None]:
class TokenDataset(torch.utils.data.Dataset):
    """Map-style dataset holding pre-tokenized essays and their zero-based scores.

    All essays are tokenized once, at construction time; indexing afterwards
    only slices the stored arrays.
    """

    def __init__(self, text_df, tokenize_func):
        """Tokenize every essay of `text_df` and store the resulting arrays.

        Args:
            text_df: DataFrame with a "full_text" column and a "score" column.
            tokenize_func: Callable taking the list of essay texts and returning
                an object exposing `input_ids` and `attention_mask` attributes.
        """
        essays = text_df["full_text"].tolist()
        encoding = tokenize_func(essays)
        self.input_ids = np.array(encoding.input_ids)
        self.attention_mask = np.array(encoding.attention_mask)
        # Shift the scores down by one so that the lowest label becomes 0.
        self.scores = text_df["score"].to_numpy() - 1

    def __len__(self):
        """Return the number of stored essays."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return `(input_ids, attention_mask, score, idx)` for position `idx`."""
        ids = self.input_ids[idx]
        mask = self.attention_mask[idx]
        score = self.scores[idx]
        return ids, mask, score, idx

In [None]:
# engessay_ds = TokenDataset(train_df, lambda sample : tokenize(sample, engessay_tokenizer))
# engessay_ds_val = TokenDataset(val_df, lambda sample : tokenize(sample, engessay_tokenizer))
# Build the training and validation datasets. Each split is tokenized once,
# inside TokenDataset.__init__, using the DeBERTa tokenizer via `tokenize`.
deberta_ds = TokenDataset(train_df, lambda sample : tokenize(sample, deberta_tokenizer))
deberta_ds_val = TokenDataset(val_df, lambda sample : tokenize(sample, deberta_tokenizer))

In [None]:
# engessay_ds = Dataset.from_pandas(df).map(lambda sample : tokenize(sample, engessay_tokenizer)).remove_columns(['essay_id', 'attention_mask'])
# engessay_ds = engessay_ds.train_test_split(test_size=0.2, seed=42)
# deberta_ds = Dataset.from_pandas(df).map(lambda sample : tokenize(sample, deberta_tokenizer)).remove_columns(['essay_id', 'attention_mask'])
# deberta_ds = deberta_ds.train_test_split(test_size=0.2, seed=42)

# Define Metric QWK

## Quadratic Weighted Kappa Score

In [None]:
# thanks will x2
def quadratic_weighted_kappa(preds, actuals, num_classes=6):
    """Compute the quadratic weighted kappa (QWK) between two label sequences.

    QWK is ``1 - sum(W * O) / sum(W * E)`` where ``O`` is the observed
    confusion matrix, ``E`` is the confusion matrix expected from the two
    label histograms alone, and ``W[i, j] = (i - j)**2 / (num_classes - 1)**2``.
    A value of 1 means perfect agreement, 0 chance-level agreement, and
    negative values worse-than-chance agreement.

    The previous implementation returned only ``sum(W * O)`` (the summed
    penalty), which grows with the batch size and is not a kappa score.

    Args:
        preds: Predicted zero-based class labels (torch tensor or array-like).
        actuals: True zero-based class labels, same number of elements.
        num_classes: Number of distinct classes; labels must lie in
            ``[0, num_classes)``. Defaults to 6, matching the six penalty
            levels used previously.

    Returns:
        The kappa score as a Python float. Returns 0.0 for empty input (as
        the previous implementation did) and 1.0 when the expected
        disagreement is zero, i.e. all labels and all predictions fall in the
        same single class.

    Raises:
        ValueError: If the inputs differ in length, ``num_classes < 2``, or
            any label is outside ``[0, num_classes)``.
    """
    def _as_labels(values):
        # Accept torch tensors (possibly on GPU / tracking grad) and array-likes.
        if isinstance(values, torch.Tensor):
            values = values.detach().cpu().numpy()
        return np.asarray(values).astype(np.int64).reshape(-1)

    if num_classes < 2:
        raise ValueError("num_classes must be at least 2")

    preds = _as_labels(preds)
    actuals = _as_labels(actuals)
    if preds.shape != actuals.shape:
        raise ValueError("preds and actuals must contain the same number of labels")
    if preds.size == 0:
        return 0.0
    lowest = min(preds.min(), actuals.min())
    highest = max(preds.max(), actuals.max())
    if lowest < 0 or highest >= num_classes:
        raise ValueError("labels must lie in [0, num_classes)")

    # Observed confusion matrix: rows are true labels, columns are predictions.
    observed = np.zeros((num_classes, num_classes), dtype=np.float64)
    np.add.at(observed, (actuals, preds), 1.0)

    # Quadratic disagreement weights, normalised so the largest penalty is 1.
    classes = np.arange(num_classes)
    weights = (classes[:, None] - classes[None, :]) ** 2 / float((num_classes - 1) ** 2)

    # Confusion matrix expected if predictions were independent of the truth.
    expected = np.outer(observed.sum(axis=1), observed.sum(axis=0)) / preds.size

    expected_penalty = float((weights * expected).sum())
    if expected_penalty == 0.0:
        # Only possible when everything sits in one diagonal cell: full agreement.
        return 1.0
    observed_penalty = float((weights * observed).sum())
    return 1.0 - observed_penalty / expected_penalty

In [None]:
def tensor_QWK(y_true: torch.Tensor, y_pred: torch.Tensor) -> float:
    """Score predictions against true labels with `quadratic_weighted_kappa`.

    Both tensors are moved to the CPU and detached from the autograd graph
    before being handed to the metric.

    Args:
        y_true: Tensor of true class labels.
        y_pred: Tensor of predicted class labels, same shape as `y_true`.

    Returns:
        The scalar returned by `quadratic_weighted_kappa` (a plain number,
        not a tensor).

    Raises:
        AssertionError: If the two tensors differ in shape.
    """
    assert y_true.shape == y_pred.shape
    y_true = y_true.cpu().detach()
    y_pred = y_pred.cpu().detach()
    # Pass by keyword so predictions and labels land on the matching parameters
    # (they were previously passed positionally in swapped order).
    return quadratic_weighted_kappa(preds=y_pred, actuals=y_true)

# Training DeBERTa

In [None]:
# Credits to Will!
class EMAMetric:
    """Exponential moving average of a scalar metric.

    Args:
        value: Initial value; `None` means "no observation yet", in which
            case the first update is stored as-is.
        gamma: Weight kept from the previous average on each update; the new
            observation gets weight `1 - gamma`.
        sigfigs: Number of decimal places passed to `round` when the metric
            is rendered with `repr` (despite the name, not significant figures).
    """
    def __init__(self, value = None, gamma = 0.95, sigfigs = 3):
        self.value = value
        self.gamma = gamma
        self.sigfigs = sigfigs

    def set(self, update):
        """Fold `update` into the running average and return the new average."""
        if self.value is None:
            # First observation seeds the average directly.
            self.value = update
        else:
            self.value = self.gamma * self.value + (1 - self.gamma) * update
        return self.value

    def __repr__(self):
        # Guard the unset state: round(None, ...) would raise TypeError.
        if self.value is None:
            return "None"
        return str(round(self.value, self.sigfigs))

import collections
class History:
    """Step-by-step record of smoothed training and validation metrics.

    Metrics whose name contains "val" are stored in `val_metrics`; all
    others are stored in `train_metrics`. Both map a metric name to the list
    of values recorded so far.
    """
    def __init__(self):
        self.train_metrics = collections.defaultdict(list)
        self.val_metrics = collections.defaultdict(list)

    def append(self, **kwargs):
        """Record the current value of each given metric.

        Args:
            **kwargs: Metric name -> `EMAMetric`; the metric's current
                `.value` is appended to that name's history.

        Raises:
            AssertionError: If any value is not an `EMAMetric`.
        """
        assert all(isinstance(value, EMAMetric) for value in kwargs.values()), "All values must be EMAMetric objects"
        for key, value in kwargs.items():
            target = self.val_metrics if "val" in key else self.train_metrics
            target[key].append(value.value)

    def plot(self):
        """Plot training metrics (top panel) and validation metrics (bottom panel).

        Drawn with Matplotlib only: the previous version called `sns`, which
        is never imported in this notebook, so it raised `NameError`.
        """
        fig, axes = plt.subplots(2, figsize = (10, 10))
        panels = (
            (axes[0], self.train_metrics, "Training metrics"),
            (axes[1], self.val_metrics, "Validation metrics"),
        )
        for ax, metrics, title in panels:
            for key, values in metrics.items():
                # Values are plotted against their position in the list.
                ax.plot(values, label = key)
            ax.set_title(title)
            ax.set_xlabel("Step")
            if metrics:
                # A legend on each panel; skipped when there is nothing to label.
                ax.legend()
        plt.show()


In [None]:
def train(clf, train_dataset, val_dataset, hyperparameters, optimizer, PATH):
    """Fine-tune a sequence classifier and checkpoint the best epoch.

    Each epoch runs one pass over the training data followed by one pass
    over the validation data. After validation, the model and optimizer
    states are written to `PATH` whenever the epoch's mean validation loss
    is the lowest seen so far.

    Changes from the previous version:
      * The model is switched to train mode for the training pass and to eval
        mode for validation; before, its mode was never set.
      * Checkpoint selection uses the sample-weighted mean validation loss of
        the epoch instead of the exponential moving average, which mixed in
        earlier epochs and mostly reflected the last few batches.

    Args:
        clf: Model called as `clf(input_ids, attention_mask=...)` whose
            output exposes `.logits`.
        train_dataset: Dataset yielding `(input_ids, attention_mask, labels, idx)`.
        val_dataset: Dataset with the same item layout, used for validation.
        hyperparameters: Dict read for "epochs" (default 10) and
            "batch_size" (default 32).
        optimizer: Optimizer over the parameters of `clf`.
        PATH: Destination file for the best checkpoint.

    Returns:
        A `History` holding the smoothed per-step training and validation metrics.

    Relies on the module-level `device` for tensor placement.
    """
    train_metrics = {
        "loss": EMAMetric(), "acc": EMAMetric(), "QWK": EMAMetric(),
    }
    val_metrics = {
        "val_loss": EMAMetric(), "val_acc": EMAMetric(), "val_QWK": EMAMetric()
    }
    history = History()

    EPOCHS = hyperparameters.get("epochs", 10)
    BATCH_SIZE = hyperparameters.get("batch_size", 32)

    train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)
    val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = False)
    CELoss = torch.nn.CrossEntropyLoss()

    best_val_loss = float("inf")

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}")

        # ---- training pass ----
        clf.train()
        with tqdm(train_loader) as pbar:
            for input_ids, attention_mask, labels, _ in pbar:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                num_logits = clf(input_ids, attention_mask=attention_mask).logits
                loss = CELoss(num_logits, labels)
                loss.backward()
                optimizer.step()

                predictions = torch.argmax(num_logits, dim = -1)
                acc = (predictions == labels).to(torch.float32).mean()
                q_weighted_kappa = tensor_QWK(labels, predictions)
                train_metrics["loss"].set(loss.item())
                train_metrics["acc"].set(acc.item())
                train_metrics["QWK"].set(q_weighted_kappa)
                history.append(**train_metrics)
                desc = f"Loss: {train_metrics['loss']}, Accuracy: {train_metrics['acc']}, QWK: {train_metrics['QWK']}"
                pbar.set_description(desc)

        # ---- validation pass ----
        clf.eval()
        val_loss_sum = 0.0   # sum of per-sample losses over the epoch
        val_samples = 0      # number of validation samples seen
        with torch.no_grad():
            with tqdm(val_loader) as pbar:
                for input_ids, attention_mask, labels, _ in pbar:
                    input_ids = input_ids.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)
                    num_logits = clf(input_ids, attention_mask=attention_mask).logits
                    loss = CELoss(num_logits, labels)

                    predictions = torch.argmax(num_logits, dim = -1)
                    acc = (predictions == labels).to(torch.float32).mean()
                    q_weighted_kappa = tensor_QWK(labels, predictions)

                    # CrossEntropyLoss averages over the batch; weight by the
                    # batch size so a smaller final batch does not skew the mean.
                    batch_size = labels.size(0)
                    val_loss_sum += loss.item() * batch_size
                    val_samples += batch_size

                    val_metrics["val_loss"].set(loss.item())
                    val_metrics["val_acc"].set(acc.item())
                    val_metrics["val_QWK"].set(q_weighted_kappa)
                    history.append(**val_metrics)
                    desc = f"Val Loss: {val_metrics['val_loss']}, Val Accuracy: {val_metrics['val_acc']}, Val QWK: {val_metrics['val_QWK']}"
                    pbar.set_description(desc)

        if val_samples == 0:
            # No validation data: nothing to compare, so no checkpoint this epoch.
            continue

        val_loss = val_loss_sum / val_samples
        # Save the model if this epoch's validation loss is lower than the best so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss

            torch.save({
                        'epoch': epoch,
                        'model_state_dict': clf.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': val_loss,
                        }, PATH)

    return history

### Run Train Loop

In [None]:

# Check if the checkpoint file exists
checkpoint_path = '/kaggle/input/deberta_e1/pytorch/epoch1/1/deberta_model.pt'
# Fail with an explicit exception instead of `assert`, which is skipped when
# Python runs with optimizations enabled.
if not os.path.isfile(checkpoint_path):
    raise FileNotFoundError(f"Checkpoint file '{checkpoint_path}' not found")

# Initialize the model (six output labels) and its optimizer
deberta_model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base", num_labels=6).to(device)

optimizer = torch.optim.AdamW(deberta_model.parameters(), lr = 1e-5)

# Load the checkpoint onto the active device.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints from a trusted source; consider `weights_only=True` on
# PyTorch versions that support it.
checkpoint = torch.load(checkpoint_path, map_location=device)

# Epoch index stored in the checkpoint.
# NOTE(review): `train` always counts epochs from 0, so this value is only
# reported below and does not shorten the upcoming run.
epoch = checkpoint['epoch']

# Restore the model weights and the optimizer state
deberta_model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Validation loss recorded when the checkpoint was written
val_loss = checkpoint['loss']

print(f"Checkpoint loaded: epoch {epoch}, loss {val_loss}")

In [None]:
# Training settings read by `train` through the "epochs" and "batch_size" keys.
hyperparameters = dict(epochs=16, batch_size=8)

In [None]:
# Fine-tune the DeBERTa classifier on the training split, validating on the
# held-out split; the best checkpoint is written to "deberta_model.pt".
history = train(deberta_model, deberta_ds, deberta_ds_val, hyperparameters=hyperparameters, optimizer=optimizer, PATH="deberta_model.pt")
# Plot the recorded training and validation metric curves.
history.plot()

## Define Features Dataset with Engessay model breakdowns

In [None]:
# import torch

# class FeaturesDataset(torch.utils.data.Dataset):
#     def __init__(self, breakdown, scores):
#         self.breakdown = breakdown
#         self.scores = np.array(scores)
    
#     def __getitem__(self, index):
#         breakdown = torch.tensor(self.breakdown[index])  # Convert to tensor
#         score = self.scores[index]
#         return breakdown, score
 
#     def __len__(self):
#         return len(self.breakdown)

In [None]:
# def run_inference(model, dataset, batch_size=32):
#     device = next(model.parameters()).device
#     model.eval()
    
#     all_breakdowns = []
#     all_indices = []
    
#     dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    
#     with torch.no_grad():
#         for input_ids, attention_mask, _, indices in tqdm(dataloader):
#             input_ids = input_ids.to(device)
#             attention_mask = attention_mask.to(device)
            
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#             breakdowns = outputs.logits  # Assuming the model outputs the linear layer
            
#             all_breakdowns.extend(breakdowns.cpu().numpy())
#             all_indices.extend(indices.numpy())
    
#     sorted_indices, sorted_breakdowns = zip(*sorted(zip(all_indices, all_breakdowns)))
    
#     return np.array(sorted_breakdowns)

## Get Engessay predictions

In [None]:
# def run_inference_and_save_to_csv(model, dataset, output_csv_file):
#     # Run inference to get breakdown features
#     breakdown_features = run_inference(model, dataset)
    
#     # Convert to DataFrame
#     breakdown_df = pd.DataFrame(breakdown_features)
    
#     # Save DataFrame to CSV
#     breakdown_df.to_csv(output_csv_file, index=False)
    
#     return breakdown_features

In [None]:
# def load_or_run_inference_and_save_to_csv(model, dataset, output_csv_file):
#     # Check if the CSV file exists
#     if not os.path.exists(output_csv_file):
#         # If the CSV file doesn't exist, run inference and save to CSV
#         run_inference_and_save_to_csv(model, dataset, output_csv_file)
#     else:
#         # If the CSV file exists, load breakdown features from CSV
#         breakdown_features = load_breakdown_features_from_csv(output_csv_file)
#         return breakdown_features

In [None]:
# breakdown_features = load_or_run_inference_and_save_to_csv(engessay_model, engessay_ds, "/kaggle/input/aes2-breakdown-features/breakdown_features.csv")

# # Create FeaturesDataset
# features_dataset = FeaturesDataset(breakdown=breakdown_features, scores=train_df["score"])
# del breakdown_features

# # Run inference to get breakdown features
# breakdown_features_val = load_or_run_inference_and_save_to_csv(engessay_model, engessay_ds_val, "/kaggle/input/aes2-breakdown-features/breakdown_features_val.csv")

# # Create FeaturesDataset
# features_dataset_val = FeaturesDataset(breakdown=breakdown_features_val, scores=val_df["score"])
# del breakdown_features_val

### Take a look at the features

In [None]:
# # Select a subset of indices to visualize (e.g., the first 5)
# indices_to_visualize = range(5)

# # Iterate through the subset of indices and print the corresponding data
# for idx in indices_to_visualize:
#     # Get data from FeaturesDataset
#     breakdown, score = features_dataset[idx]
    
#     # Get full text from train_df
#     full_text = train_df.iloc[idx]["full_text"]
    
#     # Print the data side by side
#     print(f"Index: {idx}")
#     print("Breakdown features:", breakdown)
#     print("Score:", score)
#     print("Full text:", full_text)
#     print("===========================================")

In [None]:

# # Define the number of features
# num_features = len(features_dataset.breakdown[0])

# import math

# # Calculate the number of rows and columns for subplots
# num_rows = math.ceil(num_features / 3)
# num_cols = min(num_features, 3)

# feature_names = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar",  "conventions"]

# # Create subplots
# fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))

# # Iterate over features
# for i in range(num_features):
#     row_idx = i // num_cols
#     col_idx = i % num_cols
    
#     feature_values = []
#     for data in features_dataset:
#         breakdown, score = data
#         feature_value = breakdown[i].item()  # Get the value of the i-th feature
#         feature_values.append(feature_value)
    
#     # Plot scatter plot
#     ax = axes[row_idx, col_idx] if num_rows > 1 else axes[col_idx]
#     ax.scatter(feature_values, features_dataset.scores, alpha=0.01)
#     ax.set_title(f"Correlation between Feature {feature_names[i]} and Score")
#     ax.set_xlabel(f"Feature {feature_names[i]} Value")
#     ax.set_ylabel("Score")

# # Hide empty subplots
# for i in range(num_features, num_rows*num_cols):
#     row_idx = i // num_cols
#     col_idx = i % num_cols
#     ax = axes[row_idx, col_idx] if num_rows > 1 else axes[col_idx]
#     ax.axis('off')

# plt.tight_layout()
# plt.show()


In [None]:
# # Convert data to numpy arrays
# feature_values = np.array(features_dataset.breakdown)
# scores_array = np.array(features_dataset.scores)

# # print(feature_values.shape)
# # print(scores_array.shape)

# # Concatenate features and scores into a single array for each data point
# data_array = np.concatenate((feature_values, scores_array[:, np.newaxis]), axis=1)
# # print(data_array.shape)

# # Calculate correlation matrix
# correlation_matrix = np.corrcoef(data_array.T)

# # print(correlation_matrix)

# # Plot heatmap
# plt.figure(figsize=(8, 2))
# plt.imshow(correlation_matrix[:-1, -1][:, np.newaxis].T, cmap='viridis', aspect='auto')
# plt.colorbar(label='Correlation')
# plt.title("Correlation between Features and Score")
# plt.xlabel("Features")
# plt.ylabel("Score")
# plt.xticks(np.arange(feature_values.shape[1]), feature_names, rotation=45, ha='right')
# plt.yticks([])
# plt.show()

# # Prepare the correlation coefficients data
# correlation_data = [[feature_name, correlation_matrix[:-1, -1][i]] for i, feature_name in enumerate(feature_names)]

# # Print the correlation coefficients table
# print("Feature              | Correlation with Score")
# print("---------------------------------------------------")
# for feature, correlation in correlation_data:
#     print(f"{feature:<20} | {correlation:.4f}")