In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# for data vis

import matplotlib.pyplot as plt
import matplotlib as mpl

import seaborn as sns

# ! if  ! ( pip freeze | grep -E "datasets" ) ; then pip install datasets; fi
# %pip install bitsandbytes
# import bitsandbytes

# from datasets import Dataset

import torch 
from torch import nn, Tensor
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

from tqdm import tqdm #prog bar

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = (os.path.join(dirname, filename) for filename in filenames)
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Default `max_length` handed to the tokenizer by `tokenize()` when the caller
# does not override it.
MAX_LENGTH = 700

# Set Device

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # echo the selected device as the cell output

# Read data

In [None]:
# Directory holding the competition CSV files.
DATA_DIR = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
df = pd.read_csv(DATA_DIR + "train.csv")  # later cells read its `full_text` and `score` columns
test_df = pd.read_csv(DATA_DIR + 'test.csv')

# Report how many training rows were read and preview the first two.
print("# Train Data: {:,}".format(len(df)))
print("# Sample:")
display(df.head(2))

# Data split

In [None]:
from sklearn.model_selection import train_test_split  # Import package

# Hold out 20% of the rows for validation, stratified on `score`, with a fixed
# seed so the split is the same on every run.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["score"], random_state=42)

In [None]:
# Colormap shared by both plots. `mpl.cm.get_cmap` was deprecated in
# Matplotlib 3.7 and removed in 3.9; `plt.get_cmap` works on old and new versions.
cmap = plt.get_cmap('coolwarm')

# Show the distribution of scores using a bar plot. Bars are ordered by score
# (not by frequency) and get one colour per distinct score, so the cool-to-warm
# gradient follows the score scale instead of cycling through a fixed 5-colour list.
score_counts = df.score.value_counts().sort_index()
score_colors = [cmap(position) for position in np.linspace(0.0, 1.0, len(score_counts))]
plt.figure(figsize=(8, 4))
score_counts.plot.bar(color=score_colors)
plt.xlabel("Score")
plt.ylabel("Count")
plt.title("Score distribution for Train Data")
plt.show()

# Show the distribution of essay length (in characters) using a histogram.
# NOTE: this adds an `essay_length` column to `df` in place.
plt.figure(figsize=(8, 4))
df['essay_length'] = df.full_text.map(len)
df.essay_length.plot.hist(logy=False, color=cmap(0.9))
plt.xlabel("Essay Length")
plt.ylabel("Count")
plt.title("Essay Length distribution for Train Data")
plt.show()

# Use Kevintu/Engessay_grading_ML for intermediate metrics

In [None]:
# Hugging Face access token. It is read from the HF_TOKEN environment variable
# instead of being pasted into the notebook, so the credential is never saved
# with the file. When the variable is unset, `None` is passed and the library
# falls back to its own default credential lookup.
access_token = os.environ.get("HF_TOKEN") or None
engessay_model = AutoModelForSequenceClassification.from_pretrained("Kevintu/Engessay_grading_ML", token=access_token).to(device)
engessay_tokenizer = AutoTokenizer.from_pretrained("Kevintu/Engessay_grading_ML", token=access_token)

# Reference usage example (not executed). It is kept as comments rather than a
# bare string literal so that the cell does not echo it as its output. Note that
# `tokenizer` / `model` in the example are not names defined in this notebook.
#
# new_text = "The English Language Learner Insight, Proficiency and Skills Evaluation (ELLIPSE) Corpus is a freely available corpus of ~6,500 ELL writing samples that have been scored for overall holistic language proficiency as well as analytic proficiency scores related to cohesion, syntax, vocabulary, phraseology, grammar, and conventions. In addition, the ELLIPSE corpus provides individual and demographic information for the ELL writers in the corpus including economic status, gender, grade level (8-12), and race/ethnicity. The corpus provides language proficiency scores for individual writers and was developed to advance research in corpus and NLP approaches to assess overall and more fine-grained features of proficiency."
#
# encoded_input = tokenizer(new_text, return_tensors='pt', padding=True, truncation=True, max_length=64)
# model.eval()
#
# # Perform the prediction
# with torch.no_grad():
#     outputs = model(**encoded_input)
#
# predictions = outputs.logits.squeeze()
#
# predicted_scores = predictions.numpy()
# item_names = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar",  "conventions"]
# for item, score in zip(item_names, predicted_scores):
#     print(f"{item}: {score:.4f}")

##"output" (values ranging from 1 to 5):
#cohesion: 3.5399
#syntax: 3.6380
#vocabulary: 3.9250
#phraseology: 3.8381
#grammar: 3.9194
#conventions: 3.6819

# Load DebertaV3

In [None]:
# Tokenizer for the DeBERTa-v3 base model.
deberta_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Stop here if the saved checkpoint file is not present.
checkpoint_path = '/kaggle/input/deberta_e1/pytorch/2/1/deberta_model (1).pt'
assert os.path.isfile(checkpoint_path), f"Checkpoint file '{checkpoint_path}' not found"

# Build the 6-label sequence-classification model and move it to the device.
deberta_model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base",
    num_labels=6,
).to(device)

# optimizer = AdamW(deberta_model.parameters())

# Read the checkpoint dictionary, mapping its tensors onto the current device.
checkpoint = torch.load(checkpoint_path, map_location=device)

# Epoch recorded in the checkpoint.
epoch = checkpoint['epoch']

# Overwrite the model weights with the saved state.
deberta_model.load_state_dict(checkpoint['model_state_dict'])

# The optimizer state is not restored:
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Loss value recorded in the checkpoint.
val_loss = checkpoint['loss']

print(f"Checkpoint loaded: epoch {epoch}, loss {val_loss}")

## Set up Dataset for each model

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
def tokenize(sample, tokenizer, max_length=MAX_LENGTH):
    """Run ``tokenizer`` on ``sample`` with padding and truncation enabled.

    Args:
        sample: Text input forwarded unchanged to ``tokenizer``.
        tokenizer: Callable accepting ``max_length``, ``padding`` and
            ``truncation`` keyword arguments.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns.
    """
    encode_options = dict(max_length=max_length, padding=True, truncation=True)
    return tokenizer(sample, **encode_options)

In [None]:
class TokenDataset(torch.utils.data.Dataset):
    """Dataset of pre-tokenized essays with zero-based score labels.

    Every item is ``(input_ids, attention_mask, score, idx)``, where ``idx`` is
    the index that was requested, so consumers can match results back to rows.
    """

    def __init__(self, text_df, tokenize_func):
        """Tokenize all essays once and cache the resulting arrays.

        Args:
            text_df: DataFrame with ``full_text`` and ``score`` columns.
            tokenize_func: Callable taking the list of essay strings; its result
                must expose ``input_ids`` and ``attention_mask`` attributes.
        """
        essays = text_df["full_text"].tolist()
        encoded = tokenize_func(essays)
        self.input_ids = np.array(encoded.input_ids)
        self.attention_mask = np.array(encoded.attention_mask)
        # Shift the scores down by one so they are 0-indexed.
        self.scores = text_df["score"].to_numpy() - 1

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask, score, idx)`` tuple for ``idx``."""
        sample = (self.input_ids[idx], self.attention_mask[idx], self.scores[idx], idx)
        return sample

    def __len__(self):
        """Number of tokenized essays."""
        return len(self.input_ids)

In [None]:
# Tokenize the train and validation splits once per model. The Engessay
# datasets use max_length=512; the DeBERTa datasets use tokenize()'s default.
engessay_ds = TokenDataset(train_df, lambda sample : tokenize(sample, engessay_tokenizer, max_length=512))
engessay_ds_val = TokenDataset(val_df, lambda sample : tokenize(sample, engessay_tokenizer, max_length=512))
deberta_ds = TokenDataset(train_df, lambda sample : tokenize(sample, deberta_tokenizer))
deberta_ds_val = TokenDataset(val_df, lambda sample : tokenize(sample, deberta_tokenizer))

# Define Metric QWK

## Quadratic Weighted Kappa Score

In [None]:
# thanks will x2
def quadratic_weighted_kappa(preds, actuals):
    """Sum the quadratic disagreement penalty over all prediction/label pairs.

    Despite the name, the value returned is not a kappa statistic: each pair
    contributes ``(|pred - actual| / 5) ** 2`` (looked up from a 6-entry table)
    and the contributions are added up, so 0 means perfect agreement and
    larger values mean worse agreement.

    Args:
        preds: Tensor of predicted scores.
        actuals: Tensor of true scores, same shape as ``preds``.

    Returns:
        The summed penalty. Absolute differences are cast to int32 before the
        lookup, and a difference above 5 falls outside the table.
    """
    penalty_by_diff = np.array([0, 0.04, 0.16, 0.36, 0.64, 1])
    abs_diff = torch.abs(preds - actuals).to(torch.int32)
    per_sample_penalty = penalty_by_diff[abs_diff]
    return sum(per_sample_penalty)

In [None]:
def tensor_QWK(y_true: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor:
    """Evaluate ``quadratic_weighted_kappa`` on detached CPU copies of the inputs.

    Args:
        y_true: Tensor of true scores.
        y_pred: Tensor of predicted scores; must have the same shape as ``y_true``.

    Returns:
        Whatever ``quadratic_weighted_kappa`` returns for the two tensors.

    Raises:
        AssertionError: If the two shapes differ.
    """
    assert y_true.shape == y_pred.shape
    true_cpu, pred_cpu = (tensor.cpu().detach() for tensor in (y_true, y_pred))
    return quadratic_weighted_kappa(true_cpu, pred_cpu)

## Define Features Dataset with Engessay model breakdowns

In [None]:
import torch

class FeaturesDataset(torch.utils.data.Dataset):
    """Dataset pairing one row of model-output features with its score."""

    def __init__(self, breakdown, scores):
        """Store the feature rows as given and the scores as a NumPy array.

        Args:
            breakdown: Indexable collection of per-sample feature rows.
            scores: Per-sample scores, converted with ``np.array``.
        """
        self.scores = np.array(scores)
        self.breakdown = breakdown

    def __getitem__(self, index):
        """Return ``(features, score)``; the features are converted to a tensor."""
        features = torch.tensor(self.breakdown[index])
        return features, self.scores[index]

    def __len__(self):
        """Number of feature rows."""
        return len(self.breakdown)

In [None]:
def run_inference(model, dataset, batch_size=32):
    """Run ``model`` over ``dataset`` and collect its logits per sample.

    Args:
        model: Module whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``logits`` attribute.
        dataset: Dataset yielding ``(input_ids, attention_mask, _, index)`` items.
        batch_size: Number of samples per forward pass.

    Returns:
        ``(indices, breakdowns)`` as NumPy arrays, both ordered by ascending
        sample index. For an empty dataset two empty arrays are returned.
    """
    device = next(model.parameters()).device
    model.eval()

    batch_breakdowns = []
    batch_indices = []

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    with torch.no_grad():
        for input_ids, attention_mask, _, indices in tqdm(dataloader):
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            # Keep whole batches; they are joined once after the loop.
            batch_breakdowns.append(outputs.logits.cpu().numpy())
            batch_indices.append(indices.numpy())

    # Nothing was iterated: return empty results instead of failing on unpacking.
    if not batch_indices:
        return np.empty(0, dtype=np.int64), np.empty((0, 0))

    all_indices = np.concatenate(batch_indices)
    all_breakdowns = np.concatenate(batch_breakdowns)

    # Order by sample index. argsort only ever compares the indices, so the
    # logit rows are never compared with each other.
    order = np.argsort(all_indices, kind="stable")
    return all_indices[order], all_breakdowns[order]

In [None]:
def combine_breakdowns(deberta_breakdowns, engessay_breakdowns, deberta_indices, engessay_indices):
    """Join the per-sample DeBERTa and Engessay outputs into one feature matrix.

    Args:
        deberta_breakdowns: 2-D array-like, one row of DeBERTa outputs per sample.
        engessay_breakdowns: 2-D array-like, one row of Engessay outputs per sample.
        deberta_indices: Sample indices matching ``deberta_breakdowns`` rows.
        engessay_indices: Sample indices matching ``engessay_breakdowns`` rows.

    Returns:
        ``(combined_breakdowns, combined_indices)``: each combined row is the
        DeBERTa row followed by the Engessay row; the indices are a copy of
        ``deberta_indices``.

    Raises:
        AssertionError: If the two index sequences are not identical.
        ValueError: If the row counts do not match the number of indices.
    """
    deberta_indices = np.asarray(deberta_indices)
    engessay_indices = np.asarray(engessay_indices)

    # array_equal also returns False for different lengths, where an elementwise
    # `==` would not yield a boolean array to call `.all()` on.
    assert np.array_equal(deberta_indices, engessay_indices), "Indices for deberta_ds and engessay_ds should be aligned"

    deberta_breakdowns = np.asarray(deberta_breakdowns)
    engessay_breakdowns = np.asarray(engessay_breakdowns)
    if not (len(deberta_breakdowns) == len(engessay_breakdowns) == len(deberta_indices)):
        raise ValueError(
            "Row count mismatch: "
            f"{len(deberta_breakdowns)} deberta rows, "
            f"{len(engessay_breakdowns)} engessay rows, "
            f"{len(deberta_indices)} indices"
        )

    # One column-wise concatenation replaces the per-row Python loop.
    combined_breakdowns = np.concatenate((deberta_breakdowns, engessay_breakdowns), axis=1)
    combined_indices = np.array(deberta_indices)

    return combined_breakdowns, combined_indices


In [None]:
# Column names given to the Engessay outputs when the feature DataFrame is built.
# NOTE(review): assumes the model's logits come in exactly this order - confirm.
engessay_feature_names = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar",  "conventions"]

In [None]:
# Collect both models' logits for the training split, ordered by sample index.
engessay_indices, engessay_breakdowns = run_inference(engessay_model, engessay_ds, batch_size=32)
deberta_indices, deberta_breakdowns = run_inference(deberta_model, deberta_ds, batch_size=25)

In [None]:
# Collect both models' logits for the validation split, ordered by sample index.
engessay_indices_val, engessay_breakdowns_val = run_inference(engessay_model, engessay_ds_val, batch_size=32)
deberta_indices_val, deberta_breakdowns_val = run_inference(deberta_model, deberta_ds_val, batch_size=25)

In [None]:
DF_PATH = "train_features.csv"
def create_features_df(engessay_indices, engessay_breakdowns, deberta_indices, deberta_breakdowns, scores, DF_PATH):
    """Build, save and return a DataFrame of combined model outputs plus scores.

    Args:
        engessay_indices: Sample indices for ``engessay_breakdowns``.
        engessay_breakdowns: Engessay outputs, one row per sample.
        deberta_indices: Sample indices for ``deberta_breakdowns``.
        deberta_breakdowns: DeBERTa outputs, one row per sample.
        scores: NumPy array of scores, one per sample.
        DF_PATH: Destination path of the CSV file (shadows the module-level name).

    Returns:
        DataFrame indexed by sample index with ``deberta_feature_<n>`` columns,
        then the ``engessay_feature_names`` columns, then ``scores``.
    """
    combined_breakdowns, combined_indices = combine_breakdowns(
        deberta_breakdowns, engessay_breakdowns, deberta_indices, engessay_indices
    )

    # Append the scores as one extra trailing column.
    score_column = scores.reshape(-1, 1)
    combined_data = np.concatenate((combined_breakdowns, score_column), axis=1)

    # Convert combined data and indices to DataFrame
    df = pd.DataFrame(combined_data, index=combined_indices)

    deberta_columns = [f"deberta_feature_{i+1}" for i in range(deberta_breakdowns.shape[1])]
    df.columns = deberta_columns + engessay_feature_names + ["scores"]

    # Save DataFrame to CSV
    df.to_csv(DF_PATH)

    return df


In [None]:
# Build the training feature table and write it to train_features.csv.
train_features_df = create_features_df(engessay_indices, engessay_breakdowns, deberta_indices, deberta_breakdowns, scores=train_df["score"].to_numpy(), DF_PATH="train_features.csv")

In [None]:
# Build the validation feature table and write it to val_features.csv.
val_features_df = create_features_df(engessay_indices_val, engessay_breakdowns_val, deberta_indices_val, deberta_breakdowns_val, scores=val_df["score"].to_numpy(), DF_PATH="val_features.csv")

In [None]:
# train_features_df = create_features_df(deberta_model, deberta_ds, engessay_model, engessay_ds, scores=train_df["score"], DF_PATH="train_features.csv")
# val_features_df = create_features_df(deberta_model, deberta_ds_val, engessay_model, engessay_ds_val, scores=val_df["score"], DF_PATH="val_features.csv")

In [None]:
# Define function to create FeaturesDataset from DataFrame
def create_features_dataset_from_df(df):
    """Create a ``FeaturesDataset`` from a feature DataFrame.

    Args:
        df: DataFrame with a ``scores`` column; every other column is a feature.

    Returns:
        ``FeaturesDataset`` built from the feature columns and the scores.
    """
    scores = df["scores"].values

    # Select the features by dropping the label column by name, so the label can
    # never end up among the features if the column order changes.
    combined_breakdowns = df.drop(columns="scores").values

    # Create FeaturesDataset object
    features_dataset = FeaturesDataset(combined_breakdowns, scores)
    return features_dataset


In [None]:
# Preview the first rows of the training feature table.
train_features_df.head()

In [None]:
# Optional: reload the feature tables from the CSV files written earlier
# instead of recomputing them.
# train_features_df = pd.read_csv("train_features.csv", index_col=0)
# val_features_df = pd.read_csv("val_features.csv", index_col=0)

# Create FeaturesDataset objects for train and validation sets
train_features_ds = create_features_dataset_from_df(train_features_df)
val_features_ds = create_features_dataset_from_df(val_features_df)

### Take a look at the features

In [None]:
# Dataset inspected by the cells below: the validation features.
features_dataset = val_features_ds

In [None]:
# Shape of the feature matrix being inspected.
print(features_dataset.breakdown.shape)

In [None]:
# Sanity check: number of items in the Engessay training dataset.
print(len(engessay_ds))

In [None]:
# Sanity check: number of training scores.
print(train_df["score"].shape)

In [None]:
# Peek at the first few training scores.
train_df["score"].head()

In [None]:
# Select a subset of indices to visualize (at most the first 5 samples)
indices_to_visualize = range(min(5, len(features_dataset)))

# Iterate through the subset of indices and print the corresponding data
for idx in indices_to_visualize:
    # Get data from FeaturesDataset
    breakdown, score = features_dataset[idx]

    # Get the full text from val_df. `features_dataset` is the validation
    # feature set, whose rows follow val_df's row order; reading train_df here
    # would pair each feature row with an unrelated essay.
    full_text = val_df.iloc[idx]["full_text"]

    # Print the data side by side
    print(f"Index: {idx}")
    print("Breakdown features:", breakdown)
    print("Score:", score)
    print("Full text:", full_text)
    print("===========================================")

In [None]:

import math

# Feature matrix as a 2-D array so that each feature is a single column slice
feature_matrix = np.asarray(features_dataset.breakdown)

# Define the number of features
num_features = feature_matrix.shape[1]
# The Engessay columns come last; every column before them is a DeBERTa output
num_deberta_features = num_features - len(engessay_feature_names)

# Calculate the number of rows and columns for subplots
num_rows = math.ceil(num_features / 3)
num_cols = min(num_features, 3)

feature_names = [f"deberta_feature_{i+1}" for i in range(num_deberta_features)] + \
                   engessay_feature_names

# Create subplots. squeeze=False keeps `axes` 2-D for every grid size, so the
# same traversal works for one row, one column, or a full grid.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows), squeeze=False)

# axes.flat walks the grid row by row, matching the feature order
for i, ax in enumerate(axes.flat):
    # Hide empty subplots
    if i >= num_features:
        ax.axis('off')
        continue

    # Scatter the i-th feature against the score, read as one column slice
    # instead of iterating over the whole dataset once per feature
    ax.scatter(features_dataset.scores, feature_matrix[:, i], alpha=0.05)
    ax.set_title(f"Correlation between Feature {feature_names[i]} and Score")
    ax.set_ylabel(f"Feature {feature_names[i]} Value")
    ax.set_xlabel("Score")

plt.tight_layout()
plt.show()


In [None]:
# Features and scores of the inspected dataset as NumPy arrays
feature_values = np.array(features_dataset.breakdown)
scores_array = np.array(features_dataset.scores)

# One row per sample: all feature columns followed by the score column
score_column = scores_array[:, np.newaxis]
data_array = np.concatenate((feature_values, score_column), axis=1)

# Correlation matrix over the columns (np.corrcoef expects variables in rows)
correlation_matrix = np.corrcoef(data_array.T)

# Last column without its final entry: correlation of each feature with the score
feature_score_corr = correlation_matrix[:-1, -1]

# Draw the correlations as a single-row heatmap
plt.figure(figsize=(8, 2))
plt.imshow(feature_score_corr[np.newaxis, :], cmap='viridis', aspect='auto')
plt.colorbar(label='Correlation')
plt.title("Correlation between Features and Score")
plt.xlabel("Features")
plt.ylabel("Score")
tick_positions = np.arange(feature_values.shape[1])
plt.xticks(tick_positions, feature_names, rotation=45, ha='right')
plt.yticks([])
plt.show()

# Pair every feature name with its correlation coefficient
correlation_data = [[feature_name, feature_score_corr[pos]] for pos, feature_name in enumerate(feature_names)]

# Print the coefficients as a text table
print("Feature              | Correlation with Score")
print("---------------------------------------------------")
for feature, correlation in correlation_data:
    print(f"{feature:<20} | {correlation:.4f}")