## Sources
- Regressor idea: https://www.kaggle.com/code/tsunotsuno/updated-debertav3-lgbm-with-feature-engineering
### Previous notebook:
- https://www.kaggle.com/code/josemariasabater/commonlit-roberta-base-with-prompts/edit/run/139550119

In [124]:
!pip3 install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

[0mProcessing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
[31mERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: '/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl'
[0m[31m
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

## Imports and Settings

In [125]:
import numpy as np
import pandas as pd
import warnings
import os
import shutil
import logging
import json
import transformers
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
)
from transformers import DataCollatorWithPadding
from datasets import Dataset, load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch

from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from collections import Counter
import spacy
import pandas as pd
from spellchecker import SpellChecker
import re
import string

%load_ext lab_black

# logging settings

# warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# disable_progress_bar()
tqdm.pandas()

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [126]:
# set random seed
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(seed=42)

### Config class

In [127]:
class CFG:
    model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
    # model_name = "/kaggle/input/debertav3base"
    learning_rate = 1.2e-5
    weight_decay = 0.02
    hidden_dropout_prob = 0.1
    attention_probs_dropout_prob = 0.01
    num_train_epochs = 3
    n_splits = 4
    batch_size = 8
    random_seed = 42
    save_steps = 100
    max_length = 512
    use_prompts = False
    warmup_ratio = 0.01

## Load Data

In [128]:
# DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

# prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
# prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
# summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
# summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
# sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

# Local

DATA_DIR = "./data/"

prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

In [191]:
# For testing
random_nr = np.random.randint(0, len(prompts_train))
merged_train = pd.merge(summaries_train, prompts_train, how="left", on="prompt_id")
example1 = merged_train.iloc[random_nr].copy()
example1

student_id                                              005ab0199905
prompt_id                                                     3b9047
text               The highest class was Pharaohs these people we...
content                                                    -0.210614
wording                                                    -0.471415
prompt_question    In complete sentences, summarize the structure...
prompt_title                               Egyptian Social Structure
prompt_text        Egyptian society was structured like a pyramid...
Name: 3, dtype: object

## Preprocessing Class

### Ideas
Overlaps
Quotes
Length of Summary vs Length of text
Grammar mistakes
Repeated vocabulary inside the summary

In [217]:
class Preprocessor:
    def __init__(self, model_name: str) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Spacy NER count
        self.spacy_ner_model = spacy.load(
            "en_core_web_sm",
        )
        self.speller = SpellChecker()
        self.STOP_WORDS = set(stopwords.words("english"))

    def count_text_length(self, df: pd.DataFrame, column: str) -> pd.Series:
        return df[column].progress_apply(lambda x: len(self.tokenizer.encode(x)))

    def non_stop_word_overlap(self, row: pd.Series) -> float:
        """intersection(prompt_text, text) after removing stop words"""

        def check_is_stop_word(word):
            normalized_word = word.lower().strip("▁")
            return (
                normalized_word not in self.STOP_WORDS
                and normalized_word not in string.punctuation
            )

        prompt_words = row["prompt_tokens"]
        summary_words = row["summary_tokens"]
        # Remove stop words
        prompt_words = list(filter(check_is_stop_word, prompt_words))
        summary_words = list(filter(check_is_stop_word, summary_words))

        return len(set(prompt_words).intersection(set(summary_words)))

    def ngrams(self, input_list: list, n: int) -> list[str]:
        """Returns a list of ngrams"""
        ngrams_ = zip(*[input_list[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams_]

    def get_ngram_overlap(self, row: pd.Series, n: int) -> float:
        """Returns the ngram overlap between prompt and summary"""
        summary_ngrams = self.ngrams(row["summary_tokens"], n)
        prompt_ngrams = self.ngrams(row["prompt_tokens"], n)

        return len(set(summary_ngrams).intersection(set(prompt_ngrams)))

    def get_ner_overlap(self, row: pd.Series) -> float:
        """Returns the number of overlapping named entities between prompt and summary"""
        prompt_doc = self.spacy_ner_model(row["prompt_text"])
        summary_doc = self.spacy_ner_model(row["text"])

        prompt_entities = set([ent.text.lower() for ent in prompt_doc.ents])
        summary_entities = set([ent.text.lower() for ent in summary_doc.ents])

        return len(prompt_entities.intersection(summary_entities))

    def get_spelling_error_count(self, row: pd.Series) -> float:
        """Returns the number of spelling errors in the summary"""
        summary_text = row["text"]
        text = "".join(char for char in summary_text if char not in string.punctuation)
        misspelled = self.speller.unknown(text.split())
        return len(misspelled)

    def run(self, prompts: pd.DataFrame, summaries: pd.DataFrame) -> pd.DataFrame:
        # Tokenize

        tqdm.pandas(desc="Tokenizing Prompts")
        prompts["prompt_tokens"] = prompts["prompt_text"].progress_apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x), skip_special_tokens=True
            )
        )
        tqdm.pandas(desc="Tokenizing Summaries")
        summaries["summary_tokens"] = summaries["text"].progress_apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x), skip_special_tokens=True
            )
        )

        merged_df = pd.merge(summaries, prompts, how="left", on="prompt_id")

        # Count text length

        merged_df["prompt_length"] = self.count_text_length(merged_df, "prompt_text")
        merged_df["summary_length"] = self.count_text_length(merged_df, "text")

        # Count non-stop word overlap
        tqdm.pandas(desc="Counting non-stop word overlap")
        merged_df["non_stop_word_overlap"] = merged_df.progress_apply(
            self.non_stop_word_overlap, axis=1
        )

        # Count ngram overlap
        tqdm.pandas(desc="Counting unigram overlap")
        merged_df["unigram_overlap"] = merged_df.progress_apply(
            lambda x: self.get_ngram_overlap(x, 1), axis=1
        )
        tqdm.pandas(desc="Counting bigram overlap")
        merged_df["bigram_overlap"] = merged_df.progress_apply(
            lambda x: self.get_ngram_overlap(x, 2), axis=1
        )
        tqdm.pandas(desc="Counting trigram overlap")
        merged_df["trigram_overlap"] = merged_df.progress_apply(
            lambda x: self.get_ngram_overlap(x, 3), axis=1
        )

        # Count named entity overlap
        tqdm.pandas(desc="Counting named entity overlap")
        merged_df["ner_overlap"] = merged_df.progress_apply(
            self.get_ner_overlap, axis=1
        )

        # Count spelling errors
        tqdm.pandas(desc="Counting spelling errors")
        merged_df["spelling_error_count"] = merged_df.progress_apply(
            self.get_spelling_error_count, axis=1
        )

        # Summary/Prompt token length ratio
        merged_df["token_length_ratio"] = (
            merged_df["summary_length"] / merged_df["prompt_length"]
        )
        return merged_df


Preprocessor = Preprocessor(CFG.model_name)



In [218]:
train = Preprocessor.run(prompts_train, summaries_train)
test = Preprocessor.run(prompts_test, summaries_test)

Tokenizing Prompts: 100%|██████████| 4/4 [00:00<00:00, 59.89it/s]
Tokenizing Summaries: 100%|██████████| 7165/7165 [00:09<00:00, 740.84it/s]
Tokenizing Summaries: 100%|██████████| 7165/7165 [00:20<00:00, 356.87it/s]
Tokenizing Summaries: 100%|██████████| 7165/7165 [00:02<00:00, 2500.07it/s]
Counting non-stop word overlap: 100%|██████████| 7165/7165 [00:03<00:00, 2373.22it/s]
Counting unigram overlap: 100%|██████████| 7165/7165 [00:00<00:00, 8206.76it/s]
Counting bigram overlap: 100%|██████████| 7165/7165 [00:01<00:00, 4380.85it/s]
Counting trigram overlap: 100%|██████████| 7165/7165 [00:01<00:00, 3810.09it/s]
Counting named entity overlap: 100%|██████████| 7165/7165 [10:34<00:00, 11.30it/s]
Counting spelling errors: 100%|██████████| 7165/7165 [00:01<00:00, 3934.71it/s]
Tokenizing Prompts: 100%|██████████| 2/2 [00:00<00:00, 1396.70it/s]
Tokenizing Summaries: 100%|██████████| 4/4 [00:00<00:00, 2217.16it/s]
Tokenizing Summaries: 100%|██████████| 4/4 [00:00<00:00, 2533.17it/s]
Tokenizing S

In [219]:
train

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_tokens,prompt_question,prompt_title,prompt_text,prompt_tokens,prompt_length,summary_length,non_stop_word_overlap,unigram_overlap,bigram_overlap,trigram_overlap,ner_overlap,spelling_error_count,token_length_ratio
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,"[▁The, ▁third, ▁wave, ▁was, ▁an, ▁experiment, ...",Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,"[▁Background, ▁The, ▁Third, ▁Wave, ▁experiment...",671,69,9,26,5,0,2,2,0.102832
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,"[▁They, ▁would, ▁rub, ▁it, ▁up, ▁with, ▁soda, ...",Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...","[▁With, ▁one, ▁member, ▁trimming, ▁beef, ▁in, ...",1137,56,14,34,22,10,0,0,0.049252
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"[▁In, ▁Egypt, ,, ▁there, ▁were, ▁many, ▁occupa...","In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,"[▁Egyptian, ▁society, ▁was, ▁structured, ▁like...",651,285,54,84,56,26,5,3,0.437788
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"[▁The, ▁highest, ▁class, ▁was, ▁Pharaoh, s, ▁t...","In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,"[▁Egyptian, ▁society, ▁was, ▁structured, ▁like...",651,43,10,19,10,6,0,4,0.066052
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,"[▁The, ▁Third, ▁Wave, ▁developed, ▁rapidly, ▁b...",Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,"[▁Background, ▁The, ▁Third, ▁Wave, ▁experiment...",671,253,29,58,27,5,3,11,0.377049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538,"[▁They, ▁used, ▁all, ▁sorts, ▁of, ▁chemical, ▁...",Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...","[▁With, ▁one, ▁member, ▁trimming, ▁beef, ▁in, ...",1137,78,18,37,40,34,0,0,0.068602
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171,"[▁The, ▁lowest, ▁classes, ▁are, ▁slaves, ▁and,...","In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,"[▁Egyptian, ▁society, ▁was, ▁structured, ▁like...",651,56,14,24,6,1,0,2,0.086022
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603,"[▁they, ▁sorta, ▁made, ▁people, ▁start, ▁worki...","In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,"[▁Egyptian, ▁society, ▁was, ▁structured, ▁like...",651,66,12,24,7,1,1,8,0.101382
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128,"[▁An, ▁ideal, ▁trag, ety, ▁has, ▁three, ▁eleme...",Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,"[▁Chapter, ▁13, ▁As, ▁the, ▁sequel, ▁to, ▁what...",721,66,10,24,4,0,0,1,0.091540


### Metrics functions

In [1]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}


def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }


def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred) ** (1 / 2)
    wording_score = mean_squared_error(wording_true, wording_pred) ** (1 / 2)

    return (content_score + wording_score) / 2

## Train LLM

In [None]:
class ContentScoreRegressor:
    def __init__(self,
                model_name:str,
                model_dir:str,
                target:list[str],
                hidden_dropout_prob:float,
                attention_probs_dropout_prob:float,
                max_length: int,
                ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "text"]
        self.input_col = "input"

        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"/kaggle/input/{model_name}")
        self.model_config.update(
        {
            "hidden_dropout_prob": CFG.hidden_dropout_prob,  
            "attention_probs_dropout_prob": CFG.attention_probs_dropout_prob,
            "num_labels": 2,
            "problem_type": "regression",
        })

        seed_everything(seed=42)

    def tokenize_function(self, examples: pd.DataFrame):
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                            padding=False,
                            truncation=True,
                            max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
    
    def train(self,
            fold:int,
            train_df:pd.DataFrame, 
            val_df:pd.DataFrame,
            batch_size:int,
            learning_rate:float,
            weight_decay:float,
            num_train_epochs: float,
            save_steps: int)-> None:
        sep = self.tokenizer.sep_token
        train_df[self.input_col] = (train_df["prompt_title"] + sep +
                                    train_df["prompt_question"] + sep +
                                    train_df["text"])
        val_df[self.input_col] = (val_df["prompt_title"] + sep +
                            val_df["prompt_question"] + sep +
                            val_df["text"])
        
        

In [None]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
example1["summary_tokens"] = tokenizer.convert_ids_to_tokens(
    tokenizer.encode(example1["text"]), skip_special_tokens=True
)
example1["prompt_tokens"] = tokenizer.convert_ids_to_tokens(
    tokenizer.encode(example1["prompt_text"]), skip_special_tokens=True
)


def word_overlap_count(row):
    """intersection(prompt_text, text)"""
    STOP_WORDS = set(stopwords.words("english"))

    def check_is_stop_word(word):
        normalized_word = word.lower().strip("▁")
        return (
            normalized_word not in STOP_WORDS
            and normalized_word not in string.punctuation
        )

    prompt_words = row["prompt_tokens"]
    summary_words = row["summary_tokens"]
    print(f"Raw prompts({len(prompt_words)}):", prompt_words)
    print("#" * 20)
    print(f"Raw Summary({len(summary_words)}):", summary_words)
    prompt_words = list(filter(check_is_stop_word, prompt_words))
    print(f"Filtered prompts ({len(prompt_words)})", prompt_words)
    print("#" * 20)
    summary_words = list(filter(check_is_stop_word, summary_words))
    print(f"Filtered Summary ({len(summary_words)})", summary_words)
    return (
        len(set(prompt_words).intersection(set(summary_words))),
        set(prompt_words).intersection(set(summary_words)),
    )


print("Word Overlap Count:", word_overlap_count(example1))



Raw prompts(649): ['▁Egyptian', '▁society', '▁was', '▁structured', '▁like', '▁a', '▁pyramid', '.', '▁At', '▁the', '▁top', '▁were', '▁the', '▁gods', ',', '▁such', '▁as', '▁Ra', ',', '▁Osiris', ',', '▁and', '▁Isis', '.', '▁Egyptians', '▁believed', '▁that', '▁the', '▁gods', '▁controlled', '▁the', '▁universe', '.', '▁Therefore', ',', '▁it', '▁was', '▁important', '▁to', '▁keep', '▁them', '▁happy', '.', '▁They', '▁could', '▁make', '▁the', '▁Nile', '▁overflow', ',', '▁cause', '▁famine', ',', '▁or', '▁even', '▁bring', '▁death', '.', '▁The', '▁Egyptians', '▁also', '▁elevated', '▁some', '▁human', '▁beings', '▁to', '▁gods', '.', '▁Their', '▁leaders', ',', '▁called', '▁pharaoh', 's', ',', '▁were', '▁believed', '▁to', '▁be', '▁gods', '▁in', '▁human', '▁form', '.', '▁They', '▁had', '▁absolute', '▁power', '▁over', '▁their', '▁subjects', '.', '▁After', '▁pharaoh', 's', '▁died', ',', '▁huge', '▁stone', '▁pyramids', '▁were', '▁built', '▁as', '▁their', '▁tombs', '.', '▁Pharaoh', 's', '▁were', '▁buried', 