In [1]:
!pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"
!pip install /kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl
!pip install /kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

Processing /kaggle/input/autocorrect/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25ldone
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622383 sha256=20aa39cf5c7a42adb1e267f7c4a1a66281f2b7bf06e444ee90ab9d181bb96843
  Stored in directory: /root/.cache/pip/wheels/db/69/42/0fb0421d2fe70d195a04665edc760cfe5fd341d7bb8d8e0aaa
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1
Processing /kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl
Installing collected packages: Pyphen
Successfully installed Pyphen-0.9.3
Processing /kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl
Installing collected packages: textstat
Successfully installed textstat-0.7.0
Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing col

In [2]:
import pandas as pd
import numpy as np
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

import textstat

tokenizers.__version__: 0.13.3
transformers.__version__: 4.30.2
env: TOKENIZERS_PARALLELISM=True
cuda


In [3]:
class CFG1:
    model = "microsoft/deberta-v3-base"
    path = "../input/0911-deberta-v3-base/"
    base = "../input/fb3models/microsoft-deberta-v3-base/"
    config_path = base + "config/config.json"
    tokenizer = AutoTokenizer.from_pretrained(base + 'tokenizer/')
    gradient_checkpointing=False
    batch_size=24
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=10
    trn_fold=list(range(n_fold))
    num_workers=8
    weight = 1.0
    
CFG_list = [CFG1]

In [4]:
def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [5]:
# FROM OLD 
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores

def get_logger(filename='inference'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# ====================================================
# oof
# ====================================================
for CFG in CFG_list:
    oof_df = pd.read_pickle(CFG.path+'oof_df.pkl')
    labels = oof_df[CFG.target_cols].values
    preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Model: {CFG.model} Score: {score:<.4f}  Scores: {scores}')
    
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        #max_length=CFG.max_len,
        #pad_to_max_length=True,
        #truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        return inputs

# new for CL
class CommonlitDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        return inputs
    
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim = 1)
        return max_embeddings
    
class MinPooling(nn.Module):
    def __init__(self):
        super(MinPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = 1e-4
        min_embeddings, _ = torch.min(embeddings, dim = 1)
        return min_embeddings
        

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output
    
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in test_loader:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

Model: microsoft/deberta-v3-base Score: 0.4595  Scores: [0.4933251819697969, 0.4502769020486089, 0.4195917881022107, 0.4616825211894006, 0.479171494341016, 0.4531265111349054]


In [8]:
prompts_train = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv")
prompts_test = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv")

summaries_train = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")
summaries_test = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")

sample_submission = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")

train = summaries_train.merge(prompts_train, how="left", on="prompt_id")
test = summaries_test.merge(prompts_test, how="left", on="prompt_id")

In [6]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/sentence-transformers/minilm-l6-v2/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('/kaggle/input/sentence-transformers/minilm-l6-v2/all-MiniLM-L6-v2')
MEMORY = {}

def get_emb(sentences):
    if sentences in MEMORY:
        return MEMORY[sentences]
    # Tokenize sentences
    encoded_input = tokenizer([sentences], padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)[0].detach().cpu().numpy()
    MEMORY[sentences] = sentence_embeddings
    
    return sentence_embeddings

In [7]:
def get_feedback_feat(data):
    for _idx, CFG in enumerate(CFG_list):
        dataset = CommonlitDataset(CFG, data)
        loader = DataLoader(dataset,
                            batch_size=CFG.batch_size,
                            shuffle=False,
                            collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                            num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

        predictions = []
        for fold in CFG.trn_fold:
            print('='*10, f'started fold {fold}', '='*10)
            model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
            state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                               map_location=torch.device('cpu'))
            model.load_state_dict(state['model'])
            prediction = inference_fn(loader, model, device)
            predictions.append(prediction)
            torch.cuda.empty_cache()
        predictions = np.mean(predictions, axis=0)
        data[CFG.target_cols] = predictions
        torch.cuda.empty_cache() 
        return data

In [11]:
from tqdm.notebook import tqdm
tqdm.pandas()
train_fe = get_feedback_feat(train)



  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
test_fe = get_feedback_feat(test)



  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [15]:
train_fe=train_fe[['student_id', 'prompt_id','cohesion', 'syntax', 'vocabulary', 'phraseology',
       'grammar', 'conventions']]

In [16]:
test_fe=test_fe[['student_id', 'prompt_id','cohesion', 'syntax', 'vocabulary', 'phraseology',
       'grammar', 'conventions']]

In [72]:
train_fe.head()

Unnamed: 0,student_id,prompt_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,000e8c3c7ddb,814d6b,2.630362,2.736776,2.938253,2.911925,2.962243,2.741032
1,0020ae56ffbf,ebad26,2.370722,2.624406,2.75535,2.826561,3.187674,2.88004
2,004e978e639e,3b9047,3.495633,3.489371,3.807075,3.725072,3.68201,3.615526
3,005ab0199905,3b9047,1.858357,1.965965,2.114809,1.965487,2.173315,1.977616
4,0070c9e7af47,814d6b,3.432068,3.570047,3.673658,3.723548,3.788458,3.302863


In [73]:
test_fe

Unnamed: 0,student_id,prompt_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,000000ffffff,abc123,0.665357,0.691386,0.718441,0.959863,1.010057,1.015537
1,111111eeeeee,def789,0.639788,0.672913,0.685465,0.956304,0.984577,1.012808
2,222222cccccc,abc123,0.622517,0.658201,0.657087,0.945119,0.958512,1.008729
3,333333dddddd,def789,0.621774,0.660764,0.648963,0.935018,0.964885,0.99559


In [17]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb

warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

In [18]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=42)

In [19]:
class CFG:
    model_name="debertav3base"
    learning_rate=1.5e-5
    weight_decay=0.02
    hidden_dropout_prob=0.007
    attention_probs_dropout_prob=0.007
    num_train_epochs=5
    n_splits=4
    batch_size=12
    random_seed=42
    save_steps=100
    max_length=512

## Dataload

In [20]:
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

## Preprocess

[Using features]

- Text Length
- Length Ratio
- Word Overlap
- N-grams Co-occurrence
  - count
  - ratio
- Quotes Overlap
- Grammar Check
  - spelling: pyspellchecker


In [21]:
class Preprocessor:
    def __init__(self, 
                model_name: str,
                ) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(word_tokenize(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(word_tokenize(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: word_tokenize(x)
        )
        
        # Add prompt tokens into spelling checker dictionary
        prompts["prompt_tokens"].apply(
            lambda x: self.add_spelling_dictionary(x)
        )
        
#         from IPython.core.debugger import Pdb; Pdb().set_trace()
        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )
        
        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=CFG.model_name)

In [22]:
train = preprocessor.run(prompts_train, summaries_train, mode="train")
test = preprocessor.run(prompts_test, summaries_test, mode="test")

train.head()

100%|██████████| 7165/7165 [10:51<00:00, 11.00it/s]
100%|██████████| 7165/7165 [00:01<00:00, 4901.85it/s]
100%|██████████| 7165/7165 [00:01<00:00, 6464.90it/s]
100%|██████████| 7165/7165 [00:01<00:00, 3768.63it/s]
100%|██████████| 7165/7165 [00:02<00:00, 3122.70it/s]
100%|██████████| 7165/7165 [00:00<00:00, 46627.53it/s]
100%|██████████| 4/4 [00:00<00:00, 8388.61it/s]
100%|██████████| 4/4 [00:00<00:00, 7108.99it/s]
100%|██████████| 4/4 [00:00<00:00, 2091.14it/s]
100%|██████████| 4/4 [00:00<00:00, 2580.32it/s]
100%|██████████| 4/4 [00:00<00:00, 2563.76it/s]
100%|██████████| 4/4 [00:00<00:00, 2679.21it/s]


Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,length_ratio,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,0.09697,14,4,0.063492,0,0.0,0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,0.050186,18,22,0.415094,10,0.192308,0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,0.4304,22,52,0.19403,23,0.086142,2
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,The highest class was Pharaohs these people we...,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,0.0448,6,6,0.222222,5,0.192308,0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,The Third Wave developed rapidly because the ...,29,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,0.351515,23,27,0.116883,5,0.021739,4


In [25]:
gkf = GroupKFold(n_splits=CFG.n_splits)

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,length_ratio,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,0.09697,14,4,0.063492,0,0.0,0,2.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,0.050186,18,22,0.415094,10,0.192308,0,0.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,0.4304,22,52,0.19403,23,0.086142,2,1.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,The highest class was Pharaohs these people we...,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,0.0448,6,6,0.222222,5,0.192308,0,1.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,The Third Wave developed rapidly because the ...,29,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,0.351515,23,27,0.116883,5,0.021739,4,2.0


## Model Function Definition

In [26]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)
    
    return (content_score + wording_score)/2

## Deberta Regressor

In [27]:
class ContentScoreRegressor:
    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"/kaggle/input/{model_name}")
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )


    def tokenize_function(self, examples: pd.DataFrame):
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """fine-tuning"""
        
        sep = self.tokenizer.sep_token
        train_df[self.input_col] = (
                    train_df["prompt_title"] + sep 
                    + train_df["prompt_question"] + sep 
                    + train_df["fixed_summary_text"]
                  )

        valid_df[self.input_col] = (
                    valid_df["prompt_title"] + sep 
                    + valid_df["prompt_question"] + sep 
                    + valid_df["fixed_summary_text"]
                  )
        
        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"/kaggle/input/{self.model_name}", 
            config=self.model_config
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()
        
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """predict content score"""
        
        sep = self.tokenizer.sep_token
        in_text = (
                    test_df["prompt_title"] + sep 
                    + test_df["prompt_question"] + sep 
                    + test_df["fixed_summary_text"]
                  )
        test_df[self.input_col] = in_text

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # e.g. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 

        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [28]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        target:str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: int,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):

    # delete old model files
    if os.path.exists(model_name):
        shutil.rmtree(model_name)
    
    os.mkdir(model_name)
        
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data, 
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """predict oof data"""
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"
        
        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=valid_data, 
            fold=fold
        )
        
        train_df.loc[valid_data.index, f"{target}_pred"] = pred

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """predict using mean folds"""

    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=test_df, 
            fold=fold
        )
        
        test_df[f"{target}_pred_{fold}"] = pred
    
    test_df[f"{target}_pred"] = test_df[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

    return test_df

In [29]:
for target in ["content", "wording"]:
    train_by_fold(
        train,
        model_name=CFG.model_name,
        save_each_model=False,
        target=target,
        learning_rate=CFG.learning_rate,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        weight_decay=CFG.weight_decay,
        num_train_epochs=CFG.num_train_epochs,
        n_splits=CFG.n_splits,
        batch_size=CFG.batch_size,
        save_steps=CFG.save_steps,
        max_length=CFG.max_length
    )
    
    
    train = validate(
        train,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

    rmse = mean_squared_error(train[target], train[f"{target}_pred"], squared=False)
    print(f"cv {target} rmse: {rmse}")

    test = predict(
        test,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

fold 0:


Step,Training Loss,Validation Loss


fold 1:


Step,Training Loss,Validation Loss


fold 2:


Step,Training Loss,Validation Loss


fold 3:


Step,Training Loss,Validation Loss


fold 0:


fold 1:


fold 2:


fold 3:


cv content rmse: 1.4755575641750034
fold 0:


fold 1:


fold 2:


fold 3:


fold 0:


Step,Training Loss,Validation Loss


fold 1:


Step,Training Loss,Validation Loss


fold 2:


Step,Training Loss,Validation Loss


fold 3:


Step,Training Loss,Validation Loss


fold 0:


fold 1:


fold 2:


fold 3:


cv wording rmse: 1.6674877126454308
fold 0:


fold 1:


fold 2:


fold 3:


In [30]:
train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,...,length_ratio,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,fold,content_pred,wording_pred
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,...,0.09697,14,4,0.063492,0,0.0,0,2.0,0.06269,0.1713
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,...,0.050186,18,22,0.415094,10,0.192308,0,0.0,0.193532,0.198878
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,...,0.4304,22,52,0.19403,23,0.086142,2,1.0,0.177838,0.225358
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,The highest class was Pharaohs these people we...,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,...,0.0448,6,6,0.222222,5,0.192308,0,1.0,0.069185,0.160386
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,The Third Wave developed rapidly because the ...,29,Summarize how the Third Wave developed over su...,The Third Wave,...,0.351515,23,27,0.116883,5,0.021739,4,2.0,0.146355,0.224956


In [31]:
test

Unnamed: 0,student_id,prompt_id,text,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,...,content_pred_0,content_pred_1,content_pred_2,content_pred_3,content_pred,wording_pred_0,wording_pred_1,wording_pred_2,wording_pred_3,wording_pred
0,000000ffffff,abc123,Example text 1,3,Example text 1,0,Summarize...,Example Title 1,Heading\nText...,3,...,0.173345,0.068001,0.028422,0.171641,0.110352,0.181412,0.153289,0.143878,0.187127,0.166426
1,111111eeeeee,def789,Example text 2,3,Example text 2,0,Summarize...,Example Title 2,Heading\nText...,3,...,0.17198,0.064784,0.024393,0.170073,0.107807,0.180229,0.151679,0.141662,0.186066,0.164909
2,222222cccccc,abc123,Example text 3,3,Example text 3,0,Summarize...,Example Title 1,Heading\nText...,3,...,0.17654,0.075645,0.037729,0.174591,0.116126,0.184033,0.157132,0.148834,0.189274,0.169818
3,333333dddddd,def789,Example text 4,3,Example text 4,0,Summarize...,Example Title 2,Heading\nText...,3,...,0.175369,0.072208,0.03344,0.173259,0.113569,0.183079,0.155678,0.14679,0.188464,0.168503


In [32]:
train=train.merge(train_fe,how='inner',on=['student_id', 'prompt_id'])

In [33]:
test=test.merge(test_fe,how='inner',on=['student_id', 'prompt_id'])

In [34]:
from sklearn.metrics import r2_score
import os
import glob
from tqdm import tqdm
from joblib import Parallel, delayed

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import optuna
from optuna.samplers import TPESampler
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


import string
import pickle
import operator
from textblob import TextBlob

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from tqdm import tqdm
tqdm.pandas()

In [35]:
# Load model from HuggingFace Hub
tokenizer_sb = AutoTokenizer.from_pretrained('/kaggle/input/sentence-transformers/sentence-transformers/minil2-l12-v2/paraphrase-multilingual-MiniLM-L12-v2')
model_sb = AutoModel.from_pretrained('/kaggle/input/sentence-transformers/sentence-transformers/minil2-l12-v2/paraphrase-multilingual-MiniLM-L12-v2').cuda()
MEMORY_SB = {}

tokenizer_qa = AutoTokenizer.from_pretrained('/kaggle/input/multi-qa-mpnet-base-dot-v1/multi-qa-mpnet-base-dot-v1')
model_qa = AutoModel.from_pretrained('/kaggle/input/multi-qa-mpnet-base-dot-v1/multi-qa-mpnet-base-dot-v1').cuda()
MEMORY_QA = {}

tokenizer_ch = AutoTokenizer.from_pretrained('/kaggle/input/bert-cohesion/bert_cohesion')
model_ch = AutoModelForSequenceClassification.from_pretrained('/kaggle/input/bert-cohesion/bert_cohesion', num_labels=9).cuda()

tokenizer_rw = AutoTokenizer.from_pretrained('/kaggle/input/reward-model-deberta-v3-large-v2-wch/reward-model-deberta-v3-large-v2-wch')
model_rw = AutoModelForSequenceClassification.from_pretrained('/kaggle/input/reward-model-deberta-v3-large-v2-wch/reward-model-deberta-v3-large-v2-wch',
                                                              num_labels=1).cuda()

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def cls_pooling(model_output):
    return model_output.last_hidden_state[:,0]

def get_emb_sb(sentences):
    if sentences in MEMORY_SB:
        return MEMORY_SB[sentences]
    # Tokenize sentences
    encoded_input = tokenizer_sb([sentences], padding=True, truncation=True, return_tensors='pt')
    encoded_input = {k:v.cuda() for k, v in encoded_input.items()}

    # Compute token embeddings
    with torch.no_grad():
        model_output = model_sb(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)[0].detach().cpu().numpy()
    MEMORY_SB[sentences] = sentence_embeddings
    
    return sentence_embeddings

def get_emb_qa(sentences):
    if sentences in MEMORY_QA:
        return MEMORY_QA[sentences]
    # Tokenize sentences
    encoded_input = tokenizer_qa([sentences], padding=True, truncation=True, return_tensors='pt')
    encoded_input = {k:v.cuda() for k, v in encoded_input.items()}
    
    # Compute token embeddings
    with torch.no_grad():
        model_output = model_qa(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)[0].detach().cpu().numpy()
    MEMORY_QA[sentences] = sentence_embeddings
    
    return sentence_embeddings

def get_ch_label(sentences):
    encoded_input = tokenizer_ch([sentences], padding=True, truncation=True, return_tensors='pt')
    encoded_input = {k:v.cuda() for k, v in encoded_input.items()}
    
    with torch.no_grad():
        model_output = model_ch(**encoded_input)["logits"]
    
#     print(model_output)
    label = torch.argmax(model_output, dim=1)[0].detach().cpu().numpy()
    
    return label

def get_rw(question, answer):
    encoded_input = tokenizer_rw(question, answer, padding=True, truncation=True, return_tensors='pt')
    encoded_input = {k:v.cuda() for k, v in encoded_input.items()}
    
    with torch.no_grad():
        model_output = model_rw(**encoded_input)["logits"]
    
    return torch.sigmoid(model_output)[0].cpu().detach().numpy()[0]

In [36]:
def pos_count(sent):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    jj_count = 0   #Adjective
    uh_count = 0   #Interjection
    cd_count = 0   #Numerics
    
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)

    for token in sent:
        if token[1] in ['NN','NNP','NNS']:
            nn_count += 1

        if token[1] in ['PRP','PRP$']:
            pr_count += 1

        if token[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']:
            vb_count += 1

        if token[1] in ['JJ','JJR','JJS']:
            jj_count += 1

        if token[1] in ['UH']:
            uh_count += 1

        if token[1] in ['CD']:
            cd_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, jj_count, uh_count, cd_count])

In [37]:
train["num_words"] = train["text"].progress_apply(lambda x: len(str(x).split()))
train["num_unique_words"] = train["text"].progress_apply(lambda x: len(set(str(x).split())))
train["num_chars"] = train["text"].progress_apply(lambda x: len(str(x)))
train["num_stopwords"] = train["text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
train["num_punctuations"] =train['text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
train["num_words_upper"] = train["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
train["num_words_title"] = train["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
train["mean_word_len"] = train["text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
train["mean_num_unique_words"] = train["text"].progress_apply(lambda x: np.mean([len(set(e.split())) for e in x.split('.')]))
train["num_paragraphs"] = train["text"].progress_apply(lambda x: len(x.split('\n\n')))
train["num_slash"] = train["text"].progress_apply(lambda x: len(x.split('\n')))
train["syntax_count"] = train["text"].progress_apply(lambda x: x.count(",") + x.count("-") + x.count(";") + x.count(":"))
train["num_sentences"] = train["text"].progress_apply(lambda x: len(str(x).split('.')))
train["polarity"] = train['text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
train["subjectivity"] = train['text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
train[['nn_count','pr_count','vb_count','jj_count','uh_count','cd_count']] = train['text'].progress_apply(pos_count)

100%|██████████| 10/10 [00:00<00:00, 3718.68it/s]
100%|██████████| 10/10 [00:00<00:00, 8395.32it/s]
100%|██████████| 10/10 [00:00<00:00, 15454.33it/s]
100%|██████████| 10/10 [00:00<00:00, 103.06it/s]
100%|██████████| 10/10 [00:00<00:00, 1049.34it/s]
100%|██████████| 10/10 [00:00<00:00, 14378.83it/s]
100%|██████████| 10/10 [00:00<00:00, 12387.19it/s]
100%|██████████| 10/10 [00:00<00:00, 4546.67it/s]
100%|██████████| 10/10 [00:00<00:00, 9139.91it/s]
100%|██████████| 10/10 [00:00<00:00, 16630.86it/s]
100%|██████████| 10/10 [00:00<00:00, 21777.28it/s]
100%|██████████| 10/10 [00:00<00:00, 21454.24it/s]
100%|██████████| 10/10 [00:00<00:00, 22550.02it/s]
100%|██████████| 10/10 [00:00<00:00, 158.36it/s]
100%|██████████| 10/10 [00:00<00:00, 1182.49it/s]
100%|██████████| 10/10 [00:00<00:00, 60.38it/s]


In [38]:
test["num_words"] = test["text"].progress_apply(lambda x: len(str(x).split()))
test["num_unique_words"] = test["text"].progress_apply(lambda x: len(set(str(x).split())))
test["num_chars"] = test["text"].progress_apply(lambda x: len(str(x)))
test["num_stopwords"] = test["text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
test["num_punctuations"] =test['text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
test["num_words_upper"] = test["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test["num_words_title"] = test["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test["mean_word_len"] = test["text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test["mean_num_unique_words"] = test["text"].progress_apply(lambda x: np.mean([len(set(e.split())) for e in x.split('.')]))
test["num_paragraphs"] = test["text"].progress_apply(lambda x: len(x.split('\n\n')))
test["num_slash"] = test["text"].progress_apply(lambda x: len(x.split('\n')))
test["syntax_count"] = test["text"].progress_apply(lambda x: x.count(",") + x.count("-") + x.count(";") + x.count(":"))
test["num_sentences"] = test["text"].progress_apply(lambda x: len(str(x).split('.')))
test["polarity"] = test['text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
test["subjectivity"] = test['text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
test[['nn_count','pr_count','vb_count','jj_count','uh_count','cd_count']] = test['text'].progress_apply(pos_count)

100%|██████████| 4/4 [00:00<00:00, 3581.81it/s]
100%|██████████| 4/4 [00:00<00:00, 5384.22it/s]
100%|██████████| 4/4 [00:00<00:00, 6589.64it/s]
100%|██████████| 4/4 [00:00<00:00, 1231.17it/s]
100%|██████████| 4/4 [00:00<00:00, 5491.72it/s]
100%|██████████| 4/4 [00:00<00:00, 6932.73it/s]
100%|██████████| 4/4 [00:00<00:00, 6904.20it/s]
100%|██████████| 4/4 [00:00<00:00, 5399.81it/s]
100%|██████████| 4/4 [00:00<00:00, 7136.20it/s]
100%|██████████| 4/4 [00:00<00:00, 9759.87it/s]
100%|██████████| 4/4 [00:00<00:00, 10230.01it/s]
100%|██████████| 4/4 [00:00<00:00, 9686.61it/s]
100%|██████████| 4/4 [00:00<00:00, 9608.94it/s]
100%|██████████| 4/4 [00:00<00:00, 2107.43it/s]
100%|██████████| 4/4 [00:00<00:00, 2701.21it/s]
100%|██████████| 4/4 [00:00<00:00, 767.91it/s]


In [39]:
def get_stat_features(df, text_col="text"):
    df['diff_emb_sb'] = df.progress_apply(lambda x: np.sum(get_emb_sb(x["text"])*get_emb_sb(x["prompt_text"])), axis=1)
    df['diff_emb_qa'] = df.progress_apply(lambda x: np.sum(get_emb_qa(x["text"])*get_emb_qa(x["prompt_question"])), axis=1)
    df['ch'] = df.progress_apply(lambda x: get_ch_label(x["text"]), axis=1)
    df['rw'] = df.progress_apply(lambda x: get_rw(x["prompt_question"], x["text"]), axis=1)

    df['automated_readability_index'] = df[text_col].progress_apply(lambda x: textstat.automated_readability_index(x))
    df['coleman_liau_index'] = df[text_col].progress_apply(lambda x: textstat.coleman_liau_index(x))
    df['smog_index'] = df[text_col].progress_apply(lambda x: textstat.smog_index(x))
    df['flesch_reading_ease'] = df[text_col].progress_apply(lambda x: textstat.flesch_reading_ease(x))
    df['flesch_kincaid_grade'] = df[text_col].progress_apply(lambda x: textstat.flesch_kincaid_grade(x))
    df['dale_chall_readability_score'] = df[text_col].progress_apply(lambda x: textstat.dale_chall_readability_score(x))
    df['linsear_write_formula'] = df[text_col].progress_apply(lambda x: textstat.linsear_write_formula(x))
    df['gunning_fog'] = df[text_col].progress_apply(lambda x: textstat.gunning_fog(x))
    df['text_standard_float'] = df[text_col].progress_apply(lambda x: textstat.text_standard(x, float_output=True))
    df['spache_readability'] = df[text_col].progress_apply(lambda x: textstat.spache_readability(x))
    df['rix'] = df[text_col].progress_apply(lambda x: textstat.rix(x))
    df['lix'] = df[text_col].progress_apply(lambda x: textstat.lix(x))

    return df

In [40]:
train= get_stat_features(train)

100%|██████████| 10/10 [00:00<00:00, 46.66it/s]
100%|██████████| 10/10 [00:00<00:00, 47.14it/s]
100%|██████████| 10/10 [00:00<00:00, 82.60it/s]
100%|██████████| 10/10 [00:00<00:00, 21.76it/s]
100%|██████████| 10/10 [00:00<00:00, 2661.19it/s]
100%|██████████| 10/10 [00:00<00:00, 6903.07it/s]
100%|██████████| 10/10 [00:00<00:00, 75.94it/s]
100%|██████████| 10/10 [00:00<00:00, 1428.97it/s]
100%|██████████| 10/10 [00:00<00:00, 12018.06it/s]
100%|██████████| 10/10 [00:00<00:00, 18.51it/s]
100%|██████████| 10/10 [00:00<00:00, 2201.39it/s]
100%|██████████| 10/10 [00:00<00:00, 17.71it/s]
100%|██████████| 10/10 [00:00<00:00, 9337.28it/s]
100%|██████████| 10/10 [00:00<00:00, 15103.72it/s]
100%|██████████| 10/10 [00:00<00:00, 11235.75it/s]
100%|██████████| 10/10 [00:00<00:00, 11146.17it/s]


In [41]:
test= get_stat_features(test)

100%|██████████| 4/4 [00:00<00:00, 78.12it/s]
100%|██████████| 4/4 [00:00<00:00, 85.41it/s]
100%|██████████| 4/4 [00:00<00:00, 103.46it/s]
100%|██████████| 4/4 [00:00<00:00, 26.85it/s]
100%|██████████| 4/4 [00:00<00:00, 4843.31it/s]
100%|██████████| 4/4 [00:00<00:00, 5581.24it/s]
100%|██████████| 4/4 [00:00<00:00, 9399.00it/s]
100%|██████████| 4/4 [00:00<00:00, 5473.81it/s]
100%|██████████| 4/4 [00:00<00:00, 8751.81it/s]
100%|██████████| 4/4 [00:00<00:00, 373.47it/s]
100%|██████████| 4/4 [00:00<00:00, 4556.55it/s]
100%|██████████| 4/4 [00:00<00:00, 517.37it/s]
100%|██████████| 4/4 [00:00<00:00, 5221.67it/s]
100%|██████████| 4/4 [00:00<00:00, 6408.41it/s]
100%|██████████| 4/4 [00:00<00:00, 9399.00it/s]
100%|██████████| 4/4 [00:00<00:00, 9581.51it/s]


In [42]:
import nltk
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter
from nltk import ne_chunk, word_tokenize, pos_tag
import dask.dataframe as dd
from dask.multiprocessing import get

In [43]:
# Additional features
train['comma_count'] = train['text'].apply(lambda x: x.count(','))
train['semicolon_count'] = train['text'].apply(lambda x: x.count(';'))
train['exclamation_count'] = train['text'].apply(lambda x: x.count('!'))
train['question_count'] = train['text'].apply(lambda x: x.count('?'))

def calculate_pos_ratios(text):
    pos_tags = pos_tag(nltk.word_tokenize(text))
    pos_counts = Counter(tag for word, tag in pos_tags)
    total_words = len(pos_tags)
    ratios = {tag: count / total_words for tag, count in pos_counts.items()}
    return ratios
# Calculate POS ratios for each row
train['pos_ratios'] = train['text'].apply(calculate_pos_ratios)

# Convert the dictionary of POS ratios into a single value (mean)
train['pos_mean'] = train['pos_ratios'].apply(lambda x: np.mean(list(x.values())))

sid = SentimentIntensityAnalyzer()
def calculate_sentiment_scores(text):
    sentiment_scores = sid.polarity_scores(text)
    return sentiment_scores
 
#Calculate sentiment scores for each row
train['sentiment_scores'] = train['text'].apply(calculate_sentiment_scores)

# Convert sentiment_scores into individual columns
sentiment_columns = pd.DataFrame(list(train['sentiment_scores']))
train = pd.concat([train, sentiment_columns], axis=1)

def calculate_punctuation_ratios(text):
    total_chars = len(text)
    punctuation_counts = Counter(char for char in text if char in '.,!?;:"()[]{}')
    ratios = {char: count / total_chars for char, count in punctuation_counts.items()}
    return ratios

train['punctuation_ratios'] = train['text'].apply(calculate_punctuation_ratios)

# Convert the dictionary of punctuation ratios into a single value (sum)
train['punctuation_sum'] = train['punctuation_ratios'].apply(lambda x: np.sum(list(x.values())))

# keyword density
def calculate_keyword_density(row):
    keywords = set(row['prompt_text'].split())
    text_words = row['text'].split()
    keyword_count = sum(1 for word in text_words if word in keywords)
    return keyword_count / len(text_words)

train['keyword_density'] = train.apply(calculate_keyword_density, axis=1)

# Function to calculate sentiment scores for prompt text
sid = SentimentIntensityAnalyzer()
def calculate_sentiment_scores_prompt(text):
    sentiment_scores = sid.polarity_scores(text)
    return sentiment_scores

# Calculate sentiment scores for prompt text
train['sentiment_scores_prompt'] = train['prompt_text'].apply(calculate_sentiment_scores_prompt)

# Convert sentiment_scores_prompt into individual columns
sentiment_columns_prompt = pd.DataFrame(list(train['sentiment_scores_prompt']))
sentiment_columns_prompt.columns = [col +'_prompt' for col in sentiment_columns_prompt.columns]

train = pd.concat([train, sentiment_columns_prompt], axis=1)

# Jaccard similarity between prompt text and text
train['jaccard_similarity'] = train.apply(lambda row: len(set(word_tokenize(row['prompt_text'])) & set(word_tokenize(row['text']))) / len(set(word_tokenize(row['prompt_text'])) | set(word_tokenize(row['text']))), axis=1)

In [44]:
drop_col=['pos_ratios','sentiment_scores','punctuation_ratios','sentiment_scores_prompt']

In [45]:
train.drop(drop_col, axis=1, inplace=True)

In [46]:
test['comma_count'] = test['text'].apply(lambda x: x.count(','))
test['semicolon_count'] = test['text'].apply(lambda x: x.count(';'))
test['exclamation_count'] = test['text'].apply(lambda x: x.count('!'))
test['question_count'] = test['text'].apply(lambda x: x.count('?'))
# Calculate POS ratios for each row
test['pos_ratios'] = test['text'].apply(calculate_pos_ratios)

# Convert the dictionary of POS ratios into a single value (mean)
test['pos_mean'] = test['pos_ratios'].apply(lambda x: np.mean(list(x.values())))
 
#Calculate sentiment scores for each row
test['sentiment_scores'] = test['text'].apply(calculate_sentiment_scores)

# Convert sentiment_scores into individual columns
sentiment_columns = pd.DataFrame(list(test['sentiment_scores']))
test= pd.concat([test, sentiment_columns], axis=1)

test['punctuation_ratios'] = test['text'].apply(calculate_punctuation_ratios)

# Convert the dictionary of punctuation ratios into a single value (sum)
test['punctuation_sum'] = test['punctuation_ratios'].apply(lambda x: np.sum(list(x.values())))

test['keyword_density'] = test.apply(calculate_keyword_density, axis=1)


# Calculate sentiment scores for prompt text
test['sentiment_scores_prompt'] = test['prompt_text'].apply(calculate_sentiment_scores_prompt)

# Convert sentiment_scores_prompt into individual columns
sentiment_columns_prompt = pd.DataFrame(list(test['sentiment_scores_prompt']))
sentiment_columns_prompt.columns = [col +'_prompt' for col in sentiment_columns_prompt.columns]

test = pd.concat([test, sentiment_columns_prompt], axis=1)

# Jaccard similarity between prompt text and text
test['jaccard_similarity'] = test.apply(lambda row: len(set(word_tokenize(row['prompt_text'])) & set(word_tokenize(row['text']))) / len(set(word_tokenize(row['prompt_text'])) | set(word_tokenize(row['text']))), axis=1)

In [47]:
test.drop(drop_col, axis=1, inplace=True)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob

In [49]:
class Preprocessor:
    def __init__(self, 
                model_name: str,
                ) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
    
        
    def calculate_text_similarity(self, row):
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([row['prompt_text'], row['text']])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2]).flatten()[0]
    
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess

                # before merge preprocess
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: word_tokenize(x)
        )
        
        input_df = summaries.merge(prompts, how="left", on="prompt_id")
        tqdm.pandas(desc="Calculating Text Similarity")
        input_df['text_similarity'] = input_df.progress_apply(self.calculate_text_similarity, axis=1)

        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=CFG.model_name)

In [50]:
train_2 = preprocessor.run(prompts_train, summaries_train, mode="train")
test_2 = preprocessor.run(prompts_test, summaries_test, mode="test")

Calculating Text Similarity: 100%|██████████| 7165/7165 [00:33<00:00, 210.89it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 260.17it/s]


In [51]:
train_2=train_2[['student_id', 'prompt_id','text_similarity']]
test_2=test_2[['student_id', 'prompt_id','text_similarity']]

In [52]:
train=train.merge(train_2,how='inner',on=['student_id', 'prompt_id'])
test=test.merge(test_2,how='inner',on=['student_id', 'prompt_id'])

In [53]:
train.columns

Index(['student_id', 'prompt_id', 'text', 'content', 'wording',
       'summary_length', 'fixed_summary_text', 'splling_err_num',
       'prompt_question', 'prompt_title', 'prompt_text', 'prompt_length',
       'length_ratio', 'word_overlap_count', 'bigram_overlap_count',
       'bigram_overlap_ratio', 'trigram_overlap_count',
       'trigram_overlap_ratio', 'quotes_count', 'fold', 'content_pred',
       'wording_pred', 'cohesion', 'syntax', 'vocabulary', 'phraseology',
       'grammar', 'conventions', 'num_words', 'num_unique_words', 'num_chars',
       'num_stopwords', 'num_punctuations', 'num_words_upper',
       'num_words_title', 'mean_word_len', 'mean_num_unique_words',
       'num_paragraphs', 'num_slash', 'syntax_count', 'num_sentences',
       'polarity', 'subjectivity', 'nn_count', 'pr_count', 'vb_count',
       'jj_count', 'uh_count', 'cd_count', 'diff_emb_sb', 'diff_emb_qa', 'ch',
       'rw', 'automated_readability_index', 'coleman_liau_index', 'smog_index',
       'flesch

In [54]:
test.columns

Index(['student_id', 'prompt_id', 'text', 'summary_length',
       'fixed_summary_text', 'splling_err_num', 'prompt_question',
       'prompt_title', 'prompt_text', 'prompt_length', 'length_ratio',
       'word_overlap_count', 'bigram_overlap_count', 'bigram_overlap_ratio',
       'trigram_overlap_count', 'trigram_overlap_ratio', 'quotes_count',
       'input', 'content_pred_0', 'content_pred_1', 'content_pred_2',
       'content_pred_3', 'content_pred', 'wording_pred_0', 'wording_pred_1',
       'wording_pred_2', 'wording_pred_3', 'wording_pred', 'cohesion',
       'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions',
       'num_words', 'num_unique_words', 'num_chars', 'num_stopwords',
       'num_punctuations', 'num_words_upper', 'num_words_title',
       'mean_word_len', 'mean_num_unique_words', 'num_paragraphs', 'num_slash',
       'syntax_count', 'num_sentences', 'polarity', 'subjectivity', 'nn_count',
       'pr_count', 'vb_count', 'jj_count', 'uh_count', 'cd_count',


In [59]:
train['ch']=train['ch'].astype(float)
test['ch']=test['ch'].astype(float)

In [55]:
no_features = ["student_id", "prompt_id", "prompt_question", "prompt_title", "prompt_text",'fold','fixed_summary_text','text']
target = ["content", "wording"]
fe_columns = [col for col in train.columns if col not in no_features + target]

corr = train[fe_columns + target].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,summary_length,splling_err_num,prompt_length,length_ratio,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,content_pred,wording_pred,cohesion,syntax,vocabulary,phraseology,grammar,conventions,num_words,num_unique_words,num_chars,num_stopwords,num_punctuations,num_words_upper,num_words_title,mean_word_len,mean_num_unique_words,num_paragraphs,num_slash,syntax_count,num_sentences,polarity,subjectivity,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count,diff_emb_sb,diff_emb_qa,rw,automated_readability_index,coleman_liau_index,smog_index,flesch_reading_ease,flesch_kincaid_grade,dale_chall_readability_score,linsear_write_formula,gunning_fog,text_standard_float,spache_readability,rix,lix,comma_count,semicolon_count,exclamation_count,question_count,pos_mean,neg,neu,pos,compound,punctuation_sum,keyword_density,neg_prompt,neu_prompt,pos_prompt,compound_prompt,jaccard_similarity,text_similarity,content,wording
summary_length,1.0,0.981367,-0.373602,0.994705,0.834953,0.870451,-0.055059,0.663009,-0.156139,0.880832,0.11341,0.750099,0.912102,0.866014,0.88769,0.837851,0.720664,0.704062,0.999441,0.995783,0.998757,0.989338,0.97039,-0.038518,0.933312,0.23558,0.554148,,,0.839667,0.962358,0.203276,-0.113986,0.995246,0.968028,0.98132,0.655925,,0.686715,0.873574,-0.142084,0.13117,-0.295351,0.201005,0.648199,0.208904,-0.37822,0.00538,-0.459308,-0.383618,-0.150878,-0.345779,-0.162837,-0.202187,0.862986,,0.576137,,-0.775448,-0.244216,0.031016,0.308244,0.703836,0.071418,-0.276362,-0.397349,0.24792,0.102306,0.537933,0.939102,0.701964,0.963152,0.964541
splling_err_num,0.981367,1.0,-0.421844,0.98145,0.781961,0.835557,-0.078483,0.6452,-0.117438,0.903333,0.069182,0.719612,0.838365,0.786256,0.81158,0.750173,0.613031,0.584047,0.975405,0.976379,0.982341,0.957741,0.992071,-0.078376,0.945156,0.360896,0.408385,,,0.842916,0.976076,0.253032,-0.192839,0.991831,0.910606,0.965114,0.570836,,0.7311,0.918903,-0.214423,0.169248,-0.16741,0.327095,0.613383,0.066679,-0.262436,0.151423,-0.371603,-0.282449,-0.06227,-0.254293,-0.025239,-0.068189,0.868539,,0.611333,,-0.77685,-0.245252,0.043976,0.278701,0.688792,0.248288,-0.330342,-0.399546,0.235359,0.137535,0.575271,0.899815,0.647325,0.948816,0.931841
prompt_length,-0.373602,-0.421844,1.0,-0.46163,-0.125687,-0.151821,0.384701,-0.003927,0.345527,-0.322865,0.611272,0.144976,-0.2543,-0.177528,-0.157186,-0.066572,0.102077,0.041981,-0.371078,-0.372381,-0.397531,-0.364708,-0.422501,-0.269773,-0.422921,-0.673533,0.187099,,,-0.315883,-0.506716,-0.215537,0.376735,-0.36706,-0.304417,-0.455601,-0.569641,,-0.391714,-0.580749,0.285522,-0.38764,0.246572,-0.630671,-0.583353,0.401435,0.139668,-0.516869,0.484757,0.342697,-0.600388,0.340837,-0.318261,-0.051346,-0.292534,,-0.209671,,0.171648,-0.23815,0.368826,-0.505587,-0.155399,-0.388949,0.411828,0.01096,0.318689,-0.812763,-0.73513,-0.417201,0.316283,-0.41852,-0.332565
length_ratio,0.994705,0.98145,-0.46163,1.0,0.802317,0.852858,-0.09733,0.646371,-0.185444,0.862056,0.045389,0.696696,0.897106,0.842842,0.865188,0.806413,0.674015,0.671261,0.993863,0.989162,0.995663,0.981092,0.973742,-0.002401,0.926878,0.300068,0.510719,,,0.850897,0.968792,0.227645,-0.143642,0.989182,0.959018,0.987001,0.697875,,0.675814,0.901131,-0.166042,0.181494,-0.305096,0.263687,0.678999,0.158162,-0.376306,0.06117,-0.481536,-0.405869,-0.082022,-0.365323,-0.118062,-0.185759,0.868844,,0.547665,,-0.758004,-0.207947,-0.007618,0.343455,0.686714,0.111575,-0.31153,-0.379622,0.204612,0.177975,0.591449,0.944881,0.633711,0.961887,0.958895
word_overlap_count,0.834953,0.781961,-0.125687,0.802317,1.0,0.827051,0.270879,0.61732,0.011295,0.756356,0.228584,0.758881,0.774933,0.76272,0.765079,0.7491,0.701345,0.628489,0.840103,0.854381,0.821282,0.874803,0.747174,-0.055244,0.78177,-0.209073,0.694569,,,0.618477,0.745883,-0.105197,0.1013,0.809533,0.864739,0.805318,0.404662,,0.605359,0.641269,-0.088744,-0.135682,-0.343387,-0.262507,0.575757,0.333097,-0.324935,-0.115829,-0.363396,-0.245229,-0.431293,-0.215544,-0.512824,-0.407961,0.648102,,0.558577,,-0.841705,-0.267639,0.030943,0.345193,0.72565,-0.160205,-0.035851,-0.3587,0.274066,-0.032697,0.319984,0.83839,0.812935,0.72558,0.787635
bigram_overlap_count,0.870451,0.835557,-0.151821,0.852858,0.827051,1.0,0.382044,0.929512,0.25145,0.648955,0.333961,0.767153,0.709647,0.661211,0.702303,0.649423,0.59192,0.61873,0.869074,0.843515,0.846738,0.869254,0.857624,-0.175388,0.690621,-0.096607,0.562238,,,0.872323,0.769063,0.038864,-0.025149,0.859586,0.903202,0.806943,0.542278,,0.443513,0.828384,-0.287028,0.063275,-0.161572,-0.127818,0.348437,0.333393,-0.260321,-0.140116,-0.195636,-0.193278,-0.396935,-0.133809,-0.21515,-0.17242,0.891543,,0.274324,,-0.652501,-0.222088,0.036601,0.260679,0.602886,0.072671,0.141689,-0.256211,0.208335,-0.054645,0.293579,0.933653,0.801247,0.750339,0.834164
bigram_overlap_ratio,-0.055059,-0.078483,0.384701,-0.09733,0.270879,0.382044,1.0,0.533,0.898526,-0.109511,0.352236,0.17409,-0.242958,-0.251805,-0.248835,-0.244563,-0.129626,-0.146598,-0.054252,-0.083806,-0.093292,0.005079,-0.059113,-0.241244,-0.171705,-0.65766,0.059629,,,0.006639,-0.115515,-0.477979,-0.028068,-0.064815,0.050462,-0.1678,-0.24482,,-0.103122,0.049345,-0.391872,-0.435096,0.213856,-0.662087,-0.407093,0.337223,0.153846,-0.338096,0.388237,0.339187,-0.587656,0.36484,-0.144418,0.053892,0.027837,,-0.152676,,0.051556,-0.197331,0.201544,-0.169947,0.043348,-0.001757,0.851304,0.036832,0.114889,-0.352514,-0.290955,0.103076,0.435719,-0.193322,-0.09953
trigram_overlap_count,0.663009,0.6452,-0.003927,0.646371,0.61732,0.929512,0.533,1.0,0.487829,0.381392,0.373934,0.621023,0.447688,0.385584,0.457168,0.396628,0.350283,0.419236,0.658085,0.621544,0.630184,0.64048,0.693393,-0.271345,0.417011,-0.165834,0.41879,,,0.832038,0.540592,0.049174,-0.061595,0.663821,0.707751,0.59297,0.385462,,0.175651,0.730076,-0.424476,0.049529,0.03804,-0.181574,0.096952,0.305316,-0.103366,-0.13066,0.076737,-0.012269,-0.477714,0.059545,-0.094087,-0.020723,0.844336,,-0.024668,,-0.47299,-0.233525,0.104577,0.11661,0.474895,0.168535,0.304306,-0.193782,0.216815,-0.188732,0.155043,0.775285,0.717617,0.519431,0.636631
trigram_overlap_ratio,-0.156139,-0.117438,0.345527,-0.185444,0.011295,0.25145,0.898526,0.487829,1.0,-0.176664,0.171584,-0.019398,-0.430242,-0.472425,-0.441907,-0.456553,-0.398655,-0.408937,-0.162721,-0.191766,-0.186706,-0.139874,-0.088622,-0.310487,-0.251019,-0.364825,-0.233826,,,-0.024401,-0.154263,-0.338404,-0.327451,-0.13186,-0.110339,-0.251007,-0.373622,,-0.165731,0.078714,-0.613117,-0.443563,0.435584,-0.350698,-0.502398,0.19052,0.271513,-0.180604,0.553207,0.437197,-0.491439,0.457171,0.186791,0.338781,-0.000361,,-0.217508,,0.187462,-0.333242,0.388522,-0.402896,0.027035,0.273297,0.755337,-0.08918,0.2347,-0.422549,-0.168983,-0.062675,0.300522,-0.237871,-0.192256
quotes_count,0.880832,0.903333,-0.322865,0.862056,0.756356,0.648955,-0.109511,0.381392,-0.176664,1.0,0.093683,0.731815,0.791048,0.786435,0.761103,0.739137,0.652373,0.535046,0.875057,0.892539,0.889204,0.88001,0.868242,-0.156174,0.981243,0.326907,0.341511,,,0.560993,0.928415,0.079532,-0.192024,0.896994,0.788764,0.846243,0.352619,,0.908186,0.720415,-0.09353,0.018365,-0.222049,0.285575,0.537833,0.096348,-0.298191,0.048601,-0.47457,-0.280186,-0.012925,-0.312879,-0.086152,-0.122855,0.60708,,0.884985,,-0.676731,-0.237469,0.053021,0.24518,0.630194,0.185982,-0.288915,-0.38293,0.226495,0.129517,0.476503,0.724613,0.637921,0.907542,0.837903


In [56]:
fe_columns

['summary_length',
 'splling_err_num',
 'prompt_length',
 'length_ratio',
 'word_overlap_count',
 'bigram_overlap_count',
 'bigram_overlap_ratio',
 'trigram_overlap_count',
 'trigram_overlap_ratio',
 'quotes_count',
 'content_pred',
 'wording_pred',
 'cohesion',
 'syntax',
 'vocabulary',
 'phraseology',
 'grammar',
 'conventions',
 'num_words',
 'num_unique_words',
 'num_chars',
 'num_stopwords',
 'num_punctuations',
 'num_words_upper',
 'num_words_title',
 'mean_word_len',
 'mean_num_unique_words',
 'num_paragraphs',
 'num_slash',
 'syntax_count',
 'num_sentences',
 'polarity',
 'subjectivity',
 'nn_count',
 'pr_count',
 'vb_count',
 'jj_count',
 'uh_count',
 'cd_count',
 'diff_emb_sb',
 'diff_emb_qa',
 'ch',
 'rw',
 'automated_readability_index',
 'coleman_liau_index',
 'smog_index',
 'flesch_reading_ease',
 'flesch_kincaid_grade',
 'dale_chall_readability_score',
 'linsear_write_formula',
 'gunning_fog',
 'text_standard_float',
 'spache_readability',
 'rix',
 'lix',
 'comma_count',


In [57]:
targets = ["content", "wording"]
features=['summary_length',
 'splling_err_num',
 'prompt_length',
 'length_ratio',
 'word_overlap_count',
 'bigram_overlap_count',
 'bigram_overlap_ratio',
 'trigram_overlap_count',
 'trigram_overlap_ratio',
 'quotes_count',
 'content_pred',
 'wording_pred']

In [58]:
model_dict = {}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):

        X_train_cv = train[train["fold"] != fold][features]
        y_train_cv = train[train["fold"] != fold][target]

        X_eval_cv = train[train["fold"] == fold][features]
        y_eval_cv = train[train["fold"] == fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)
        params = {
            'boosting_type': 'gbdt',
            'random_state': 42,
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.048,
            'max_depth': 3, 
            'lambda_l1': 0.0,
            'lambda_l2': 0.011
        }

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                               lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        print(lgb.plot_importance(model,max_num_features=30))
        models.append(model)
    model_dict[target] = models

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 7, number of used features: 0
[LightGBM] [Info] Start training from score 0.774532
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	train's rmse: 2.0082


In [None]:
# cv
rmses = []

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train[train["fold"] == fold][features]
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

In [None]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test[features]

        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict[target] = preds

In [None]:
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

In [None]:
test[["student_id", "content", "wording"]].to_csv("submission1.csv", index=False)

In [None]:
targets = ["content", "wording"]
features=['summary_length',
 'splling_err_num',
 'prompt_length',
 'length_ratio',
 'word_overlap_count',
 'bigram_overlap_count',
 'bigram_overlap_ratio',
 'trigram_overlap_count',
 'trigram_overlap_ratio',
 'quotes_count',
 'content_pred',
 'cohesion',
 'syntax',
 'vocabulary',
 'phraseology',
 'grammar',
 'conventions',
 'wording_pred',
 'num_words',
 'num_unique_words',
 'num_chars',
 'num_stopwords',
 'num_punctuations',
 'num_words_upper',
 'num_words_title',
 'mean_word_len',
 'mean_num_unique_words',
 'num_paragraphs',
 'num_slash',
 'syntax_count',
 'num_sentences',
 'polarity',
 'subjectivity',
 'nn_count',
 'pr_count',
 'vb_count',
 'jj_count',
 'uh_count',
 'cd_count',
 'diff_emb_sb',
 'diff_emb_qa',
 'ch',
 'rw',
 'automated_readability_index',
 'coleman_liau_index',
 'smog_index',
 'flesch_reading_ease',
 'flesch_kincaid_grade',
 'dale_chall_readability_score',
 'linsear_write_formula',
 'gunning_fog',
 'text_standard_float',
 'spache_readability',
 'rix',
 'lix']

In [None]:
model_dict = {}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):

        X_train_cv = train[train["fold"] != fold][features]
        y_train_cv = train[train["fold"] != fold][target]

        X_eval_cv = train[train["fold"] == fold][features]
        y_eval_cv = train[train["fold"] == fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)
        params = {
            'boosting_type': 'gbdt',
            'random_state': 42,
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.048,
            'max_depth': 3, 
            'lambda_l1': 0.0,
            'lambda_l2': 0.011
        }

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                               lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        print(lgb.plot_importance(model,max_num_features=30))
        models.append(model)
    model_dict[target] = models

In [None]:
# cv
rmses = []

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train[train["fold"] == fold][features]
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

In [None]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test[features]

        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict[target] = preds

In [None]:
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

In [None]:
test[["student_id", "content", "wording"]].to_csv("submission2.csv", index=False)

In [None]:
targets = ["content", "wording"]
features=fe_columns

In [None]:
model_dict = {}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):

        X_train_cv = train[train["fold"] != fold][features]
        y_train_cv = train[train["fold"] != fold][target]

        X_eval_cv = train[train["fold"] == fold][features]
        y_eval_cv = train[train["fold"] == fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)
        params = {
            'boosting_type': 'gbdt',
            'random_state': 42,
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.048,
            'max_depth': 3, 
            'lambda_l1': 0.0,
            'lambda_l2': 0.011
        }

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                               lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        print(lgb.plot_importance(model,max_num_features=30))
        models.append(model)
    model_dict[target] = models

In [None]:
# cv
rmses = []

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train[train["fold"] == fold][features]
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

In [None]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test[features]

        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict[target] = preds

In [None]:
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

In [None]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)