In [None]:
!pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"
!pip install /kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl
!pip install /kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl

In [None]:
# for new feat from feedback
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

import nltk
from nltk.tokenize import word_tokenize

# old 
import os

import textstat
import numpy as np
import pandas as pd

In [None]:
class CFG1:
    """Settings for the ``microsoft/deberta-v3-base`` fold checkpoints stored under ``path``."""

    # Model identity and on-disk locations.
    model = "microsoft/deberta-v3-base"
    path = "../input/0911-deberta-v3-base/"                    # fold checkpoints + oof_df.pkl
    base = "../input/fb3models/microsoft-deberta-v3-base/"     # config + tokenizer files
    config_path = f"{base}config/config.json"
    tokenizer = AutoTokenizer.from_pretrained(f"{base}tokenizer/")

    # Columns predicted by the checkpoints (six regression heads).
    target_cols = [
        'cohesion', 'syntax', 'vocabulary',
        'phraseology', 'grammar', 'conventions',
    ]

    # Inference / bookkeeping options.
    gradient_checkpointing = False
    batch_size = 24
    num_workers = 0
    seed = 42
    n_fold = 10
    trn_fold = [*range(n_fold)]   # every fold index 0..n_fold-1
    weight = 1.0                  # ensemble weight of this model

class CFG8:
    """Settings for the ``microsoft/deberta-v3-large`` fold checkpoints stored under ``path``."""

    # Model identity and on-disk locations.
    model = "microsoft/deberta-v3-large"
    path = "../input/0925-deberta-v3-large-unscale/"           # fold checkpoints + oof_df.pkl
    base = "../input/fb3models/microsoft-deberta-v3-large/"    # config + tokenizer files
    config_path = f"{base}config/config.json"
    tokenizer = AutoTokenizer.from_pretrained(f"{base}tokenizer/")

    # Columns predicted by the checkpoints (six regression heads).
    target_cols = [
        'cohesion', 'syntax', 'vocabulary',
        'phraseology', 'grammar', 'conventions',
    ]

    # Inference / bookkeeping options.
    gradient_checkpointing = False
    batch_size = 16
    num_workers = 0
    seed = 42
    n_fold = 10
    trn_fold = [*range(n_fold)]   # every fold index 0..n_fold-1
    weight = 1.0                  # ensemble weight of this model (was 1.2)

# Ensemble members, in the order the OOF report and get_feedback_feat iterate over them.
CFG_list = [CFG1, CFG8]

In [None]:
def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Parameters
    ----------
    eval_pred : tuple
        ``(preds, labels)``: two array-likes of identical 2-D shape with at
        least two columns. Column 0 is reported as content and column 1 as
        wording. numpy arrays, DataFrames and nested lists are all accepted.

    Returns
    -------
    dict
        ``content_rmse``, ``wording_rmse`` (RMSE of columns 0 and 1) and
        ``mcrmse`` (mean of the per-column RMSEs).
    """
    preds, labels = eval_pred

    # Work on plain ndarrays. With pandas inputs the subtraction would align on
    # column labels (all-NaN when the names differ) and `col_rmse[0]` would be a
    # label lookup on a Series instead of a positional one.
    preds = np.asarray(preds, dtype=float)
    labels = np.asarray(labels, dtype=float)

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [None]:
# FROM OLD 
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean columnwise RMSE between two 2-D arrays of identical shape.

    The per-column RMSE is computed directly with numpy rather than via
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword is
    deprecated in recent scikit-learn releases and later removed, so the old
    call breaks depending on the installed version.

    Parameters
    ----------
    y_trues, y_preds : array-like, shape (n_samples, n_targets)
        Ground truth and predictions; columns are compared pairwise.

    Returns
    -------
    (mcrmse_score, scores)
        ``scores`` is a list with one RMSE (float) per column and
        ``mcrmse_score`` is their mean.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    # RMSE per column: mean over samples first (axis 0), then square root.
    scores = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)).tolist()
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def get_score(y_trues, y_preds):
    """Return ``(overall, per_column)`` exactly as produced by ``MCRMSE``."""
    overall, per_column = MCRMSE(y_trues, y_preds)
    return overall, per_column

def get_logger(filename='inference'):
    """Return this module's logger, writing bare messages to the console and to a file.

    ``getLogger(__name__)`` hands back the same logger object on every call, so
    handlers left over from an earlier call (for example after re-running the
    cell) are removed and closed first. Without that, every call would add two
    more handlers and each message would be emitted multiple times.

    Parameters
    ----------
    filename : str
        Log file path without extension; ``".log"`` is appended.

    Returns
    -------
    logging.Logger
        Logger at INFO level with exactly one stream handler and one file handler.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Drop handlers from a previous call so output is not duplicated.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

# Shared notebook logger; with the default filename it also writes to "inference.log".
LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU + CUDA) RNGs and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` to the environment as the string form of ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed every RNG once, before any model or data work happens.
seed_everything(seed=42)

# ====================================================
# oof
# ====================================================
# Log the stored out-of-fold MCRMSE of every ensemble member.
# The loop variable and the frames below stay behind as notebook globals.
for CFG in CFG_list:
    # NOTE(review): read_pickle executes whatever the pickle contains — only load trusted files.
    oof_df = pd.read_pickle(CFG.path+'oof_df.pkl')
    labels = oof_df[CFG.target_cols].values
    # Predictions are stored in columns named "pred_<target>".
    preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Model: {CFG.model} Score: {score:<.4f}  Scores: {scores}')
    
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenise ``text`` with ``cfg.tokenizer`` and turn every field into a ``torch.long`` tensor.

    Special tokens are added. No max length, padding or truncation is requested
    here, so the encoded sequence keeps its full length.

    Returns the tokenizer's own mapping object, updated in place.
    """
    encoded = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
    )
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TestDataset(Dataset):
    """Dataset over the ``full_text`` column of ``df``; each item is tokenised on access."""

    def __init__(self, cfg, df):
        # Keep the raw strings only; tokenisation happens in __getitem__.
        self.texts = df['full_text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        return prepare_input(self.cfg, text)

# new for CL
class CommonlitDataset(Dataset):
    """Dataset over one text column of ``df``; each item is tokenised on access.

    Parameters
    ----------
    cfg : object
        Configuration passed to ``prepare_input`` (which reads ``cfg.tokenizer``).
    df : pandas.DataFrame
        Frame holding the texts.
    text_col : str, default ``'text'``
        Name of the column to read. The default keeps the previous behaviour.
    """

    def __init__(self, cfg, df, text_col='text'):
        self.cfg = cfg
        self.texts = df[text_col].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Tokenise lazily so only the raw strings are held in memory.
        return prepare_input(self.cfg, self.texts[item])
    
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = (last_hidden_state * mask).sum(dim=1)
        # Clamp the token count so a fully masked row divides by 1e-9 rather than 0.
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts

class MaxPooling(nn.Module):
    """Element-wise maximum over the sequence axis, with masked positions set to -1e4 first."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        is_padding = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill is out-of-place, so the caller's tensor is left untouched.
        filled = last_hidden_state.masked_fill(is_padding, -1e4)
        return filled.max(dim=1).values
    
class MinPooling(nn.Module):
    """Element-wise minimum over the sequence axis, ignoring masked (padding) positions.

    Padding positions are overwritten with a large positive value (``1e4``,
    mirroring the ``-1e4`` used by ``MaxPooling``) so they cannot be selected
    as the minimum. The previous fill value ``1e-4`` was a small positive
    number, which made padding win whenever all real activations in a feature
    were larger than it.
    """

    def __init__(self):
        super(MinPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Clone so the caller's hidden states are not modified.
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = 1e4
        min_embeddings, _ = torch.min(embeddings, dim = 1)
        return min_embeddings
        

class CustomModel(nn.Module):
    """Transformer backbone followed by ``MeanPooling`` and a 6-output linear head.

    Parameters
    ----------
    cfg : object
        Reads ``cfg.model`` (model name) and ``cfg.gradient_checkpointing``.
    config_path : str or None
        When given, the backbone config is loaded from this path. Otherwise it
        is loaded from ``cfg.model``, all four dropout fields are set to 0 and
        the config is logged.
    pretrained : bool
        ``True`` loads backbone weights from ``cfg.model``; ``False`` builds the
        backbone from the config only.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
        else:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            for dropout_field in ("hidden_dropout", "hidden_dropout_prob",
                                  "attention_dropout", "attention_probs_dropout_prob"):
                setattr(self.config, dropout_field, 0.)
            LOGGER.info(self.config)

        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module; other types are left as-is."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # Keep the padding row at exactly zero.
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and mean-pool its first output using ``inputs['attention_mask']``."""
        backbone_out = self.model(**inputs)
        return self.pool(backbone_out[0], inputs['attention_mask'])

    def forward(self, inputs):
        return self.fc(self.feature(inputs))
    
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and return stacked predictions.

    The loop iterates over the tqdm wrapper itself. Previously the bar was
    created but the raw loader was iterated, so the progress bar never moved.

    Parameters
    ----------
    test_loader : iterable with ``len()``
        Yields dict-like batches of tensors.
    model : torch.nn.Module
        Called as ``model(batch)``; it is switched to eval mode and moved to ``device``.
    device : torch.device
        Device the model and every batch tensor are moved to.

    Returns
    -------
    numpy.ndarray
        Per-batch outputs concatenated along axis 0, in loader order.
    """
    model.eval()
    model.to(device)
    preds = []
    progress = tqdm(test_loader, total=len(test_loader))
    for inputs in progress:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [None]:
# Raw competition tables: prompts and student summaries for the train and test splits.
prompts_train = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv")
prompts_test = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv")

summaries_train = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")
summaries_test = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")

# Template for the submission file; its target columns are overwritten at the end of the notebook.
sample_submission = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")

# Attach the prompt columns to each summary. A left join keeps every summary row;
# NOTE(review): this relies on prompt_id being unique in the prompts tables — confirm.
train = summaries_train.merge(prompts_train, how="left", on="prompt_id")
test = summaries_test.merge(prompts_test, how="left", on="prompt_id")

display(prompts_train, train, test)

Let's create some features from text statistics and feed them to a classic ML algorithm.

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Mask-aware mean of the token embeddings stored in ``model_output[0]``."""
    token_embeddings = model_output[0]
    # Expand the mask over the hidden dimension so padded tokens contribute nothing.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # guards against division by zero
    return summed / counts


# Load the sentence-embedding tokenizer and model from a local Kaggle dataset directory
# (both paths are local, so nothing is fetched from the HuggingFace Hub here).
# NOTE(review): get_emb reads `tokenizer` and `model` as notebook globals, so any later
# cell that rebinds either name changes what get_emb uses.
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/sentence-transformers/minilm-l6-v2/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('/kaggle/input/sentence-transformers/minilm-l6-v2/all-MiniLM-L6-v2')
# Embedding cache used by get_emb, keyed by the exact input string.
MEMORY = {}

def get_emb(sentences):
    """Return the L2-normalised, mean-pooled embedding of one string as a 1-D numpy array.

    Results are memoised in the module-level ``MEMORY`` dict under the exact
    input string, so a repeated string is embedded only once. The input is
    tokenised with ``truncation=True``.
    """
    try:
        return MEMORY[sentences]
    except KeyError:
        pass

    # Single-item batch through the tokenizer and the encoder.
    batch = tokenizer([sentences], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        encoder_out = model(**batch)

    pooled = mean_pooling(encoder_out, batch['attention_mask'])
    unit_norm = F.normalize(pooled, p=2, dim=1)
    vector = unit_norm[0].detach().cpu().numpy()

    MEMORY[sentences] = vector
    return vector

# Stopword list read from the "list_of_stopwords" column of the CSV
# (presumably the NLTK English list, judging by the file name); used by get_stopwords_rel.
STOPWORDS = pd.read_csv("/kaggle/input/nltk-english-stopwords/nltk_eng_stopwords.csv")["list_of_stopwords"].tolist()
def get_stopwords_rel(text):
    """Return the fraction of tokens in ``text`` that appear in ``STOPWORDS``.

    Tokens come from ``word_tokenize`` and are compared exactly as produced
    (no lower-casing). Text that yields no tokens returns ``0.0`` instead of
    raising ``ZeroDivisionError``.
    """
    text_words = word_tokenize(text)
    if not text_words:
        return 0.0

    # A set gives O(1) membership checks instead of scanning the list for every token.
    stopwords = set(STOPWORDS)
    num_stopwords = sum(word in stopwords for word in text_words)

    return num_stopwords/len(text_words)

In [None]:
def get_feedback_feat(data):
    """Add the ensemble's predictions to ``data`` as new columns, one per target.

    For each config in ``CFG_list``, every fold checkpoint listed in
    ``cfg.trn_fold`` is loaded and run over ``data['text']``. Fold predictions
    are averaged per config, and the per-config averages are blended using
    ``cfg.weight`` normalised by the sum of all weights. With equal weights
    this is the plain mean, which is what the function computed before.

    Changes from the previous version:
    * the output columns come from ``CFG1.target_cols`` (the same list that
      sizes the accumulator) instead of the loop variable left over after the
      loop ended;
    * ``cfg.weight`` is honoured (it was declared but never read);
    * each fold's model and checkpoint are released before the next is built.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``text`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the target columns added.
    """
    target_cols = CFG1.target_cols
    total_weight = sum(getattr(cfg, "weight", 1.0) for cfg in CFG_list)
    blended = np.zeros((data.shape[0], len(target_cols)))

    for cfg in CFG_list:
        dataset = CommonlitDataset(cfg, data)
        loader = DataLoader(dataset,
                            batch_size=cfg.batch_size,
                            shuffle=False,  # keep row order so predictions line up with `data`
                            collate_fn=DataCollatorWithPadding(tokenizer=cfg.tokenizer, padding='longest'),
                            num_workers=cfg.num_workers, pin_memory=True, drop_last=False)

        fold_predictions = []
        for fold in cfg.trn_fold:
            print('='*10, f'started fold {fold}', '='*10)
            model = CustomModel(cfg, config_path=cfg.config_path, pretrained=False)
            # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
            state = torch.load(cfg.path+f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth",
                               map_location=torch.device('cpu'))
            model.load_state_dict(state['model'])
            fold_predictions.append(inference_fn(loader, model, device))
            # Release this fold's weights before the next checkpoint is loaded.
            del model, state
            gc.collect()
            torch.cuda.empty_cache()

        blended += np.mean(fold_predictions, axis=0) * getattr(cfg, "weight", 1.0) / total_weight
        del dataset, loader, fold_predictions
        gc.collect()
        torch.cuda.empty_cache()

    data[target_cols] = blended
    return data

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

def get_stat_features(df, text_col="text"):
    """Add hand-crafted text statistics, an embedding distance and readability scores to ``df``.

    Every feature is computed from ``df[text_col]``. ``paragraph_count`` used
    to read the global ``train`` frame, so calling this on the test set copied
    index-aligned values from the training data. ``diff_emb`` also used to
    hard-code the ``"text"`` column. Both now follow ``df`` / ``text_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text_col`` and ``prompt_text``. It is modified in place.
    text_col : str
        Name of the column holding the text to describe.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Columns are added in a fixed order.
    """
    text = df[text_col]

    # --- simple counts (sentences are approximated by splitting on '.') ---
    df["num_unique_words"] = text.apply(lambda x: len(set(x.split())))
    df["num_words"] = text.apply(lambda x: len(x.split()))
    df["num_sentences"] = text.apply(lambda x: len(x.split('.')))
    # x[:1] rather than x[0] so an empty string gives False instead of IndexError.
    df["isupper"] = text.apply(lambda x: x[:1].isupper())
    df["mean_num_words"] = text.apply(lambda x: np.mean([len(e.split()) for e in x.split('.')]))
    df["mean_num_unique_words"] = text.apply(lambda x: np.mean([len(set(e.split())) for e in x.split('.')]))
    df["num_slash"] = text.apply(lambda x: x.count("\n"))
    df["paragraph_count"] = text.apply(lambda x: x.count("\n\n"))
    df["upper_count"] = text.apply(lambda x: np.sum([w.isupper() for w in x.split()])/len(x.split()))
    df["syntax_count"] = text.apply(lambda x: x.count(",") + x.count("-") + x.count(";") + x.count(":"))

    # --- 1 - dot product of the unit-norm embeddings of text and prompt (cosine distance) ---
    df['diff_emb'] = df.progress_apply(
        lambda x: 1 - np.sum(get_emb(x[text_col])*get_emb(x["prompt_text"])), axis=1)

    # --- textstat readability scores: (output column, scorer), in output order ---
    readability_scorers = [
        ('automated_readability_index', lambda x: textstat.automated_readability_index(x)),
        ('coleman_liau_index', lambda x: textstat.coleman_liau_index(x)),
        ('smog_index', lambda x: textstat.smog_index(x)),
        ('dale_chall_readability_score', lambda x: textstat.dale_chall_readability_score(x)),
        ('linsear_write_formula', lambda x: textstat.linsear_write_formula(x)),
        ('gunning_fog', lambda x: textstat.gunning_fog(x)),
        ('text_standard_float', lambda x: textstat.text_standard(x, float_output=True)),
        ('spache_readability', lambda x: textstat.spache_readability(x)),
        ('rix', lambda x: textstat.rix(x)),
        ('lix', lambda x: textstat.lix(x)),
    ]
    for column, scorer in readability_scorers:
        df[column] = text.progress_apply(scorer)

    # --- share of tokens that are stopwords ---
    df["stopwords_rel"] = text.progress_apply(lambda x: get_stopwords_rel(x))

    return df
    
# Both helpers add their columns to the frame they are given and return that same
# object, so these assignments keep `train` / `test` bound to the enriched frames.
train = get_stat_features(train)
test = get_stat_features(test)

# Adds one column per DeBERTa target (the CFG target_cols), averaged over folds and models.
train = get_feedback_feat(train)
test = get_feedback_feat(test)

Let's look at the correlation matrix:

In [None]:
# Identifier and prompt columns that must not be used as model inputs.
NO_FEATURES = ["student_id", "prompt_id", "prompt_question", "prompt_title", "prompt_text"]
# The two regression targets.
TARGETS = ["content", "wording"]
# Everything else is a feature, including the raw "text" column
# (CatBoost consumes it as a text feature in the training cell).
FEATURES = [col for col in train.columns if col not in NO_FEATURES + TARGETS]

# Restrict to numeric/bool columns before correlating: FEATURES contains the string
# "text" column, and pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr, so it would raise instead.
corr = train[FEATURES + TARGETS].select_dtypes(include=["number", "bool"]).corr()
corr.style.background_gradient(cmap='coolwarm')

Train CatBoost and check the metric:

In [None]:
from sklearn.model_selection import GroupKFold
from catboost import CatBoostRegressor

# Number of GroupKFold splits; also the divisor when averaging test predictions.
N_SPLITS = 4

models = []
gfk = GroupKFold(n_splits=N_SPLITS)
train_oof = np.zeros((len(train), len(TARGETS)))   # out-of-fold predictions
test_pred = np.zeros((len(test), len(TARGETS)))    # running mean of fold predictions on test
X, y = train[FEATURES], train[TARGETS]
X_test = test[FEATURES]

# Group by prompt so each validation fold contains only prompts unseen during training.
for train_index, val_index in gfk.split(train, groups=train["prompt_id"]):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # Named `cb_model` rather than `model`: get_emb reads the global `model`
    # (the sentence encoder), which this loop used to overwrite.
    cb_model = CatBoostRegressor(random_state=42, max_depth=4,
                                 objective="MultiRMSE", text_features=["text"])
    cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), metric_period=100)
    models.append(cb_model)

    train_oof[val_index] = cb_model.predict(X_val)
    test_pred += cb_model.predict(X_test) / N_SPLITS

Checking competition metric:

In [None]:
compute_mcrmse((train_oof, train[TARGETS]))

In [None]:
# baseline version 4 + CFG1 - {'content_rmse': 0.5228984773032009, 'wording_rmse': 0.762959620336589, 'mcrmse': 0.642929048819895} - LB 0.536
# baseline version 5 + CFG8 - {'content_rmse': 0.5337838213526346, 'wording_rmse': 0.7754182781358363, 'mcrmse': 0.6546010497442354} - LB 0.542

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_importance(df, best_model, height, top_n=50):
    """Box-plot feature importances across several fitted models and return their means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns name the features, in the order the models saw them.
    best_model : sequence
        Fitted models, each exposing ``feature_importances_``.
    height : float
        Figure height allotted to each plotted feature.
    top_n : int
        Maximum number of features to show (capped at the number of columns).

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance`` (mean over models) for the
        plotted features, sorted by importance in descending order.
    """
    top_n = min(top_n, df.shape[1])

    # Collect the importances from the models: one column per model, indexed by feature.
    per_model = pd.DataFrame(index=df.columns, columns=[])
    for i, fitted in enumerate(best_model):
        per_model[f"m_{i}"] = fitted.feature_importances_

    # Long format: one row per (feature, model) pair.
    long_fi = per_model.stack().reset_index().iloc[:, [0, 2]]
    long_fi.columns = ["feature", "importance"]

    # Fix the feature order (descending mean importance) and keep only the top n for plotting.
    mean_by_feature = long_fi.groupby("feature")["importance"].mean()
    cols_ord = mean_by_feature.sort_values(ascending=False).index.tolist()[:top_n]

    long_fi = long_fi[long_fi["feature"].isin(cols_ord)]
    print( "Всего признаков {} Усреднее по {}-ти моделям: ".format(len(cols_ord), len(best_model)))

    # Draw one horizontal box per feature.
    plt.figure(figsize=(10, len(cols_ord) * height))
    sns.boxplot(data=long_fi, y="feature", x="importance", orient="h", order=cols_ord)

    print("На график нанесено топ-{} признаков".format(top_n))
    summary = long_fi.groupby(by=["feature"], as_index=False)["importance"].mean()
    return summary.sort_values(by="importance", ascending=False)


# Box-plot the 40 features with the highest mean importance across the fold models;
# 0.7 is the figure height allotted to each plotted feature.
df_feats_imp = plot_importance(
    train[FEATURES],
    models,
    0.7,
    top_n=40,
)

Not bad! Let's submit it!

In [None]:
# Fill the submission template with the fold-averaged test predictions
# (column 0 = content, column 1 = wording, matching the order of TARGETS).
# NOTE(review): values are assigned positionally, so this assumes sample_submission
# rows are in the same order as `test` — confirm.
sample_submission["content"] = test_pred[:, 0]
sample_submission["wording"] = test_pred[:, 1]

sample_submission.to_csv("submission.csv", index=False)

The main idea of this notebook is to inspire the community not only to train transformers, but also to look for new and interesting solutions!