## Sources
- Regressor idea: https://www.kaggle.com/code/tsunotsuno/updated-debertav3-lgbm-with-feature-engineering
### Previous notebook:
- https://www.kaggle.com/code/josemariasabater/commonlit-roberta-base-with-prompts/edit/run/139550119

In [1]:
!pip3 install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


## Imports and Settings

In [2]:
import numpy as np
import pandas as pd
import warnings
import os
import shutil
import logging
import json
import transformers
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
)
from transformers import DataCollatorWithPadding
from datasets import Dataset, load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch

from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from collections import Counter
import spacy
import pandas as pd
from spellchecker import SpellChecker
import re
import string
import xgboost as xgb
# %load_ext lab_black

# logging settings

# warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# disable_progress_bar()
tqdm.pandas()

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# set random seed
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(seed=42)

### Config class

In [4]:
class CFG:
    # model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
    # model_name = "/kaggle/input/debertav3base"
    model_name = "debertav3base"
    learning_rate = 1.2e-5
    weight_decay = 0.02
    hidden_dropout_prob = 0.1
    attention_probs_dropout_prob = 0.01
    num_train_epochs = 3
    n_splits = 4
    batch_size = 8
    random_seed = 42
    save_steps = 100
    max_length = 512
    use_prompts = False
    warmup_ratio = 0.01

## Load Data

In [5]:
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

# Local

#DATA_DIR = "./data/"

#prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
#prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
#summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
#summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
#sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

In [6]:
# For testing
random_nr = np.random.randint(0, len(prompts_train))
merged_train = pd.merge(summaries_train, prompts_train, how="left", on="prompt_id")
example1 = merged_train.iloc[random_nr].copy()
example1

student_id                                              004e978e639e
prompt_id                                                     3b9047
text               In Egypt, there were many occupations and soci...
content                                                     3.128928
wording                                                     4.231226
prompt_question    In complete sentences, summarize the structure...
prompt_title                               Egyptian Social Structure
prompt_text        Egyptian society was structured like a pyramid...
Name: 2, dtype: object

## Preprocessing Class

### Ideas
Overlaps
Quotes
Length of Summary vs Length of text
Grammar mistakes
Repeated vocabulary inside the summary

In [7]:
class Preprocessor:
    def __init__(self, model_name: str) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        # Spacy NER count
        self.spacy_ner_model = spacy.load(
            "en_core_web_sm",
        )
        self.speller = SpellChecker()
        self.STOP_WORDS = set(stopwords.words("english"))
        self.vectorizer = CountVectorizer

    def count_text_length(self, df: pd.DataFrame, column: str) -> pd.Series:
        return df[column].progress_apply(lambda x: len(self.tokenizer.encode(x)))

    def non_stop_word_overlap(self, row: pd.Series) -> float:
        """intersection(prompt_text, text) after removing stop words"""

        def check_is_stop_word(word):
            normalized_word = word.lower().strip("▁")
            return (
                normalized_word not in self.STOP_WORDS
                and normalized_word not in string.punctuation
            )

        prompt_words = row["prompt_tokens"]
        summary_words = row["summary_tokens"]
        # Remove stop words
        prompt_words = list(filter(check_is_stop_word, prompt_words))
        summary_words = list(filter(check_is_stop_word, summary_words))

        return len(set(prompt_words).intersection(set(summary_words)))

    def ngrams(self, input_list: list, n: int) -> list[str]:
        """Returns a list of ngrams"""
        ngrams_ = zip(*[input_list[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams_]

    def get_ngram_overlap(self, row: pd.Series, n: int) -> float:
        """Returns the ngram overlap between prompt and summary"""
        summary_ngrams = self.ngrams(row["summary_tokens"], n)
        prompt_ngrams = self.ngrams(row["prompt_tokens"], n)

        return len(set(summary_ngrams).intersection(set(prompt_ngrams)))

    def get_ner_overlap(self, row: pd.Series) -> float:
        """Returns the number of overlapping named entities between prompt and summary"""
        prompt_doc = self.spacy_ner_model(row["prompt_text"])
        summary_doc = self.spacy_ner_model(row["text"])

        prompt_entities = set([ent.text.lower() for ent in prompt_doc.ents])
        summary_entities = set([ent.text.lower() for ent in summary_doc.ents])

        return len(prompt_entities.intersection(summary_entities))

    def get_spelling_error_count(self, row: pd.Series) -> float:
        """Returns the number of spelling errors in the summary"""
        summary_text = row["text"]
        text = "".join(char for char in summary_text if char not in string.punctuation)
        misspelled = self.speller.unknown(text.split())
        return len(misspelled)
    
    def get_cosine_similarity(self, row: pd.Series) -> float:
        summary_text = row["text"]
        prompt_text = row["prompt_text"]
        all_text = [summary_text, prompt_text]
        self.vectorizer.fit(all_text)

        vec1 = self.vectorizer.transform([summary_text]).toarray()
        vec2 = self.vectorizer.transform([prompt_text]).toarray()
        return cosine_similarity(vec1, vec2)[0][0]

    def run(self, prompts: pd.DataFrame, summaries: pd.DataFrame) -> pd.DataFrame:
        # Tokenize

        tqdm.pandas(desc="Tokenizing Prompts")
        prompts["prompt_tokens"] = prompts["prompt_text"].progress_apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x), skip_special_tokens=True
            )
        )
        tqdm.pandas(desc="Tokenizing Summaries")
        summaries["summary_tokens"] = summaries["text"].progress_apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x), skip_special_tokens=True
            )
        )

        merged_df = pd.merge(summaries, prompts, how="left", on="prompt_id")

        # Count text length

        merged_df["prompt_length"] = self.count_text_length(merged_df, "prompt_text")
        merged_df["summary_length"] = self.count_text_length(merged_df, "text")

        # Count non-stop word overlap
        tqdm.pandas(desc="Counting non-stop word overlap")
        merged_df["non_stop_word_overlap"] = merged_df.progress_apply(
            self.non_stop_word_overlap, axis=1
        )

        # Count ngram overlap
        tqdm.pandas(desc="Counting unigram overlap")
        merged_df["unigram_overlap"] = merged_df.progress_apply(
            lambda x: self.get_ngram_overlap(x, 1), axis=1
        )
        tqdm.pandas(desc="Counting bigram overlap")
        merged_df["bigram_overlap"] = merged_df.progress_apply(
            lambda x: self.get_ngram_overlap(x, 2), axis=1
        )
        tqdm.pandas(desc="Counting trigram overlap")
        merged_df["trigram_overlap"] = merged_df.progress_apply(
            lambda x: self.get_ngram_overlap(x, 3), axis=1
        )

        # Count named entity overlap
        tqdm.pandas(desc="Counting named entity overlap")
        merged_df["ner_overlap"] = merged_df.progress_apply(
            self.get_ner_overlap, axis=1
        )

        # Count spelling errors
        tqdm.pandas(desc="Counting spelling errors")
        merged_df["spelling_error_count"] = merged_df.progress_apply(
            self.get_spelling_error_count, axis=1
        )

        # Summary/Prompt token length ratio
        merged_df["token_length_ratio"] = (
            merged_df["summary_length"] / merged_df["prompt_length"]
        )
        tqdm.pandas(desc="Calculating cosine similarity")
        merged_df["cosine_similarity"] = merged_df.progress_apply(
        self.get_cosine_similarity, axis=1)
        
        return merged_df
    

Preprocessor = Preprocessor(CFG.model_name)



In [8]:
train = Preprocessor.run(prompts_train, summaries_train)
test = Preprocessor.run(prompts_test, summaries_test)

Tokenizing Prompts: 100%|██████████| 4/4 [00:00<00:00, 59.21it/s]
Tokenizing Summaries: 100%|██████████| 7165/7165 [00:12<00:00, 577.13it/s]
Tokenizing Summaries: 100%|██████████| 7165/7165 [00:16<00:00, 441.12it/s]
Tokenizing Summaries: 100%|██████████| 7165/7165 [00:02<00:00, 3485.48it/s]
Counting non-stop word overlap: 100%|██████████| 7165/7165 [00:04<00:00, 1641.89it/s]
Counting unigram overlap: 100%|██████████| 7165/7165 [00:01<00:00, 5921.58it/s]
Counting bigram overlap: 100%|██████████| 7165/7165 [00:02<00:00, 3207.48it/s]
Counting trigram overlap: 100%|██████████| 7165/7165 [00:02<00:00, 2810.89it/s]
Counting named entity overlap: 100%|██████████| 7165/7165 [16:52<00:00,  7.07it/s]
Counting spelling errors: 100%|██████████| 7165/7165 [00:02<00:00, 3496.47it/s]
Calculating cosine similarity:   0%|          | 1/7165 [00:00<00:40, 178.19it/s]


In [None]:
train.head()

### Metrics functions

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}


def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }


def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred) ** (1 / 2)
    wording_score = mean_squared_error(wording_true, wording_pred) ** (1 / 2)

    return (content_score + wording_score) / 2

## Train LLM

In [None]:
class SummaryRegressor:
    def __init__(
        self,
        model_name: str,
        model_dir: str,
        target: list[str],
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        max_length: int,
    ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "text"]
        self.input_col = "input"

        self.target = target
        self.target_cols = target

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"/kaggle/input/{model_name}")
        #self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        #self.model_config = AutoConfig.from_pretrained(model_name)
        self.model_config.update(
            {
                "hidden_dropout_prob": CFG.hidden_dropout_prob,
                "attention_probs_dropout_prob": CFG.attention_probs_dropout_prob,
                "num_labels": 2,
                "problem_type": "regression",
            }
        )
        self.data_collator = DataCollatorWithPadding(self.tokenizer)

        seed_everything(seed=42)
        print(self.target[0])

    def tokenize_function(self, examples: pd.DataFrame):
        labels = [examples["content"], examples["wording"]]
        tokenized = self.tokenizer(
            examples[self.input_col],
            padding=False,
            truncation=True,
            max_length=self.max_length,
        )
        return {
            **tokenized,
            "labels": labels,
        }

    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(
            examples[self.input_col],
            padding=False,
            truncation=True,
            max_length=self.max_length,
        )
        return tokenized

    def train(
        self,
        fold: int,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        batch_size: int,
        learning_rate: float,
        weight_decay: float,
        num_train_epochs: float,
        save_steps: int,
    ) -> None:
        sep = self.tokenizer.sep_token
        train_df[self.input_col] = (
            train_df["prompt_title"]
            + sep
            + train_df["prompt_question"]
            + sep
            + train_df["text"]
        )
        val_df[self.input_col] = (
            val_df["prompt_title"]
            + sep
            + val_df["prompt_question"]
            + sep
            + val_df["text"]
        )

        train_df = train_df[[self.input_col] + self.target_cols]
        val_df = val_df[[self.input_col] + self.target_cols]

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
        val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

        train_dataset_tokenized = train_dataset.map(
            self.tokenize_function, batched=False
        )
        val_dataset_tokenized = val_dataset.map(self.tokenize_function, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"/kaggle/input/{self.model_name}",
            config=self.model_config,
        )

        model_fold_dir = f"{self.model_dir}/fold_{fold}"

        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            save_strategy="steps",
            save_steps=save_steps,
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="rmse",
            greater_is_better=False,
            evaluation_strategy="steps",
            eval_steps=save_steps,
            # disable_tqdm=True,
            report_to="none",
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_dataset_tokenized,
            eval_dataset=val_dataset_tokenized,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator,
        )

        trainer.train()

        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

    def predict(
        self,
        test_df: pd.DataFrame,
        fold: int,
    ) -> np.array:
        sep = self.tokenizer.sep_token
        test_df[self.input_col] = (
            test_df["prompt_title"]
            + sep
            + test_df["prompt_question"]
            + sep
            + test_df["text"]
        )
        test_df = test_df[[self.input_col]]
        test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
        test_dataset_tokenized = test_dataset.map(
            self.tokenize_function_test, batched=False
        )
        model_fold_dir = f"{self.model_dir}/fold_{fold}"
        model_dir = f"{self.model_name}/fold_{fold}"

        model_content = AutoModelForSequenceClassification.from_pretrained(
            self.model_dir
        )
        model_content.eval()

        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train=False,
            do_predict=True,
            per_device_eval_batch_size=4,
            dataloader_drop_last=False,
            # disable_tqdm=True,
        )

        infer_trainer = Trainer(
            model=model_content,
            args=test_args,
            data_collator=self.data_collator,
        )

        predictions = infer_trainer.predict(test_dataset_tokenized)

        return predictions

In [None]:
# Create folds

gkf = GroupKFold(n_splits=CFG.n_splits)

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

train.head()

## Cross Validation

In [None]:
def train_folds(
    train_df: pd.DataFrame,
    model_name: str,
    target: list[str],
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length: int,
    batch_size: int,
    learning_rate: float,
    weight_decay: float,
    num_train_epochs: float,
    save_steps: int,
    n_splits: int,
    save_each_model: bool,
):
    if os.path.exists(f"{model_name}"):
        try:
            shutil.rmtree(model_name)
        except:
            pass
    os.mkdir(model_name)

    for fold in range(n_splits):
        print(f"Training fold {fold}")

        train_data = train_df[train_df["fold"] != fold]
        val_data = train_df[train_df["fold"] == fold]

        if save_each_model:
            model_dir = f"{model_name}/fold_{fold}"
        else:
            model_dir = f"{model_name}/fold_{fold}"

        model = SummaryRegressor(
            model_name=model_name,
            model_dir=model_dir,
            target=target,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        model.train(
            fold=fold,
            train_df=train_data,
            val_df=val_data,
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )


def validate(
    train_df: pd.DataFrame,
    target: list[str],
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length: int,
) -> pd.DataFrame:
    for fold in range(CFG.n_splits):
        if save_each_model:
            model_dir = f"{model_name}/fold_{fold}"
        else:
            model_dir = f"{model_name}/fold_{fold}"
        val_data = train_df[train_df["fold"] == fold]
        print("model name:", model_name)
        print("model dir", model_dir)
        model = SummaryRegressor(
            model_name=model_name,
            model_dir=model_dir,
            target=target,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        predictions = model.predict(test_df=val_data, fold=fold)

        train_df.loc[val_data.index, "content_pred"] = predictions.predictions[:, 0]
        train_df.loc[val_data.index, "wording_pred"] = predictions.predictions[:, 1]

    return train_df


def predict(
    test_df: pd.DataFrame,
    target: list[str],
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length: int,
) -> pd.DataFrame:
    for fold in range(CFG.n_splits):
        if save_each_model:
            model_dir = f"{model_name}/fold_{fold}"
        else:
            model_dir = f"{model_name}/fold_{fold}"
        model = SummaryRegressor(
            model_name=model_name,
            model_dir=model_dir,
            target=target,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        predictions = model.predict(test_df=test_df, fold=fold)

        test_df.loc[:, f"content_pred_{fold}"] = predictions.predictions[:, 0]
        test_df.loc[:, f"wording_pred_{fold}"] = predictions.predictions[:, 1]
    
    test_df["content_pred"] = test_df[[f"content_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)
    test_df["wording_pred"] = test_df[[f"wording_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

    return test_df

In [None]:
target = ["content", "wording"]
train_folds(
    train_df=train,
    model_name=CFG.model_name,
    target=target,
    hidden_dropout_prob=CFG.hidden_dropout_prob,
    attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
    max_length=CFG.max_length,
    batch_size=CFG.batch_size,
    learning_rate=CFG.learning_rate,
    weight_decay=CFG.weight_decay,
    num_train_epochs=CFG.num_train_epochs,
    save_steps=CFG.save_steps,
    n_splits=CFG.n_splits,
    save_each_model=False,
)

train = validate(
    train_df=train,
    target=target,
    save_each_model=False,
    model_name=CFG.model_name,
    hidden_dropout_prob=CFG.hidden_dropout_prob,
    attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
    max_length=CFG.max_length,
)

# print the score
print(
    f"Score: {compt_score(train.content, train.content_pred, train.wording, train.wording_pred)}"
)

test = predict(
    test_df=test,
    target=target,
    save_each_model=False,
    model_name=CFG.model_name,
    hidden_dropout_prob=CFG.hidden_dropout_prob,
    attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
    max_length=CFG.max_length,
)

In [None]:
train.head()

In [None]:
train.to_csv("df_after_llm.csvv")

## Train XGBoost

In [None]:
target = ["content", "wording"]

features = ["content_pred",
            "wording_pred",
            "token_length_ratio",
            "non_stop_word_overlap",
            "unigram_overlap",
            "bigram_overlap", 
            "trigram_overlap",
            "ner_overlap",
            "spelling_error_count",
            "summary_length",
            "cosine_similarity"]
models_dict = {t: [] for t in target}
for t in target:
    for fold in range(CFG.n_splits):
        X_train = train[train["fold"] != fold][features]
        Y_train = train[train["fold"] != fold][t]
        X_val = train[train["fold"] == fold][features]
        Y_val = train[train["fold"] == fold][t]

        dtrain = xgb.DMatrix(X_train, label=Y_train)
        dval = xgb.DMatrix(X_val, label=Y_val)

        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "seed": 42,
            "learning_rate": 0.01,
            "booster": "gbtree"
        }
        evaluation_results = {}

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=10000,
            evals=[(dtrain, "train"), (dval, "val")],
            verbose_eval=100,
            early_stopping_rounds=100,
            evals_result=evaluation_results,
        )
        models_dict[t].append(model)

## Feature importance

In [None]:
import matplotlib.pyplot as plt

for t, models in models_dict.items():
    for fold, model in enumerate(models):
        feature_importance_dict = model.get_score(importance_type='weight')
        
        # Sorting by importance
        sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
        features_, importances = zip(*sorted_features)
        
        plt.figure(figsize=(12, 6))
        plt.barh(features_, importances)
        plt.xlabel('Importance')
        plt.ylabel('Features')
        plt.title(f'Feature Importances for target {t} - Fold {fold + 1}')
        plt.show()


## CV Score


In [None]:
rmses = []

for t in target:
    models = models_dict[t]

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train[train["fold"] == fold][features]
        y_eval_cv = train[train["fold"] == fold][t]
        dmatrix_eval_cv = xgb.DMatrix(X_eval_cv)
        pred = model.predict(dmatrix_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

In [None]:
pred_dict = {}
for t in target:
    models = models_dict[t]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test[features]
        dmatrix_eval_cv = xgb.DMatrix(X_eval_cv)
        pred = model.predict(dmatrix_eval_cv)
        preds.append(pred)
    
    pred_dict[t] = preds

In [None]:
for t in target:
    preds = pred_dict[t]
    for i, pred in enumerate(preds):
        test[f"{t}_pred_{i}"] = pred

    test[t] = test[[f"{t}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

In [None]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)