In [None]:
# !pip install ".inputautocorrect/autocorrect-2.6.1.tar"
# !pip install ".inputpyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

In [None]:
# nltk.download("punkt")

In [None]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

In [None]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and
    configures cuDNN for reproducible kernel selection.

    Args:
        seed: Seed value applied to all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different (possibly
    # non-deterministic) kernels per run, which defeats the seeding above.
    torch.backends.cudnn.benchmark = False
    
seed_everything(seed=42)

## Class CFG

In [None]:
class CFG:
    """Static configuration read by the preprocessing, training and inference cells."""

    # --- backbone and optimisation hyper-parameters ---
    model_name = "debertav3base"
    learning_rate = 1.6e-5
    weight_decay = 0.03
    hidden_dropout_prob = 0.007
    attention_probs_dropout_prob = 0.007
    num_train_epochs = 5

    # --- cross-validation / batching ---
    n_splits = 4
    batch_size = 128
    random_seed = 42
    save_steps = 100
    max_length = 512

    # --- run-mode switches ---
    number_base_model = 2
    test_mode = False   # True -> later cells keep only the first 12 rows
    device = "CPU"      # any other value allows CUDA when it is available
    infer_mode = True

    # Checkpoint directories used for inference; disabled entries are kept
    # as comments for reference.
    list_model_infer = [
        # "upload_model/debertav3base_lr15e-05",
        "upload_model/debertav3base_lr17e-05",   # keep
        "upload_model/debertav3base_lr18e-05",   # keep
        "upload_model/debertav3base_lr21e-05",   # keep
        "upload_model/debertav3base_lr22e-05",   # keep
        # "upload_model/debertav3base_lr5e-05",
        "upload_model/debertav3large_lr12e-05",  # upload
        "upload_model/debertav3large_lr13e-05",  # upload
        # "debertav3large_lr1e-05_save",
        # "debertav3large_lr1e-05_att_0007",
        "debertav3large_lr8e-06_att_0007",       # upload
        "debertav3large_lr9e-06_att_0007",       # upload
        "debertav3large_lr11e-05_att_0007",
        "debertav3large_lr12e-05_att_0007",
        "debertav3large_lr13e-05_att_0007",
        "debertav3large_lr14e-05_att_0007",
        "debertav3large_lr15e-05_att_0007",      # upload
        "debertav3large_lr16e-05_att_0007",      # upload
        "debertav3large_lr17e-05_att_0007",      # upload
        "debertav3large_lr18e-05_att_0007",      # upload
    ]
    

In [None]:
# Pick the torch device: CUDA is used only when the config allows it
# (CFG.device is not 'CPU') and a GPU is actually available.
use_cuda = CFG.device != 'CPU' and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cpu


## Data loading

In [None]:
DATA_DIR = "input/commonlit-evaluate-student-summaries/"

# Inference only needs the test-side CSVs; the train-side files
# (prompts_train.csv / summaries_train.csv) are intentionally not loaded.
prompts_test, summaries_test, sample_submission = (
    pd.read_csv(f"{DATA_DIR}{csv_name}")
    for csv_name in ("prompts_test.csv", "summaries_test.csv", "sample_submission.csv")
)

## Exploratory Data Analysis

In [None]:
# prompts_train.head()

## Preprocess 2


In [None]:
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import ne_chunk, word_tokenize, pos_tag
from bs4 import BeautifulSoup

# nltk.downloader.download('vader_lexicon')
import pyphen
from nltk.sentiment import SentimentIntensityAnalyzer

# Module-level singletons shared by the Preprocessor2 methods below.
# `dic` hyphenates English words; count_syllables() counts the resulting pieces.
dic = pyphen.Pyphen(lang='en')
# `sid` is NLTK's SentimentIntensityAnalyzer; calculate_sentiment_scores() calls
# its polarity_scores(). It needs the 'vader_lexicon' resource (see the
# commented-out download line above).
sid = SentimentIntensityAnalyzer()

class Preprocessor2:
    def __init__(self,
                 model_name: str,
                 ) -> None:
        """Load every tokenizer, model and dictionary the feature methods use.

        Args:
            model_name: Directory name under ``input/`` holding the Hugging
                Face tokenizer files.
        """
        tokenizer_path = f"input/{model_name}"
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        # NLTK helpers: detokenizer and the English stop-word list.
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))

        # spaCy pipeline used for named-entity overlap.
        self.spacy_ner_model = spacy.load('en_core_web_sm')

        # Spelling tools: autocorrect fixes text, pyspellchecker counts unknown words.
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker()
        
    def calculate_text_similarity(self, row):
        """Return the TF-IDF cosine similarity between a row's prompt and summary.

        A fresh TfidfVectorizer is fitted on just the two documents
        (``row['prompt_text']`` and ``row['text']``) for every call.
        """
        documents = [row['prompt_text'], row['text']]
        tfidf = TfidfVectorizer().fit_transform(documents)
        prompt_vec, summary_vec = tfidf[0:1], tfidf[1:2]
        similarity = cosine_similarity(prompt_vec, summary_vec)
        return similarity.flatten()[0]
    
    def sentiment_analysis(self, text):
        """Return TextBlob's ``(polarity, subjectivity)`` pair for ``text``."""
        sentiment = TextBlob(text).sentiment
        return sentiment.polarity, sentiment.subjectivity
    
    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def calculate_unique_words(self,text):
        unique_words = set(text.split())
        return len(unique_words)
    
    def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
        
    def calculate_pos_ratios(self , text):
        """Return, for each POS tag found in ``text``, its share of all tagged tokens.

        Tokens come from ``nltk.word_tokenize`` and tags from ``pos_tag``; a
        text with no tokens yields an empty dict.
        """
        tagged_tokens = pos_tag(nltk.word_tokenize(text))
        token_total = len(tagged_tokens)
        tag_counts = Counter(tag for _, tag in tagged_tokens)
        return {tag: hits / token_total for tag, hits in tag_counts.items()}
    
    def calculate_punctuation_ratios(self,text):
        total_chars = len(text)
        punctuation_counts = Counter(char for char in text if char in '.,!?;:"()[]{}')
        ratios = {char: count / total_chars for char, count in punctuation_counts.items()}
        return ratios
    
    def calculate_keyword_density(self,row):
        keywords = set(row['prompt_text'].split())
        text_words = row['text'].split()
        keyword_count = sum(1 for word in text_words if word in keywords)
        return keyword_count / len(text_words)
    
    def count_syllables(self,word):
        """Estimate a word's syllable count from pyphen's hyphenation points.

        The count is the number of '-'-separated pieces in the hyphenated
        form, so hyphens already present in ``word`` also add pieces.
        """
        return dic.inserted(word).count('-') + 1

    def flesch_reading_ease_manual(self,text):
        """Compute the Flesch Reading Ease score of ``text``.

        Sentences and words come from TextBlob; syllables from
        ``count_syllables``. Returns 0 when the text has no sentences or words.
        """
        blob = TextBlob(text)
        words = blob.words
        sentence_total = len(blob.sentences)
        word_total = len(words)
        syllable_total = sum(self.count_syllables(word) for word in words)

        if sentence_total == 0 or word_total == 0:
            return 0

        return 206.835 - 1.015 * (word_total / sentence_total) - 84.6 * (syllable_total / word_total)
    
    def flesch_kincaid_grade_level(self, text):
        """Compute the Flesch-Kincaid grade level of ``text``.

        Sentences and words come from TextBlob; syllables from
        ``count_syllables``. Returns 0 when the text has no sentences or words.
        """
        blob = TextBlob(text)
        words = blob.words
        sentence_total = len(blob.sentences)
        word_total = len(words)
        syllable_total = sum(self.count_syllables(word) for word in words)

        if sentence_total == 0 or word_total == 0:
            return 0

        return 0.39 * (word_total / sentence_total) + 11.8 * (syllable_total / word_total) - 15.59
    
    def gunning_fog(self, text):
        """Compute the Gunning fog index of ``text``.

        A word counts as complex when ``count_syllables`` gives more than 2.
        Returns 0 when the text has no sentences or words.
        """
        blob = TextBlob(text)
        words = blob.words
        sentence_total = len(blob.sentences)
        word_total = len(words)
        complex_total = sum(1 for word in words if self.count_syllables(word) > 2)

        if sentence_total == 0 or word_total == 0:
            return 0

        return 0.4 * ((word_total / sentence_total) + 100 * (complex_total / word_total))
    
    def calculate_sentiment_scores(self,text):
        """Return the module-level analyzer's ``polarity_scores`` result for ``text``."""
        return sid.polarity_scores(text)
    
    def count_difficult_words(self, text, syllable_threshold=3):
        """Count the words of ``text`` with at least ``syllable_threshold`` syllables.

        Words come from TextBlob; syllables from ``count_syllables``.
        """
        return sum(
            self.count_syllables(word) >= syllable_threshold
            for word in TextBlob(text).words
        )

    def text_cleaning(self, text):
        '''
        Clean text into a basic form for NLP.

        Steps, in order:
        1. Remove embedded URL links.
        2. Remove HTML tags (BeautifulSoup with the lxml parser).
        3. Remove emojis and related symbol ranges.
        4. Replace every character that is not a letter a-z/A-Z or a digit
           with a space (this also removes punctuation and newlines).
        5. Collapse runs of spaces into one space.

        text - Text piece to be cleaned.

        Returns the cleaned string.
        '''
        # 1. Website links
        text = re.sub(r'https?://\S+|www\.\S+', '', text)

        # 2. HTML tags
        text = BeautifulSoup(text, 'lxml').get_text()

        # 3. Emojis. Done before step 4 on purpose: these characters are
        #    deleted outright instead of being turned into a space.
        emoji_pattern = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)

        # 4. Special characters
        text = re.sub(r"[^a-zA-Z\d]", " ", text)
        # The former '\n+' and '\.+' substitutions were removed: after step 4
        # the text holds no newlines or periods, so they never matched, and
        # '\.+' in a non-raw literal is an invalid escape sequence.

        # 5. Extra spaces
        text = re.sub(r' +', ' ', text)

        return text
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        """Build the hand-crafted feature table for every summary.

        Args:
            prompts: Frame with at least ``prompt_id`` and ``prompt_text``.
                Modified in place: length, token and readability columns are added.
            summaries: Frame with at least ``prompt_id`` and ``text``.
                Modified in place: length, token, ``fixed_summary_text`` and
                ``splling_err_num`` columns are added.
            mode: Accepted but not read anywhere in this method.

        Returns:
            ``summaries`` left-joined with ``prompts`` on ``prompt_id`` plus
            the feature columns, without the token-list and intermediate
            dict columns.

        Side effects:
            Extends both spelling dictionaries with the prompt tokens, changes
            the global tqdm-pandas description, and prints the dropped columns.
        """
        
        # before merge preprocess
        # NOTE(review): each text is tokenized twice here (once for the length,
        # once for the token list).
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(word_tokenize(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(word_tokenize(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: word_tokenize(x)
        )
        
        # Add prompt tokens into spelling checker dictionary
        # (apply is used only for its side effect; the result is discarded)
        prompts["prompt_tokens"].apply(
            lambda x: self.add_spelling_dictionary(x)
        )
        
        # Readability scores of the prompt itself
        prompts['gunning_fog_prompt'] = prompts['prompt_text'].apply(self.gunning_fog)
        prompts['flesch_kincaid_grade_level_prompt'] = prompts['prompt_text'].apply(self.flesch_kincaid_grade_level)
        prompts['flesch_reading_ease_prompt'] = prompts['prompt_text'].apply(self.flesch_reading_ease_manual)

        
        # fix misspelling: clean the raw text first, then run autocorrect on it
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.text_cleaning(x)
        )
        summaries["fixed_summary_text"] = summaries["fixed_summary_text"].progress_apply(
            lambda x: self.speller(x)
        )
        
        
        # count misspelling (on the raw text, not the cleaned one); the column
        # name keeps its original spelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")
        input_df['flesch_reading_ease'] = input_df['text'].apply(self.flesch_reading_ease_manual)
        input_df['word_count'] = input_df['text'].apply(lambda x: len(x.split()))
        # Number of '.'-separated pieces, used as a sentence-count proxy
        input_df['sentence_length'] = input_df['text'].apply(lambda x: len(x.split('.')))
        input_df['vocabulary_richness'] = input_df['text'].apply(lambda x: len(set(x.split())))

        # Variants that split on a single space instead of any whitespace
        input_df['word_count2'] = [len(t.split(' ')) for t in input_df.text]
        input_df['num_unq_words']=[len(list(set(x.lower().split(' ')))) for x in input_df.text]
        input_df['num_chars']= [len(x) for x in input_df.text]

        # Additional features
        # NOTE(review): np.mean of an empty list is NaN, so a summary without
        # words gets NaN here.
        input_df['avg_word_length'] = input_df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]))
        input_df['comma_count'] = input_df['text'].apply(lambda x: x.count(','))
        input_df['semicolon_count'] = input_df['text'].apply(lambda x: x.count(';'))

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        # NOTE(review): the denominator is 0 for a one-token summary (and for a
        # two-token summary in the trigram ratio below).
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        input_df['exclamation_count'] = input_df['text'].apply(lambda x: x.count('!'))
        input_df['question_count'] = input_df['text'].apply(lambda x: x.count('?'))
        input_df['pos_ratios'] = input_df['text'].apply(self.calculate_pos_ratios)

        # Convert the dictionary of POS ratios into a single value (mean)
        input_df['pos_mean'] = input_df['pos_ratios'].apply(lambda x: np.mean(list(x.values())))
        input_df['punctuation_ratios'] = input_df['text'].apply(self.calculate_punctuation_ratios)

        # Convert the dictionary of punctuation ratios into a single value (sum)
        input_df['punctuation_sum'] = input_df['punctuation_ratios'].apply(lambda x: np.sum(list(x.values())))
        input_df['keyword_density'] = input_df.apply(self.calculate_keyword_density, axis=1)
        # Jaccard similarity of the two token sets.
        # NOTE(review): both texts are re-tokenized twice per row even though the
        # token columns already exist, and the division fails if both sets are empty.
        input_df['jaccard_similarity'] = input_df.apply(lambda row: len(set(word_tokenize(row['prompt_text'])) & set(word_tokenize(row['text']))) / len(set(word_tokenize(row['prompt_text'])) | set(word_tokenize(row['text']))), axis=1)
        tqdm.pandas(desc="Performing Sentiment Analysis")
        input_df[['sentiment_polarity', 'sentiment_subjectivity']] = input_df['text'].progress_apply(
            lambda x: pd.Series(self.sentiment_analysis(x))
        )
        tqdm.pandas(desc="Calculating Text Similarity")
        input_df['text_similarity'] = input_df.progress_apply(self.calculate_text_similarity, axis=1)
        #Calculate sentiment scores for each row
        input_df['sentiment_scores'] = input_df['text'].apply(self.calculate_sentiment_scores)
        
        input_df['gunning_fog'] = input_df['text'].apply(self.gunning_fog)
        input_df['flesch_kincaid_grade_level'] = input_df['text'].apply(self.flesch_kincaid_grade_level)
        input_df['count_difficult_words'] = input_df['text'].apply(self.count_difficult_words)

        # Convert sentiment_scores into individual columns
        # (concat along axis=1 aligns on the index of both frames)
        sentiment_columns = pd.DataFrame(list(input_df['sentiment_scores']))
        input_df = pd.concat([input_df, sentiment_columns], axis=1)
        input_df['sentiment_scores_prompt'] = input_df['prompt_text'].apply(self.calculate_sentiment_scores)
        # Convert sentiment_scores_prompt into individual columns
        sentiment_columns_prompt = pd.DataFrame(list(input_df['sentiment_scores_prompt']))
        sentiment_columns_prompt.columns = [col +'_prompt' for col in sentiment_columns_prompt.columns]
        input_df = pd.concat([input_df, sentiment_columns_prompt], axis=1)
        # Drop the intermediate dict-valued columns now that they are expanded
        columns =  ['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']
        cols_to_drop = [col for col in columns if col in input_df.columns]
        if cols_to_drop:
            input_df = input_df.drop(columns=cols_to_drop)
        
        print(cols_to_drop)
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    

In [None]:
# Instantiate the feature builder; this loads the tokenizer from
# input/<CFG.model_name>, the spaCy pipeline and both spelling tools.
preprocessor = Preprocessor2(model_name=CFG.model_name)

## Create the train and test sets


In [None]:
if CFG.test_mode:
    # Smoke-test mode: keep only the first 12 rows of each test frame
    # (the train frames are not loaded in this notebook).
    prompts_test, summaries_test = prompts_test.iloc[:12], summaries_test.iloc[:12]

In [None]:
# Build the feature table for the test set. Note that run() also adds
# columns to prompts_test and summaries_test in place.
# train = preprocessor.run(prompts_train, summaries_train, mode="train")
test = preprocessor.run(prompts_test, summaries_test, mode="test")
# save train and test 
# train.to_csv("input/train.csv", index=False)
# test.to_csv("input/test.csv", index=False)
# load train and test
# train = pd.read_csv("input/train.csv")
# test = pd.read_csv("input/test.csv")
# train.head()

100%|██████████| 4/4 [00:00<00:00, 1374.28it/s]
100%|██████████| 4/4 [00:00<00:00, 23399.19it/s]
100%|██████████| 4/4 [00:00<00:00, 29279.61it/s]
100%|██████████| 4/4 [00:00<00:00, 6269.51it/s]
100%|██████████| 4/4 [00:00<00:00, 10761.52it/s]
100%|██████████| 4/4 [00:00<00:00, 9597.95it/s]
100%|██████████| 4/4 [00:00<00:00, 8603.70it/s]
Performing Sentiment Analysis: 100%|██████████| 4/4 [00:00<00:00, 5023.12it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 785.23it/s]

['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']





In [None]:
# Group-aware splitter with CFG.n_splits folds; the fold assignment itself
# (grouped by prompt_id) is commented out below because train is not loaded.
gkf = GroupKFold(n_splits=CFG.n_splits)

# for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
#     train.loc[val_index, "fold"] = i

# train.head()

## Model Function Definition

In [None]:
def compute_metrics(eval_pred):
    """Trainer metric hook: mean of the per-column RMSEs.

    Computed with NumPy directly rather than
    ``mean_squared_error(..., squared=False)``, whose ``squared`` argument is
    deprecated/removed in recent scikit-learn releases.

    Args:
        eval_pred: ``(predictions, labels)`` pair of array-likes with one row
            per example; 1-D inputs are treated as a single column.

    Returns:
        ``{"rmse": value}`` where ``value`` is the RMSE of each target column
        averaged over the columns.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions, dtype=float)
    labels = np.asarray(labels, dtype=float)
    # Normalise to (n_samples, n_targets) so (n,) and (n, 1) inputs line up
    # instead of broadcasting against each other.
    predictions = predictions.reshape(len(predictions), -1)
    labels = labels.reshape(len(labels), -1)

    col_rmse = np.sqrt(np.mean((labels - predictions) ** 2, axis=0))
    rmse = col_rmse.mean()
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (MCRMSE).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    ``eval_pred`` is a ``(preds, labels)`` pair of arrays whose column 0 is
    reported as content and column 1 as wording.
    """
    preds, labels = eval_pred

    squared_error = (preds - labels) ** 2
    col_rmse = np.sqrt(np.mean(squared_error, axis=0))

    metrics = {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
    }
    metrics["mcrmse"] = np.mean(col_rmse)
    return metrics

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Return the average of the content RMSE and the wording RMSE."""
    target_pairs = ((content_true, content_pred), (wording_true, wording_pred))
    rmse_per_target = [
        mean_squared_error(truth, pred) ** (1/2)
        for truth, pred in target_pairs
    ]
    return sum(rmse_per_target) / 2

## Deberta Regressor

In [None]:
class ScoreRegressor:
    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: list,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        """Prepare tokenizer, model config and collator for a two-output regressor.

        Args:
            model_name: Directory name under ``input/`` with the base model files.
            model_dir: Directory the fine-tuned model is saved to / loaded from.
            target: Target column names; stored as ``target`` and ``target_cols``.
            hidden_dropout_prob: Written into the model config.
            attention_probs_dropout_prob: Written into the model config.
            max_length: Truncation length used when tokenizing.
        """
        # Source columns of the model input. train()/predict() currently
        # concatenate title, question and fixed summary only.
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = target

        self.model_name = model_name
        # (The unused local `lr`, derived from the global CFG, was removed.)
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(f"input/{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"input/{model_name}" )
        # Two regression outputs.
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 2,
            "problem_type": "regression",
        })
        # Re-seed so every regressor starts from the same RNG state.
        seed_everything(seed=42)

        # Pads each batch dynamically to its longest sequence.
        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )


    def tokenize_function(self, examples: pd.DataFrame):
        # labels = ['content' , 'wording']
        # print('labels', labels)
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": [examples['content'], examples['wording']],
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """Fine-tune the base model on one fold and save it to ``self.model_dir``.

        Args:
            fold: Fold number; checkpoints go to ``<model_dir>/<fold>``.
            train_df: Training rows with ``prompt_title``, ``prompt_question``,
                ``fixed_summary_text`` and the target columns. The input column
                is added to this frame in place.
            valid_df: Validation rows with the same columns; also modified in place.
            batch_size: Per-device batch size for training and evaluation.
            learning_rate: Learning rate passed to TrainingArguments.
            weight_decay: Weight decay passed to TrainingArguments.
            num_train_epochs: Number of epochs.
            save_steps: Interval (in steps) for both evaluation and checkpointing.

        Side effects:
            Writes checkpoints under ``<model_dir>/<fold>`` and the final model
            and tokenizer under ``self.model_dir``; prints progress messages.
        """
        
        sep = self.tokenizer.sep_token
        # Model input = title <sep> question <sep> spell-fixed summary
        train_df[self.input_col] = (
                    train_df["prompt_title"] + sep 
                    + train_df["prompt_question"] + sep 
                    + train_df["fixed_summary_text"]
                  )

        valid_df[self.input_col] = (
                    valid_df["prompt_title"] + sep 
                    + valid_df["prompt_question"] + sep 
                    + valid_df["fixed_summary_text"]
                  )
        # No length-based row filtering happens here; long inputs are truncated
        # to max_length by tokenize_function.
        # Keep only the model input and the target columns.
        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"input/{self.model_name}", 
            config=self.model_config
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
        # NOTE(review): tokenize_function is called with batched=True, so it
        # must return one label row per example.
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=True)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=True)
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,  # lower rmse is better
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",  # key returned by compute_metrics
            save_total_limit=1
        )
        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )
        print('start training')
        trainer.train()
        print('finish training')
        # Persist the model and tokenizer so predict() can load them from model_dir.
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """Predict both regression outputs for ``test_df`` with the model in ``self.model_dir``.

        The model input (title <sep> question <sep> fixed summary) is written
        into ``test_df[self.input_col]``, so the caller's frame is modified.

        Args:
            test_df: Rows with ``prompt_title``, ``prompt_question`` and
                ``fixed_summary_text``.
            fold: Fold number, used only to build the Trainer output directory.

        Returns:
            The predictions returned by ``Trainer.predict`` (first element of
            its result).
        """
        sep = self.tokenizer.sep_token
        test_df[self.input_col] = (
            test_df["prompt_title"] + sep
            + test_df["prompt_question"] + sep
            + test_df["fixed_summary_text"]
        )

        # Tokenize just the input column.
        input_frame = test_df[[self.input_col]]
        raw_dataset = Dataset.from_pandas(input_frame, preserve_index=False)
        tokenized_dataset = raw_dataset.map(self.tokenize_function_test, batched=True)

        # Load the fine-tuned weights and switch to inference mode.
        model = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model.eval()

        # eg. "bert/fold_0/"
        inference_args = TrainingArguments(
            output_dir=os.path.join(self.model_dir, str(fold)),
            do_train=False,
            do_predict=True,
            per_device_eval_batch_size=CFG.batch_size,
            dataloader_drop_last=False,
        )

        inference_trainer = Trainer(
            model=model,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            args=inference_args,
        )

        prediction_output = inference_trainer.predict(tokenized_dataset)
        return prediction_output[0]

## Train by fold function


In [None]:
# def validate(
#     train_df: pd.DataFrame,
#     target:str,
#     save_each_model: bool,
#     model_name: str,
#     model_dir_base: str,
#     hidden_dropout_prob: float,
#     attention_probs_dropout_prob: float,
#     max_length : int
#     ) -> pd.DataFrame:
#     """predict oof data"""
#     for fold in range(CFG.n_splits):
#         # print(f"fold {fold}:")
        
#         valid_data = train_df[train_df["fold"] == fold]
        
#         if save_each_model == True:
#             model_dir =  f"{target}/{model_dir_base}/fold_{fold}"
#         else: 
#             model_dir =  f"{model_dir_base}/fold_{fold}"
#         csr = ScoreRegressor(
#             model_name=model_name,
#             target=target,
#             model_dir = model_dir,
#             hidden_dropout_prob=hidden_dropout_prob,
#             attention_probs_dropout_prob=attention_probs_dropout_prob,
#             max_length=max_length,
#            )
        
#         pred = csr.predict(
#             test_df=valid_data, 
#             fold=fold
#         )
#         # print('pred shape', pred.shape)
#         train_df.loc[valid_data.index, f"wording_pred"] = pred[:,0]
#         train_df.loc[valid_data.index, f"content_pred"] = pred[:,1]

#     return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    model_dir_base: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """Score ``test_df`` with every fold's regressor and average the folds.

    For each fold in ``range(CFG.n_splits)`` a ``ScoreRegressor`` is built for
    that fold's model directory and asked to predict on ``test_df``. Column 0
    of the returned array is stored as ``wording_pred_<fold>`` and column 1 as
    ``content_pred_<fold>``; the per-fold columns are then averaged row-wise
    into ``wording_pred`` and ``content_pred``.

    Args:
        test_df: Frame to score. It is modified in place (new columns added).
        target: Forwarded to ``ScoreRegressor``; also used as the leading
            directory component when ``save_each_model`` is true.
        save_each_model: Selects the ``<target>/<base>/fold_<k>`` layout
            instead of ``<base>/fold_<k>``.
        model_name: Forwarded to ``ScoreRegressor``.
        model_dir_base: Directory that holds the ``fold_<k>`` sub-directories.
        hidden_dropout_prob: Forwarded to ``ScoreRegressor``.
        attention_probs_dropout_prob: Forwarded to ``ScoreRegressor``.
        max_length: Forwarded to ``ScoreRegressor``.

    Returns:
        The same ``test_df`` object, with the prediction columns added.
    """
    fold_ids = range(CFG.n_splits)

    for fold in fold_ids:
        fold_subdir = f"{model_dir_base}/fold_{fold}"
        # Per-target checkpoints live one level deeper, under the target name.
        model_dir = f"{target}/{fold_subdir}" if save_each_model == True else fold_subdir

        regressor = ScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )
        fold_pred = regressor.predict(test_df=test_df, fold=fold)

        test_df[f"wording_pred_{fold}"] = fold_pred[:, 0]
        test_df[f"content_pred_{fold}"] = fold_pred[:, 1]

    # Collapse the per-fold columns into one averaged column per score.
    for score in ("wording", "content"):
        per_fold_columns = [f"{score}_pred_{fold}" for fold in fold_ids]
        test_df[f"{score}_pred"] = test_df[per_fold_columns].mean(axis=1)
    return test_df
targets = ["content", "wording"]


## Infer



In [None]:
# # ensembling_results_val  = pd.DataFrame()
# ensembling_results_test = pd.DataFrame()
# dem = 0
# if CFG.infer_mode:
#     for model_dir in CFG.list_model_infer:
#         print("percent of model", dem/CFG.number_base_model)
#         print(model_dir)    
#         dem = dem +1 
#         if dem >= CFG.number_base_model:
#             CFG.batch_size = 16
#             CFG.max_length = 1462
#             CFG.model_name = "debertav3large"
#         # train = validate(
#         #     train,
#         #     target=targets,
#         #     save_each_model=False,
#         #     model_name=CFG.model_name,
#         #     model_dir_base = model_dir,
#         #     hidden_dropout_prob=CFG.hidden_dropout_prob,
#         #     attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
#         #     max_length=CFG.max_length
#         # )
#         # for target in targets:
#         #     rmse = mean_squared_error(train[target], train[f"{target}_pred"], squared=False)
#         #     print(f"cv {target} rmse: {rmse}")
#         print('done validate')
#         test = predict(
#             test,
#             target=targets,
#             save_each_model=False,
#             model_name=CFG.model_name,
#             model_dir_base = model_dir,
#             hidden_dropout_prob=CFG.hidden_dropout_prob,
#             attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
#             max_length=CFG.max_length
#         )
#         print('done predict')
#         # add wording_pred and content_pred to ensembling_results
#         # ensembling_results_val[f"{model_dir}_wording_pred"] = train["wording_pred"]
#         # ensembling_results_val[f"{model_dir}_content_pred"] = train["content_pred"]
#         ensembling_results_test[f"{model_dir}_wording_pred"] = test["wording_pred"]
#         ensembling_results_test[f"{model_dir}_content_pred"] = test["content_pred"]
#         # print('ensembling_results_val \n', ensembling_results_val.head() )
        

## LGBM model

In [None]:

# Saving the per-model prediction tables is disabled; the two `to_csv` calls
# are kept (commented out) as a record of how the CSV files were written.
# ensembling_results_val.to_csv("ensembling_results_val.csv", index=False)
# ensembling_results_test.to_csv("ensembling_results_test.csv", index=False)
# Reload the cached per-model predictions from disk. The later cells index this
# frame by "<model>_wording_pred" / "<model>_content_pred" column names.
ensembling_results_val = pd.read_csv("ensembling_results_val.csv")
# The test-side table is not loaded in this cell.
# ensembling_results_test = pd.read_csv("ensembling_results_test.csv")

In [None]:
# Display the loaded frame (the rendered output is truncated by pandas).
ensembling_results_val

Unnamed: 0,upload_model/debertav3base_lr5e-05_wording_pred,upload_model/debertav3base_lr5e-05_content_pred,upload_model/debertav3base_lr15e-05_wording_pred,upload_model/debertav3base_lr15e-05_content_pred,upload_model/debertav3base_lr17e-05_wording_pred,upload_model/debertav3base_lr17e-05_content_pred,upload_model/debertav3base_lr18e-05_wording_pred,upload_model/debertav3base_lr18e-05_content_pred,upload_model/debertav3base_lr21e-05_wording_pred,upload_model/debertav3base_lr21e-05_content_pred,...,debertav3large_lr14e-05_att_0007_wording_pred,debertav3large_lr14e-05_att_0007_content_pred,debertav3large_lr15e-05_att_0007_wording_pred,debertav3large_lr15e-05_att_0007_content_pred,debertav3large_lr16e-05_att_0007_wording_pred,debertav3large_lr16e-05_att_0007_content_pred,debertav3large_lr17e-05_att_0007_wording_pred,debertav3large_lr17e-05_att_0007_content_pred,debertav3large_lr18e-05_att_0007_wording_pred,debertav3large_lr18e-05_att_0007_content_pred
0,0.289577,0.754494,0.255495,0.757381,0.253911,0.798586,0.238143,0.805690,0.226564,0.868932,...,0.050072,0.570128,-0.130428,0.636849,0.146617,0.691859,-0.035948,0.880852,-0.025384,0.916015
1,-0.702410,-0.433037,-0.801842,-0.464458,-0.772340,-0.413345,-0.665685,-0.327464,-0.641115,-0.371242,...,-0.851928,-0.071699,-1.015396,-0.227899,-0.790502,-0.701932,-0.893634,-0.387197,-1.166269,-0.696791
2,2.476874,1.968442,2.216656,2.133213,2.264356,2.114351,2.413981,2.368245,2.114522,2.113158,...,2.413287,2.190674,2.293101,2.366125,1.455727,0.879681,2.042683,2.011548,2.101691,2.138166
3,-1.046302,-0.884815,-1.117746,-0.960289,-1.006900,-0.931899,-0.985633,-0.782978,-1.084898,-0.928009,...,-1.108393,-0.970286,-1.204046,-1.049266,-0.896619,-0.799949,-1.103294,-0.969418,-1.277837,-1.192987
4,2.218238,1.984394,2.125614,2.388381,2.147007,2.516407,2.084431,2.448008,2.143604,2.350667,...,2.187430,2.335758,2.327338,2.474953,2.385828,2.251376,1.824292,2.288516,2.421074,2.707077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7160,-0.084784,-0.080262,-0.192514,-0.029484,-0.218292,-0.110078,-0.208576,-0.138276,-0.080752,-0.022244,...,-0.128036,0.122814,-0.145221,-0.106024,-0.461274,-0.669521,-0.157177,0.020587,-0.237773,-0.200248
7161,-0.433700,-0.205239,-0.335272,-0.031181,-0.366315,-0.163821,-0.431232,-0.058000,-0.449126,-0.166257,...,-0.467294,-0.368578,-0.369047,-0.174953,-0.417107,-0.321235,-0.416164,-0.077031,-0.374409,-0.275327
7162,-1.026422,-1.043301,-0.790768,-0.539467,-0.878920,-0.593079,-1.000468,-0.721421,-0.949790,-0.683982,...,-0.897805,-0.603563,-1.005552,-0.796761,-0.633984,-0.617932,-1.140137,-0.791222,-0.894914,-0.417379
7163,-0.099472,0.458754,0.150375,0.407218,0.054952,0.478801,-0.008228,0.426154,0.015571,0.472524,...,-0.040969,0.603768,0.022594,0.560333,0.020776,0.301249,-0.103226,0.218670,-0.003400,0.486490


In [None]:
# replace upload_model/ with ""
# ensembling_results_val = ensembling_results_val.rename(columns=lambda x: x.replace("upload_model/", ""))

In [None]:
# Preview the first rows of the per-model prediction table.
ensembling_results_val.head()

Unnamed: 0,upload_model/debertav3base_lr5e-05_wording_pred,upload_model/debertav3base_lr5e-05_content_pred,upload_model/debertav3base_lr15e-05_wording_pred,upload_model/debertav3base_lr15e-05_content_pred,upload_model/debertav3base_lr17e-05_wording_pred,upload_model/debertav3base_lr17e-05_content_pred,upload_model/debertav3base_lr18e-05_wording_pred,upload_model/debertav3base_lr18e-05_content_pred,upload_model/debertav3base_lr21e-05_wording_pred,upload_model/debertav3base_lr21e-05_content_pred,...,debertav3large_lr14e-05_att_0007_wording_pred,debertav3large_lr14e-05_att_0007_content_pred,debertav3large_lr15e-05_att_0007_wording_pred,debertav3large_lr15e-05_att_0007_content_pred,debertav3large_lr16e-05_att_0007_wording_pred,debertav3large_lr16e-05_att_0007_content_pred,debertav3large_lr17e-05_att_0007_wording_pred,debertav3large_lr17e-05_att_0007_content_pred,debertav3large_lr18e-05_att_0007_wording_pred,debertav3large_lr18e-05_att_0007_content_pred
0,0.289577,0.754494,0.255495,0.757381,0.253911,0.798586,0.238143,0.80569,0.226564,0.868932,...,0.050072,0.570128,-0.130428,0.636849,0.146617,0.691859,-0.035948,0.880852,-0.025384,0.916015
1,-0.70241,-0.433037,-0.801842,-0.464458,-0.77234,-0.413345,-0.665685,-0.327464,-0.641115,-0.371242,...,-0.851928,-0.071699,-1.015396,-0.227899,-0.790502,-0.701932,-0.893634,-0.387197,-1.166269,-0.696791
2,2.476874,1.968442,2.216656,2.133213,2.264356,2.114351,2.413981,2.368245,2.114522,2.113158,...,2.413287,2.190674,2.293101,2.366125,1.455727,0.879681,2.042683,2.011548,2.101691,2.138166
3,-1.046302,-0.884815,-1.117746,-0.960289,-1.0069,-0.931899,-0.985633,-0.782978,-1.084898,-0.928009,...,-1.108393,-0.970286,-1.204046,-1.049266,-0.896619,-0.799949,-1.103294,-0.969418,-1.277837,-1.192987
4,2.218238,1.984394,2.125614,2.388381,2.147007,2.516407,2.084431,2.448008,2.143604,2.350667,...,2.18743,2.335758,2.327338,2.474953,2.385828,2.251376,1.824292,2.288516,2.421074,2.707077


## Find the best weights with Optuna

In [None]:
# CFG.list_model_infer = [
#         # 'upload_model/debertav3base_lr15e-05',
#         'upload_model/debertav3base_lr17e-05', #keep
#         'upload_model/debertav3base_lr18e-05', #keep
#         'upload_model/debertav3base_lr21e-05', # keep
#         'upload_model/debertav3base_lr22e-05', #keep 
#         # 'upload_model/debertav3base_lr5e-05', 
#         'upload_model/debertav3large_lr12e-05', # upload
#         'upload_model/debertav3large_lr13e-05',  # upload
#         # 'debertav3large_lr1e-05_save',
#         # 'debertav3large_lr1e-05_att_0007',
#         'debertav3large_lr8e-06_att_0007', # upload 
#         'debertav3large_lr9e-06_att_0007', # upload 
#         'debertav3large_lr11e-05_att_0007',
#         'debertav3large_lr12e-05_att_0007',
#         'debertav3large_lr13e-05_att_0007',
#         'debertav3large_lr14e-05_att_0007',
#         'debertav3large_lr15e-05_att_0007', # upload 
#         'debertav3large_lr16e-05_att_0007', # upload 
#         'debertav3large_lr17e-05_att_0007', # upload 
#         'debertav3large_lr18e-05_att_0007', # upload 
#         ]
# print(len(CFG.list_model_infer))

In [None]:
# weight_for_model = {}
# scale = 0 
# for model in CFG.list_model_infer:
#     results = ensembling_results_val[[f"{model}_wording_pred", f"{model}_content_pred"]]
#     mcrmse = compute_mcrmse((results.values, train[targets].values))
#     print(f"{model} mcrmse: {mcrmse['mcrmse']}")
#     # weight_for_model[model] = 1 / mcrmse["mcrmse"]
#     # scale = scale + weight_for_model[model]
    

In [None]:
# Blending weight for each base model, keyed by the model's directory name.
weight_for_model = {
    # DeBERTa-v3-base checkpoints
    "upload_model/debertav3base_lr17e-05": 0.212909408976402,
    "upload_model/debertav3base_lr18e-05": 0.1421852032796326,
    "upload_model/debertav3base_lr21e-05": 0.8953672127134387,
    "upload_model/debertav3base_lr22e-05": 0.36047950541959095,
    # DeBERTa-v3-large checkpoints
    "upload_model/debertav3large_lr12e-05": 0.9413883882391965,
    "upload_model/debertav3large_lr13e-05": 0.48278638692252185,
    "debertav3large_lr8e-06_att_0007": 0.30564935544213445,
    "debertav3large_lr9e-06_att_0007": 0.40560485262757806,
    "debertav3large_lr11e-05_att_0007": 0.882846667614644,
    "debertav3large_lr12e-05_att_0007": 0.09507155361092456,
    "debertav3large_lr13e-05_att_0007": 0.6241475562582361,
    "debertav3large_lr14e-05_att_0007": 0.03402452758512511,
    "debertav3large_lr15e-05_att_0007": 0.11628723374671882,
    "debertav3large_lr16e-05_att_0007": 0.0001281844830617676,
    "debertav3large_lr17e-05_att_0007": 0.969431755664122,
    "debertav3large_lr18e-05_att_0007": 0.318050401669718,
}
print(len(weight_for_model))

# Sum of all weights; later cells divide the weighted sum of predictions by it.
scale = np.sum(list(weight_for_model.values()))
print("scale", scale)

16
scale 6.7863581942530455


In [None]:
# Load the training table. Later cells read its "fold" column and the
# "content" / "wording" target columns, so the CSV must already contain them.
train = pd.read_csv("input/train.csv")

In [None]:
# Start from a uniform blend: every model in the inference list gets weight 1.
for model in CFG.list_model_infer:
    weight_for_model[model] = 1
# Normalise by the weights that are actually blended. Summing every entry of
# `weight_for_model` would inflate the denominator whenever the dict holds a
# model that is not in `CFG.list_model_infer`; `objective` uses this same form.
scale = np.sum([weight_for_model[model] for model in CFG.list_model_infer])

In [None]:
# Weighted average of the per-model predictions, written into `train` as the
# blended "wording_pred" and "content_pred" feature columns.
for pred_col in ("wording_pred", "content_pred"):
    weighted = [
        ensembling_results_val[f"{model}_{pred_col}"] * weight_for_model[model]
        for model in CFG.list_model_infer
    ]
    train[pred_col] = np.sum(weighted, axis=0) / scale
# test[f"wording_pred"] = np.sum([ensembling_results_test[f"{model}_wording_pred"] * weight_for_model[model] for model in CFG.list_model_infer], axis=0) / scale
# test[f"content_pred"] = np.sum([ensembling_results_test[f"{model}_content_pred"] * weight_for_model[model] for model in CFG.list_model_infer], axis=0) / scale


In [None]:
# save train and test
# train.to_csv("input/train.csv", index=False)
# load train and test
# test = pd.read_csv("input/test.csv")

In [None]:
# train.head()

In [None]:
# Regression targets; the LightGBM cells train one set of fold models per target.
targets = ["content", "wording"]

# Columns removed from `train` before it is used as a LightGBM feature matrix:
# the fold id, identifiers, raw text fields and the targets themselves.
drop_columns = [
    "fold",
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    *targets,
]

In [None]:
def create_model_dict(targets,train):
    """Train one LightGBM regressor per (target, fold) pair.

    For every target and every fold id in ``range(CFG.n_splits)``, the rows
    whose ``fold`` value differs from the fold id form the training set and the
    rows whose ``fold`` equals it form the validation set used for early
    stopping. Features are all columns of ``train`` except those listed in the
    module-level ``drop_columns``.

    Args:
        targets: Iterable of target column names present in ``train``.
        train: Frame containing a ``fold`` column, the target columns and
            every column named in ``drop_columns``.

    Returns:
        Dict mapping each target name to a list of trained boosters, where the
        list position is the held-out fold id.
    """
    # Hyper-parameters are identical for every fit; each lgb.train call gets
    # its own copy so no call can affect another through a shared dict.
    base_params = {
        'boosting_type': 'gbdt',
        'random_state': 42,
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.048,
        'max_depth': 3,
        'lambda_l1': 0.0,
        'lambda_l2': 0.011,
        'verbose': -1,
    }

    model_dict = {}
    for target in targets:
        models = []

        for fold in range(CFG.n_splits):
            in_eval_fold = train["fold"] == fold
            train_rows = train[~in_eval_fold]
            eval_rows = train[in_eval_fold]

            dtrain = lgb.Dataset(train_rows.drop(columns=drop_columns), label=train_rows[target])
            dval = lgb.Dataset(eval_rows.drop(columns=drop_columns), label=eval_rows[target])

            # Only the held-out fold is evaluated, so exactly one name is given.
            # The previous two-name list labelled this validation set 'train'.
            model = lgb.train(
                dict(base_params),
                train_set=dtrain,
                num_boost_round=10000,
                valid_sets=[dval],
                valid_names=['valid'],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=70, verbose=False),
                ],
            )
            models.append(model)

        model_dict[target] = models
    return model_dict
model_dict = create_model_dict(targets,train)


## CV Score

In [None]:
# cv
# import optuna
def cal_mcrmse(model_dict, targets):
    """Return the mean over targets of the RMSE on each model's held-out fold.

    For each target, the model at position ``k`` of ``model_dict[target]``
    predicts the rows of the module-level ``train`` frame whose ``fold`` equals
    ``k`` (features are ``train`` minus ``drop_columns``). Predictions and true
    values from all folds are pooled, one RMSE is computed per target, and the
    per-target RMSEs are averaged.

    Args:
        model_dict: Mapping of target name to a list of fitted models, indexed
            by fold id.
        targets: Target column names to evaluate.

    Returns:
        The average of the per-target RMSE values.
    """
    per_target_rmse = []
    for target in targets:
        y_true, y_pred = [], []

        for fold, booster in enumerate(model_dict[target]):
            held_out = train[train["fold"] == fold]
            y_pred.extend(booster.predict(held_out.drop(columns=drop_columns)))
            y_true.extend(held_out[target])

        per_target_rmse.append(np.sqrt(mean_squared_error(y_true, y_pred)))
    return sum(per_target_rmse) / len(per_target_rmse)
mcrmse = cal_mcrmse(model_dict, targets)
print(f"mcrmse: {mcrmse}")

mcrmse: 1.0641848929195725


## Optuna

In [None]:
import optuna
def objective(trial):
    """Optuna objective: MCRMSE of the LightGBM stack for one set of blend weights.

    One weight in ``[-0.5, 1.0]`` is suggested per model in
    ``CFG.list_model_infer``. The per-model predictions in
    ``ensembling_results_val`` are combined into a weighted sum divided by the
    sum of the weights, and written into the module-level ``train`` frame as
    ``wording_pred`` / ``content_pred`` (overwriting any previous values). The
    LightGBM models are then refit and scored with ``cal_mcrmse``.

    Args:
        trial: The Optuna trial used to suggest the weights.

    Returns:
        The MCRMSE obtained with the suggested weights (lower is better).
    """
    model_names = CFG.list_model_infer
    trial_weights = {name: trial.suggest_float(name, -0.5, 1.0) for name in model_names}
    scale = np.sum([trial_weights[name] for name in model_names])

    for score in ("wording", "content"):
        weighted = [
            ensembling_results_val[f"{name}_{score}_pred"] * trial_weights[name]
            for name in model_names
        ]
        train[f"{score}_pred"] = np.sum(weighted, axis=0) / scale

    fold_models = create_model_dict(targets,train)
    lost = cal_mcrmse(fold_models, targets)
    print(f"mcrmse: {lost}")
    return lost

In [None]:
# Seed the sampler so the weight search is repeatable across kernel restarts.
# TPE is also what `create_study` uses by default for a single objective, so
# only the seeding changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=CFG.random_seed),
)
study.optimize(objective, n_trials=500)

mcrmse: 0.4842790883422752
mcrmse: 0.48193215160166303
mcrmse: 0.4882895789147218
mcrmse: 0.48268891130391006
mcrmse: 0.5065932850412581
mcrmse: 0.4885858896374236
mcrmse: 0.48480572815055534
mcrmse: 0.49032664563749934
mcrmse: 0.4823207042880988
mcrmse: 0.49948450434247676
mcrmse: 0.4841628917664917
mcrmse: 0.48341097959343043
mcrmse: 0.48673718746294226
mcrmse: 0.4829622375030531
mcrmse: 0.4820785899612382
mcrmse: 0.48156687710072676
mcrmse: 0.4836273295673663
mcrmse: 0.4794385294815756
mcrmse: 0.4802068429308679
mcrmse: 0.48025715570661986
mcrmse: 0.48046003812444016
mcrmse: 0.4801421647134472
mcrmse: 0.4802825342619865
mcrmse: 0.47932986118615
mcrmse: 0.48054872612151317
mcrmse: 0.4792728334449834
mcrmse: 0.47886721433626933
mcrmse: 0.47893867061690254
mcrmse: 0.4804847763696122
mcrmse: 0.4794998538312514
mcrmse: 0.480440017151915
mcrmse: 0.4794425253592045
mcrmse: 0.48012817063668956
mcrmse: 0.48063366520338335
mcrmse: 0.4793793351820571
mcrmse: 0.48027276279550557
mcrmse: 0.48113

In [None]:

# Report the best trial found by the study and its objective value.
print('Best trial:')
trial_ = study.best_trial

print('Value: ', trial_.value)

# Copy the best trial's suggested weight for each model into `weight_for_model`.
# NOTE(review): this cell does not recompute `scale`, the blended
# `train["wording_pred"]` / `train["content_pred"]` columns or `model_dict`
# with these weights - confirm that a later step does so before predicting.
for model in CFG.list_model_infer:
    weight_for_model[model] = trial_.params[model]
weight_for_model


Best trial:
Value:  0.4764391446022248


{'upload_model/debertav3base_lr17e-05': 0.050140453759609765,
 'upload_model/debertav3base_lr18e-05': 0.07052951309192787,
 'upload_model/debertav3base_lr21e-05': -0.2737032505249057,
 'upload_model/debertav3base_lr22e-05': 0.7323608145720258,
 'upload_model/debertav3large_lr12e-05': 0.17739633753942907,
 'upload_model/debertav3large_lr13e-05': 0.3487763171688307,
 'debertav3large_lr8e-06_att_0007': 0.40208654403767585,
 'debertav3large_lr9e-06_att_0007': 0.6324542274221961,
 'debertav3large_lr11e-05_att_0007': 0.5861786287038867,
 'debertav3large_lr12e-05_att_0007': -0.2614732225436932,
 'debertav3large_lr13e-05_att_0007': 0.3223536795999506,
 'debertav3large_lr14e-05_att_0007': -0.05707090804260281,
 'debertav3large_lr15e-05_att_0007': 0.30642565495970764,
 'debertav3large_lr16e-05_att_0007': -0.025179008462274423,
 'debertav3large_lr17e-05_att_0007': 0.28841239875231883,
 'debertav3large_lr18e-05_att_0007': 0.15050522437804395}

In [None]:
# Same assignment as the loop in the preceding cell: write the best trial's
# weights into `weight_for_model`, then display the dict.
for model in CFG.list_model_infer:
    weight_for_model[model] = trial_.params[model]
weight_for_model

{'upload_model/debertav3base_lr17e-05': 0.050140453759609765,
 'upload_model/debertav3base_lr18e-05': 0.07052951309192787,
 'upload_model/debertav3base_lr21e-05': -0.2737032505249057,
 'upload_model/debertav3base_lr22e-05': 0.7323608145720258,
 'upload_model/debertav3large_lr12e-05': 0.17739633753942907,
 'upload_model/debertav3large_lr13e-05': 0.3487763171688307,
 'debertav3large_lr8e-06_att_0007': 0.40208654403767585,
 'debertav3large_lr9e-06_att_0007': 0.6324542274221961,
 'debertav3large_lr11e-05_att_0007': 0.5861786287038867,
 'debertav3large_lr12e-05_att_0007': -0.2614732225436932,
 'debertav3large_lr13e-05_att_0007': 0.3223536795999506,
 'debertav3large_lr14e-05_att_0007': -0.05707090804260281,
 'debertav3large_lr15e-05_att_0007': 0.30642565495970764,
 'debertav3large_lr16e-05_att_0007': -0.025179008462274423,
 'debertav3large_lr17e-05_att_0007': 0.28841239875231883,
 'debertav3large_lr18e-05_att_0007': 0.15050522437804395}

## Predict

In [None]:
# Best trial:
# Value:  0.4764391446022248
# Params: 
#     upload_model/debertav3base_lr17e-05: 0.050140453759609765
#     upload_model/debertav3base_lr18e-05: 0.07052951309192787
#     upload_model/debertav3base_lr21e-05: -0.2737032505249057
#     upload_model/debertav3base_lr22e-05: 0.7323608145720258
#     upload_model/debertav3large_lr12e-05: 0.17739633753942907
#     upload_model/debertav3large_lr13e-05: 0.3487763171688307
#     debertav3large_lr8e-06_att_0007: 0.40208654403767585
#     debertav3large_lr9e-06_att_0007: 0.6324542274221961
#     debertav3large_lr11e-05_att_0007: 0.5861786287038867
#     debertav3large_lr12e-05_att_0007: -0.2614732225436932
#     debertav3large_lr13e-05_att_0007: 0.3223536795999506
#     debertav3large_lr14e-05_att_0007: -0.05707090804260281
#     debertav3large_lr15e-05_att_0007: 0.30642565495970764
#     debertav3large_lr16e-05_att_0007: -0.025179008462274423
#     debertav3large_lr17e-05_att_0007: 0.28841239875231883
#     debertav3large_lr18e-05_att_0007: 0.15050522437804395

In [None]:
# Columns removed from `test` before it is passed to the LightGBM models.
# "fold" is intentionally not listed here (it was commented out of this list).
drop_columns_2 = [
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "input",
    # Per-fold prediction columns, one pair per fold id.
    *(f"content_pred_{fold}" for fold in range(CFG.n_splits)),
    *(f"wording_pred_{fold}" for fold in range(CFG.n_splits)),
]

In [None]:
# For each target, collect one prediction array per fold model on `test`.
pred_dict = {}
for target in targets:
    fold_predictions = []
    for booster in model_dict[target]:
        features = test.drop(columns=drop_columns_2)
        fold_predictions.append(booster.predict(features))
    pred_dict[target] = fold_predictions

In [None]:
# Store each fold model's prediction as "<target>_pred_<fold>" and set the final
# "<target>" column to the row-wise mean over the CFG.n_splits fold columns.
for target in targets:
    for fold_idx, fold_pred in enumerate(pred_dict[target]):
        test[f"{target}_pred_{fold_idx}"] = fold_pred

    fold_columns = [f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]
    test[target] = test[fold_columns].mean(axis=1)

In [None]:
# Display `test`, which now carries the final "content" and "wording" columns.
test

Unnamed: 0,student_id,prompt_id,text,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,...,wording_pred_1,content_pred_1,wording_pred_2,content_pred_2,wording_pred_3,content_pred_3,wording_pred,content_pred,content,wording
0,000000ffffff,abc123,Example text 1,3,Example text 1,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.251299,-1.578506,-1.230973,-1.311372,-1.472722,-1.607912,-1.539744,-1.335907,-1.517739,-1.354057
1,111111eeeeee,def789,Example text 2,3,Example text 2,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.251299,-1.578506,-1.230973,-1.311372,-1.472722,-1.607912,-1.542108,-1.33661,-1.517739,-1.354057
2,222222cccccc,abc123,Example text 3,3,Example text 3,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.251299,-1.578506,-1.230973,-1.311372,-1.472722,-1.607912,-1.541584,-1.344446,-1.517739,-1.354057
3,333333dddddd,def789,Example text 4,3,Example text 4,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.251299,-1.578506,-1.230973,-1.311372,-1.472722,-1.607912,-1.544137,-1.341062,-1.517739,-1.354057


## Create Submission file

In [None]:
# Show the expected submission layout (student_id, content, wording).
# NOTE(review): `sample_submission` is not assigned in the cells visible here -
# confirm it is loaded earlier in the notebook.
sample_submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,0.0,0.0
1,111111eeeeee,0.0,0.0
2,222222cccccc,0.0,0.0
3,333333dddddd,0.0,0.0


In [None]:
# Write the submission file with only the id and the two score columns,
# without the DataFrame index.
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)

## Summary

The CV results are summarized in the table below.

| | content rmse |wording rmse | mcrmse | LB| |
| -- | -- | -- | -- | -- | -- |
|baseline| 0.494 | 0.630 | 0.562 | 0.509 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-baseline-content-and-wording-models)|
| use title and question field | 0.476| 0.619 | 0.548 | 0.508 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-w-prompt-title-question-fields) |
| Debertav3 + LGBM | 0.451 | 0.591 | 0.521 | 0.461 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-lgbm-with-feature-engineering) |
| Debertav3 + LGBM with spell autocorrect | 0.448 | 0.581 | 0.514 | 0.459 | nogawanogawa's original code |
| Debertav3 + LGBM with spell autocorrect and tuning | 0.442 | 0.566 | 0.504 | 0.453 | this notebook |

The CV values improved slightly, and the LB score improved as well.