In [328]:
# !pip install ".inputautocorrect/autocorrect-2.6.1.tar"
# !pip install ".inputpyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"


In [329]:
# nltk.download("punkt")

In [330]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

In [331]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True # load seed
    
seed_everything(seed=42)

## Class CFG

In [332]:
class CFG:
    model_name="debertav3base"
    learning_rate=0.000016
    weight_decay=0.03
    hidden_dropout_prob=0.007
    attention_probs_dropout_prob=0.007
    num_train_epochs=5
    n_splits=4
    batch_size= 128
    random_seed=42
    save_steps=100
    max_length= 512
    number_base_model = 2
    test_mode = False
    device = 'CPU'
    infer_mode = True
    list_model_infer = [
        'upload_model/debertav3base_lr5e-05',
        'upload_model/debertav3base_lr15e-05',
        'upload_model/debertav3base_lr17e-05',
        'upload_model/debertav3base_lr18e-05',
        'upload_model/debertav3base_lr21e-05',
        'upload_model/debertav3base_lr22e-05',
        'upload_model/debertav3large_lr13e-05',
        'upload_model/debertav3large_lr12e-05',
        'debertav3large_lr1e-05_att_0007',
        'debertav3large_lr1e-05_save',
        'debertav3large_lr8e-06_att_0007',
        'debertav3large_lr9e-06_att_0007',
        'debertav3large_lr11e-05_att_0007',
        'debertav3large_lr12e-05_att_0007',
        'debertav3large_lr13e-05_att_0007',
        'debertav3large_lr14e-05_att_0007',
        'debertav3large_lr15e-05_att_0007',
        'debertav3large_lr16e-05_att_0007',
        'debertav3large_lr17e-05_att_0007',
        'debertav3large_lr18e-05_att_0007',
        ]
    

In [333]:
# print device
if CFG.device != 'CPU':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # print device 
else :
    device = torch.device("cpu")
print(device)

cpu


## Dataload

In [334]:
DATA_DIR = "input/commonlit-evaluate-student-summaries/"

prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

## Exploratory Data Analysis

In [335]:
prompts_train.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


## Preprocess

[Using features]

- Text Length
- Length Ratio
- Word Overlap
- N-grams Co-occurrence
  - count
  - ratio
- Quotes Overlap
- Grammar Check
  - spelling: pyspellchecker


In [336]:
class Preprocessor:
    def __init__(self, 
                model_name: str,
                ) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(f"input/{model_name}")
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(word_tokenize(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(word_tokenize(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: word_tokenize(x)
        )
        
        # Add prompt tokens into spelling checker dictionary
        prompts["prompt_tokens"].apply(
            lambda x: self.add_spelling_dictionary(x)
        )
        
        #         from IPython.core.debugger import Pdb; Pdb().set_trace()
        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        ) # fix mission spelling
        
        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling) # count number of misspelling 
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id") 

        # after merge preprocess
        # input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    


## Preprocess 2


In [337]:
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import ne_chunk, word_tokenize, pos_tag
# nltk.downloader.download('vader_lexicon')
import pyphen
from nltk.sentiment import SentimentIntensityAnalyzer

dic = pyphen.Pyphen(lang='en')
sid = SentimentIntensityAnalyzer()

class Preprocessor2:
    def __init__(self, 
                model_name: str,
                ) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(f"input/{model_name}")
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def calculate_text_similarity(self, row):
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([row['prompt_text'], row['text']])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2]).flatten()[0]
    
    def sentiment_analysis(self, text):
        analysis = TextBlob(text)
        return analysis.sentiment.polarity, analysis.sentiment.subjectivity
    
    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def calculate_unique_words(self,text):
        unique_words = set(text.split())
        return len(unique_words)
    
    def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
        
    def calculate_pos_ratios(self , text):
        pos_tags = pos_tag(nltk.word_tokenize(text))
        pos_counts = Counter(tag for word, tag in pos_tags)
        total_words = len(pos_tags)
        ratios = {tag: count / total_words for tag, count in pos_counts.items()}
        return ratios
    
    def calculate_punctuation_ratios(self,text):
        total_chars = len(text)
        punctuation_counts = Counter(char for char in text if char in '.,!?;:"()[]{}')
        ratios = {char: count / total_chars for char, count in punctuation_counts.items()}
        return ratios
    
    def calculate_keyword_density(self,row):
        keywords = set(row['prompt_text'].split())
        text_words = row['text'].split()
        keyword_count = sum(1 for word in text_words if word in keywords)
        return keyword_count / len(text_words)
    
    def count_syllables(self,word):
        hyphenated_word = dic.inserted(word)
        return len(hyphenated_word.split('-'))

    def flesch_reading_ease_manual(self,text):
        total_sentences = len(TextBlob(text).sentences)
        total_words = len(TextBlob(text).words)
        total_syllables = sum(self.count_syllables(word) for word in TextBlob(text).words)

        if total_sentences == 0 or total_words == 0:
            return 0

        flesch_score = 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words)
        return flesch_score
    
    def flesch_kincaid_grade_level(self, text):
        total_sentences = len(TextBlob(text).sentences)
        total_words = len(TextBlob(text).words)
        total_syllables = sum(self.count_syllables(word) for word in TextBlob(text).words)

        if total_sentences == 0 or total_words == 0:
            return 0

        fk_grade = 0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59
        return fk_grade
    
    def gunning_fog(self, text):
        total_sentences = len(TextBlob(text).sentences)
        total_words = len(TextBlob(text).words)
        complex_words = sum(1 for word in TextBlob(text).words if self.count_syllables(word) > 2)

        if total_sentences == 0 or total_words == 0:
            return 0

        fog_index = 0.4 * ((total_words / total_sentences) + 100 * (complex_words / total_words))
        return fog_index
    
    def calculate_sentiment_scores(self,text):
        sentiment_scores = sid.polarity_scores(text)
        return sentiment_scores
    
    def count_difficult_words(self, text, syllable_threshold=3):
        words = TextBlob(text).words
        difficult_words_count = sum(1 for word in words if self.count_syllables(word) >= syllable_threshold)
        return difficult_words_count


    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(word_tokenize(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(word_tokenize(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: word_tokenize(x)
        )
        
        # Add prompt tokens into spelling checker dictionary
        prompts["prompt_tokens"].apply(
            lambda x: self.add_spelling_dictionary(x)
        )
        
        prompts['gunning_fog_prompt'] = prompts['prompt_text'].apply(self.gunning_fog)
        prompts['flesch_kincaid_grade_level_prompt'] = prompts['prompt_text'].apply(self.flesch_kincaid_grade_level)
        prompts['flesch_reading_ease_prompt'] = prompts['prompt_text'].apply(self.flesch_reading_ease_manual)

        
#         from IPython.core.debugger import Pdb; Pdb().set_trace()
        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )
        
        
        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")
        input_df['flesch_reading_ease'] = input_df['text'].apply(self.flesch_reading_ease_manual)
        input_df['word_count'] = input_df['text'].apply(lambda x: len(x.split()))
        input_df['sentence_length'] = input_df['text'].apply(lambda x: len(x.split('.')))
        input_df['vocabulary_richness'] = input_df['text'].apply(lambda x: len(set(x.split())))

        input_df['word_count2'] = [len(t.split(' ')) for t in input_df.text]
        input_df['num_unq_words']=[len(list(set(x.lower().split(' ')))) for x in input_df.text]
        input_df['num_chars']= [len(x) for x in input_df.text]

        # Additional features
        input_df['avg_word_length'] = input_df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]))
        input_df['comma_count'] = input_df['text'].apply(lambda x: x.count(','))
        input_df['semicolon_count'] = input_df['text'].apply(lambda x: x.count(';'))

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        input_df['exclamation_count'] = input_df['text'].apply(lambda x: x.count('!'))
        input_df['question_count'] = input_df['text'].apply(lambda x: x.count('?'))
        input_df['pos_ratios'] = input_df['text'].apply(self.calculate_pos_ratios)

        # Convert the dictionary of POS ratios into a single value (mean)
        input_df['pos_mean'] = input_df['pos_ratios'].apply(lambda x: np.mean(list(x.values())))
        input_df['punctuation_ratios'] = input_df['text'].apply(self.calculate_punctuation_ratios)

        # Convert the dictionary of punctuation ratios into a single value (sum)
        input_df['punctuation_sum'] = input_df['punctuation_ratios'].apply(lambda x: np.sum(list(x.values())))
        input_df['keyword_density'] = input_df.apply(self.calculate_keyword_density, axis=1)
        input_df['jaccard_similarity'] = input_df.apply(lambda row: len(set(word_tokenize(row['prompt_text'])) & set(word_tokenize(row['text']))) / len(set(word_tokenize(row['prompt_text'])) | set(word_tokenize(row['text']))), axis=1)
        tqdm.pandas(desc="Performing Sentiment Analysis")
        input_df[['sentiment_polarity', 'sentiment_subjectivity']] = input_df['text'].progress_apply(
            lambda x: pd.Series(self.sentiment_analysis(x))
        )
        tqdm.pandas(desc="Calculating Text Similarity")
        input_df['text_similarity'] = input_df.progress_apply(self.calculate_text_similarity, axis=1)
        #Calculate sentiment scores for each row
        input_df['sentiment_scores'] = input_df['text'].apply(self.calculate_sentiment_scores)
        
        input_df['gunning_fog'] = input_df['text'].apply(self.gunning_fog)
        input_df['flesch_kincaid_grade_level'] = input_df['text'].apply(self.flesch_kincaid_grade_level)
        input_df['count_difficult_words'] = input_df['text'].apply(self.count_difficult_words)

        # Convert sentiment_scores into individual columns
        sentiment_columns = pd.DataFrame(list(input_df['sentiment_scores']))
        input_df = pd.concat([input_df, sentiment_columns], axis=1)
        input_df['sentiment_scores_prompt'] = input_df['prompt_text'].apply(self.calculate_sentiment_scores)
        # Convert sentiment_scores_prompt into individual columns
        sentiment_columns_prompt = pd.DataFrame(list(input_df['sentiment_scores_prompt']))
        sentiment_columns_prompt.columns = [col +'_prompt' for col in sentiment_columns_prompt.columns]
        input_df = pd.concat([input_df, sentiment_columns_prompt], axis=1)
        columns =  ['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']
        cols_to_drop = [col for col in columns if col in input_df.columns]
        if cols_to_drop:
            input_df = input_df.drop(columns=cols_to_drop)
        
        print(cols_to_drop)
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    

In [338]:
preprocessor = Preprocessor(model_name=CFG.model_name)

## Create the train and test sets


In [339]:
if CFG.test_mode : 
    prompts_train = prompts_train[:12]
    prompts_test = prompts_test[:12]
    summaries_train = summaries_train[:12]
    summaries_test = summaries_test[:12]

In [341]:
train = preprocessor.run(prompts_train, summaries_train, mode="train")
test = preprocessor.run(prompts_test, summaries_test, mode="test")
# save train and test 
# train.to_csv("input/train.csv", index=False)
# test.to_csv("input/test.csv", index=False)
# load train and test
# train = pd.read_csv("input/train.csv")
# test = pd.read_csv("input/test.csv")
# train.head()

100%|██████████| 7165/7165 [02:17<00:00, 52.06it/s]
100%|██████████| 7165/7165 [00:00<00:00, 20155.88it/s]
100%|██████████| 7165/7165 [00:00<00:00, 21078.41it/s]
100%|██████████| 7165/7165 [00:00<00:00, 12506.47it/s]
100%|██████████| 7165/7165 [00:00<00:00, 11009.14it/s]
100%|██████████| 7165/7165 [00:00<00:00, 207185.03it/s]
100%|██████████| 4/4 [00:00<00:00, 23530.46it/s]
100%|██████████| 4/4 [00:00<00:00, 29485.44it/s]
100%|██████████| 4/4 [00:00<00:00, 9782.63it/s]
100%|██████████| 4/4 [00:00<00:00, 11335.96it/s]
100%|██████████| 4/4 [00:00<00:00, 11244.78it/s]
100%|██████████| 4/4 [00:00<00:00, 5562.74it/s]


In [342]:
gkf = GroupKFold(n_splits=CFG.n_splits)

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

# train.head()

## Model Function Definition

In [343]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)
    
    return (content_score + wording_score)/2

## Deberta Regressor

In [344]:
class ScoreRegressor:
    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: list,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "fixed_summary_text"] # fix summary text have prompt text in it 
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = target

        self.model_name = model_name
        lr = str(CFG.learning_rate).replace(".", "")
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(f"input/{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"input/{model_name}" )
        # print(self.model_config)
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 2,
            "problem_type": "regression",
        })
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )


    def tokenize_function(self, examples: pd.DataFrame):
        # labels = ['content' , 'wording']
        # print('labels', labels)
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": [examples['content'], examples['wording']],
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """fine-tuning"""
        
        sep = self.tokenizer.sep_token
        # print('sep', sep)
        train_df[self.input_col] = (
                    train_df["prompt_title"] + sep 
                    + train_df["prompt_question"] + sep 
                    + train_df["fixed_summary_text"]
                  )

        valid_df[self.input_col] = (
                    valid_df["prompt_title"] + sep 
                    + valid_df["prompt_question"] + sep 
                    + valid_df["fixed_summary_text"]
                  )
        # filter train_df with input_col have more than 5000 tokens
        # print('create train and val data frame ')
        # print('self.target_cols', self.target_cols)
        # print('self.input_col', self.input_col)
        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"input/{self.model_name}", 
            config=self.model_config
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=True)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=True)
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        # print('model_fold_dir', model_fold_dir)
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )
        # print('define trainer')
        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )
        print('start training')
        # print('trainer.train_dataset[0]' , trainer.train_dataset[0])
        trainer.train()
        print('finish training')
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """predict content score"""
        
        sep = self.tokenizer.sep_token
        in_text = (
                    test_df["prompt_title"] + sep 
                    + test_df["prompt_question"] + sep 
                    + test_df["fixed_summary_text"]
                  )
        test_df[self.input_col] = in_text

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=True)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        # print("model_fold_dir",model_fold_dir)
        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = CFG.batch_size,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

## Train by fold function


In [345]:
def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    model_dir_base: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """predict oof data"""
    for fold in range(CFG.n_splits):
        # print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_dir_base}/fold_{fold}"
        else: 
            model_dir =  f"{model_dir_base}/fold_{fold}"
        csr = ScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=valid_data, 
            fold=fold
        )
        # print('pred shape', pred.shape)
        train_df.loc[valid_data.index, f"wording_pred"] = pred[:,0]
        train_df.loc[valid_data.index, f"content_pred"] = pred[:,1]

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    model_dir_base: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """predict using mean folds"""
    for fold in range(CFG.n_splits):
        # print(f"fold {fold}:")
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_dir_base}/fold_{fold}"
        else: 
            model_dir =  f"{model_dir_base}/fold_{fold}"
        csr = ScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=test_df, 
            fold=fold
        )
        
        # test_df[f"{target}_pred_{fold}"] = pred
        test_df[f"wording_pred_{fold}"] = pred[:,0]
        test_df[f"content_pred_{fold}"] = pred[:,1]
        
    # test_df[f"{target}"] = test_df[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)
    test_df[f"wording_pred"] = test_df[[f"wording_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)
    test_df[f"content_pred"] = test_df[[f"content_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)
    return test_df
targets =  ["content", "wording"]


## Infer



In [347]:
# # ensembling_results_val  = pd.DataFrame()
# # ensembling_results_test = pd.DataFrame()
# dem = 0
# if CFG.infer_mode:
#     for model_dir in CFG.list_model_infer:
#         print("percent of model", dem/CFG.number_base_model)
#         print(model_dir)    
#         dem = dem +1 
#         if dem >= CFG.number_base_model:
#             CFG.batch_size = 16
#             CFG.max_length = 1462
#             CFG.model_name = "debertav3large"
#         train = validate(
#             train,
#             target=targets,
#             save_each_model=False,
#             model_name=CFG.model_name,
#             model_dir_base = model_dir,
#             hidden_dropout_prob=CFG.hidden_dropout_prob,
#             attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
#             max_length=CFG.max_length
#         )
#         # for target in targets:
#         #     rmse = mean_squared_error(train[target], train[f"{target}_pred"], squared=False)
#         #     print(f"cv {target} rmse: {rmse}")
#         print('done validate')
#         test = predict(
#             test,
#             target=targets,
#             save_each_model=False,
#             model_name=CFG.model_name,
#             model_dir_base = model_dir,
#             hidden_dropout_prob=CFG.hidden_dropout_prob,
#             attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
#             max_length=CFG.max_length
#         )
#         print('done predict')
#         # add wording_pred and content_pred to ensembling_results
#         ensembling_results_val[f"{model_dir}_wording_pred"] = train["wording_pred"]
#         ensembling_results_val[f"{model_dir}_content_pred"] = train["content_pred"]
#         ensembling_results_test[f"{model_dir}_wording_pred"] = test["wording_pred"]
#         ensembling_results_test[f"{model_dir}_content_pred"] = test["content_pred"]
#         # print('ensembling_results_val \n', ensembling_results_val.head() )
        

## LGBM model

In [348]:

# save ensembling_results_val and ensembling_results_test
# ensembling_results_val.to_csv("ensembling_results_val.csv", index=False)
# ensembling_results_test.to_csv("ensembling_results_test.csv", index=False)
# load ensembling_results_val and ensembling_results_test
ensembling_results_val = pd.read_csv("ensembling_results_val.csv")
ensembling_results_test = pd.read_csv("ensembling_results_test.csv")

## Find the best weight with optuna

In [349]:
CFG.list_model_infer = [
        # 'upload_model/debertav3base_lr15e-05',
        'upload_model/debertav3base_lr17e-05', #keep
        'upload_model/debertav3base_lr18e-05', #keep
        'upload_model/debertav3base_lr21e-05', # keep
        'upload_model/debertav3base_lr22e-05', #keep 
        # 'upload_model/debertav3base_lr5e-05', 
        'upload_model/debertav3large_lr12e-05', # upload
        'upload_model/debertav3large_lr13e-05',  # upload
        # 'debertav3large_lr1e-05_save',
        # 'debertav3large_lr1e-05_att_0007',
        'debertav3large_lr8e-06_att_0007', # upload 
        'debertav3large_lr9e-06_att_0007', # upload 
        'debertav3large_lr11e-05_att_0007',
        'debertav3large_lr12e-05_att_0007',
        'debertav3large_lr13e-05_att_0007',
        'debertav3large_lr14e-05_att_0007',
        'debertav3large_lr15e-05_att_0007', # upload 
        'debertav3large_lr16e-05_att_0007', # upload 
        'debertav3large_lr17e-05_att_0007', # upload 
        'debertav3large_lr18e-05_att_0007', # upload 
        ]
print(len(CFG.list_model_infer))

16


In [384]:
weight_for_model = {}
scale = 0 
for model in CFG.list_model_infer:
    results = ensembling_results_val[[f"{model}_wording_pred", f"{model}_content_pred"]]
    mcrmse = compute_mcrmse((results.values, train[targets].values))
    print(f"{model} mcrmse: {mcrmse['mcrmse']}")
    # weight_for_model[model] = 1 / mcrmse["mcrmse"]
    # scale = scale + weight_for_model[model]
    

upload_model/debertav3base_lr17e-05 mcrmse: 0.5543694149276929
upload_model/debertav3base_lr18e-05 mcrmse: 0.5526207597677534
upload_model/debertav3base_lr21e-05 mcrmse: 0.553756334918185
upload_model/debertav3base_lr22e-05 mcrmse: 0.5480178090069433
upload_model/debertav3large_lr12e-05 mcrmse: 0.5531885292652218
upload_model/debertav3large_lr13e-05 mcrmse: 0.5520867504987613
debertav3large_lr8e-06_att_0007 mcrmse: 0.5257363371291668
debertav3large_lr9e-06_att_0007 mcrmse: 0.5391015003980028
debertav3large_lr11e-05_att_0007 mcrmse: 0.5395042177968217
debertav3large_lr12e-05_att_0007 mcrmse: 0.5454541263489372
debertav3large_lr13e-05_att_0007 mcrmse: 0.5502121290213762
debertav3large_lr14e-05_att_0007 mcrmse: 0.5392729849094001
debertav3large_lr15e-05_att_0007 mcrmse: 0.5388472988905048
debertav3large_lr16e-05_att_0007 mcrmse: 0.5568832398581905
debertav3large_lr17e-05_att_0007 mcrmse: 0.5517737704359214
debertav3large_lr18e-05_att_0007 mcrmse: 0.5563119146067961


In [385]:
weight_for_model =  {       
    "upload_model/debertav3base_lr17e-05": 0.212909408976402    ,
    "upload_model/debertav3base_lr18e-05": 0.1421852032796326,
    "upload_model/debertav3base_lr21e-05": 0.8953672127134387,
    "upload_model/debertav3base_lr22e-05": 0.36047950541959095,
    "upload_model/debertav3large_lr12e-05": 0.9413883882391965,
    "upload_model/debertav3large_lr13e-05": 0.48278638692252185,
    "debertav3large_lr8e-06_att_0007": 0.30564935544213445,
    "debertav3large_lr9e-06_att_0007": 0.40560485262757806,
    "debertav3large_lr11e-05_att_0007": 0.882846667614644,
    "debertav3large_lr12e-05_att_0007": 0.09507155361092456,
    "debertav3large_lr13e-05_att_0007": 0.6241475562582361,
    "debertav3large_lr14e-05_att_0007": 0.03402452758512511,
    "debertav3large_lr15e-05_att_0007": 0.11628723374671882,
    "debertav3large_lr16e-05_att_0007": 0.0001281844830617676,
    "debertav3large_lr17e-05_att_0007": 0.969431755664122,
    "debertav3large_lr18e-05_att_0007": 0.318050401669718,
}
print(len(weight_for_model))
scale = np.sum(list(weight_for_model.values()))
print('scale', scale)

16
scale 6.7863581942530455


In [386]:
# weight_for_model =  {'upload_model/debertav3base_lr5e-05': 0.051723147603719516,
#  'upload_model/debertav3base_lr15e-05': 0.713986390124981,
#  'upload_model/debertav3base_lr17e-05': 0.21585195973104482,
#  'upload_model/debertav3base_lr18e-05': 0.4991352617947008,
#  'upload_model/debertav3base_lr21e-05': 0.004646508278479511,
#  'upload_model/debertav3base_lr22e-05': 0.11250582743280035,
#  'upload_model/debertav3large_lr12e-05': 0.6062309604440405,
#  'upload_model/debertav3large_lr13e-05': 0.41563814657040926,
#  'debertav3large_lr1e-05_save': 0.8788672880708215,
#  'debertav3large_lr1e-05_att_0007': 0.882589383882597,
#  'debertav3large_lr8e-06_att_0007': 0.7194970862228134,
#  'debertav3large_lr9e-06_att_0007': 0.6181074749297657,
#  'debertav3large_lr11e-05_att_0007': 0.9809193274307263,
#  'debertav3large_lr12e-05_att_0007': 0.17214549469931723,
#  'debertav3large_lr13e-05_att_0007': 0.4328122227545144,
#  'debertav3large_lr14e-05_att_0007': 0.3284035971778506,
#  'debertav3large_lr15e-05_att_0007': 0.9392121766363538,
#  'debertav3large_lr16e-05_att_0007': 0.033996448553002434,
#  'debertav3large_lr17e-05_att_0007': 0.8327105838357515,
#  'debertav3large_lr18e-05_att_0007': 0.8773681981314573}
# print(len(weight_for_model))
# scale = np.sum(list(weight_for_model.values()))
# print('scale', scale)

In [401]:
train = pd.read_csv("input/train.csv")

In [402]:
train[f"wording_pred"] = np.sum([ensembling_results_val[f"{model}_wording_pred"] * weight_for_model[model] for model in CFG.list_model_infer], axis=0) / scale    
train[f"content_pred"] = np.sum([ensembling_results_val[f"{model}_content_pred"] * weight_for_model[model] for model in CFG.list_model_infer], axis=0) / scale
test[f"wording_pred"] = np.sum([ensembling_results_test[f"{model}_wording_pred"] * weight_for_model[model] for model in CFG.list_model_infer], axis=0) / scale
test[f"content_pred"] = np.sum([ensembling_results_test[f"{model}_content_pred"] * weight_for_model[model] for model in CFG.list_model_infer], axis=0) / scale


In [403]:
# save train and test
# train.to_csv("input/train.csv", index=False)
# load train and test
# test = pd.read_csv("input/test.csv")

In [404]:
# train.head()

In [405]:
targets = ["content", "wording"]

drop_columns = ["fold", "student_id", "prompt_id", "text", "fixed_summary_text",
                "prompt_question", "prompt_title", 
                "prompt_text"
               ] + targets

In [406]:
def create_model_dict(targets,train):
  model_dict = {}
  for target in targets:
      models = []

      for fold in range(CFG.n_splits):
          X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns, inplace=False)
          y_train_cv = train[train["fold"] != fold][target]

          X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
          y_eval_cv = train[train["fold"] == fold][target]

          dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
          dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

          params = {
              'boosting_type': 'gbdt',
              'random_state': 42,
              'objective': 'regression',
              'metric': 'rmse',
              'learning_rate': 0.048,
              'max_depth': 3,
              'lambda_l1': 0.0,
              'lambda_l2': 0.011,
              'verbose': -1,
          }

          evaluation_results = {}
          model = lgb.train(params,
                            num_boost_round=10000,
                            valid_names=['train', 'valid'],
                            train_set=dtrain,
                            valid_sets=dval,
                            callbacks=[
                                lgb.early_stopping(stopping_rounds=70, verbose=False),
                                # lgb.log_evaluation(100),
                                lgb.callback.record_evaluation(evaluation_results)
                              ],
                            )
          models.append(model)

      model_dict[target] = models
  return model_dict
model_dict = create_model_dict(targets,train)


## CV Score

In [407]:
# cv
import optuna
def cal_mcrmse(model_dict, targets):
    rmses = []
    for target in targets:
        models = model_dict[target]

        preds = []
        trues = []
        
        for fold, model in enumerate(models):
            X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns , inplace=False)
            y_eval_cv = train[train["fold"] == fold][target]

            pred = model.predict(X_eval_cv)

            trues.extend(y_eval_cv)
            preds.extend(pred)
            
        rmse = np.sqrt(mean_squared_error(trues, preds))
        print(f"{target}_rmse : {rmse}")
        rmses = rmses + [rmse]
    return sum(rmses) / len(rmses)
mcrmse = cal_mcrmse(model_dict, targets)
print(f"mcrmse: {mcrmse}")

content_rmse : 0.4131673554646339
wording_rmse : 0.5406696561968247
mcrmse: 0.4769185058307293


In [408]:
def objective(trial):
    weight_for_model = {}
    for model in CFG.list_model_infer:
        weight_for_model[model] = trial.suggest_float(model, 0.0, 1.0)
    scale = np.sum([weight_for_model[model] for model in CFG.list_model_infer])
    train[f"wording_pred"] = np.sum([ensembling_results_val[f"{model}_wording_pred"] * weight_for_model[model] for model in CFG.list_model_infer], axis=0) / scale    
    train[f"content_pred"] = np.sum([ensembling_results_val[f"{model}_content_pred"] * weight_for_model[model] for model in CFG.list_model_infer], axis=0) / scale
    model_dict = create_model_dict(targets,train)
    lost = cal_mcrmse(model_dict, targets)
    # print(f"mcrmse: {lost}")
    # print(f"weight_for_model: {weight_for_model}")
    return lost

In [409]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

In [410]:

# print('Best trial:')
# trial_ = study.best_trial

# print('Value: ', trial_.value)

# print('Params: ')
# for key, value in trial_.params.items():
#     print('    {}: {}'.format(key, value)) 

# content_rmse : 0.4131169758305616
# wording_rmse : 0.5421836129991084
# mcrmse: 0.47765029441483503


In [411]:

# Best trial:
# Value:  0.4769185058307293
# Params: 
#     upload_model/debertav3base_lr17e-05: 0.212909408976402
#     upload_model/debertav3base_lr18e-05: 0.1421852032796326
#     upload_model/debertav3base_lr21e-05: 0.8953672127134387
#     upload_model/debertav3base_lr22e-05: 0.36047950541959095
#     upload_model/debertav3large_lr12e-05: 0.9413883882391965
#     upload_model/debertav3large_lr13e-05: 0.48278638692252185
#     debertav3large_lr8e-06_att_0007: 0.30564935544213445
#     debertav3large_lr9e-06_att_0007: 0.40560485262757806
#     debertav3large_lr11e-05_att_0007: 0.882846667614644
#     debertav3large_lr12e-05_att_0007: 0.09507155361092456
#     debertav3large_lr13e-05_att_0007: 0.6241475562582361
#     debertav3large_lr14e-05_att_0007: 0.03402452758512511
#     debertav3large_lr15e-05_att_0007: 0.11628723374671882
#     debertav3large_lr16e-05_att_0007: 0.0001281844830617676
#     debertav3large_lr17e-05_att_0007: 0.969431755664122
#     debertav3large_lr18e-05_att_0007: 0.318050401669718

# Best trial:
# Value:  0.4772940289625304
# Params: 
#     upload_model/debertav3base_lr5e-05: 0.09347891502416127
#     upload_model/debertav3base_lr15e-05: 0.591661405535349
#     upload_model/debertav3base_lr17e-05: 0.2899256756091853
#     upload_model/debertav3base_lr18e-05: 0.5247554121196624
#     upload_model/debertav3base_lr21e-05: 0.06827852930559399
#     upload_model/debertav3base_lr22e-05: 0.386669811226943
#     upload_model/debertav3large_lr12e-05: 0.4819735230934519
#     upload_model/debertav3large_lr13e-05: 0.6277811009078926
#     debertav3large_lr1e-05_save: 0.8800293681553476
#     debertav3large_lr1e-05_att_0007: 0.907620186268483
#     debertav3large_lr8e-06_att_0007: 0.20560270128817018
#     debertav3large_lr9e-06_att_0007: 0.8283922073344201
#     debertav3large_lr11e-05_att_0007: 0.9434490377699455
#     debertav3large_lr12e-05_att_0007: 0.9568517774138272
#     debertav3large_lr13e-05_att_0007: 0.5510754606259485
#     debertav3large_lr14e-05_att_0007: 0.0505932089216867
#     debertav3large_lr15e-05_att_0007: 0.020756011719650247
#     debertav3large_lr16e-05_att_0007: 0.019564460078885
#     debertav3large_lr17e-05_att_0007: 0.9065148193079343
#     debertav3large_lr18e-05_att_0007: 0.3463884928585018

# Best trial:
# Value:  0.4771132861010911
# Params: 
#     upload_model/debertav3base_lr5e-05: 0.051723147603719516
#     upload_model/debertav3base_lr15e-05: 0.713986390124981
#     upload_model/debertav3base_lr17e-05: 0.21585195973104482
#     upload_model/debertav3base_lr18e-05: 0.4991352617947008
#     upload_model/debertav3base_lr21e-05: 0.004646508278479511
#     upload_model/debertav3base_lr22e-05: 0.11250582743280035
#     upload_model/debertav3large_lr12e-05: 0.6062309604440405
#     upload_model/debertav3large_lr13e-05: 0.41563814657040926
#     debertav3large_lr1e-05_save: 0.8788672880708215
#     debertav3large_lr1e-05_att_0007: 0.882589383882597
#     debertav3large_lr8e-06_att_0007: 0.7194970862228134
#     debertav3large_lr9e-06_att_0007: 0.6181074749297657
#     debertav3large_lr11e-05_att_0007: 0.9809193274307263
#     debertav3large_lr12e-05_att_0007: 0.17214549469931723
#     debertav3large_lr13e-05_att_0007: 0.4328122227545144
#     debertav3large_lr14e-05_att_0007: 0.3284035971778506
#     debertav3large_lr15e-05_att_0007: 0.9392121766363538
#     debertav3large_lr16e-05_att_0007: 0.033996448553002434
#     debertav3large_lr17e-05_att_0007: 0.8327105838357515
#     debertav3large_lr18e-05_att_0007: 0.8773681981314573

In [412]:
# content_rmse : 0.41518188998433053
# wording_rmse : 0.5438935328374503
# mcrmse : 0.4795377114108904 WITH WEIGHT 
# use 22 modell 


In [413]:
print(len(CFG.list_model_infer))

16


## Predict

In [414]:
drop_columns_2 = [
                # "fold", 
                "student_id", "prompt_id", "text", "fixed_summary_text",
                "prompt_question", "prompt_title", 
                "prompt_text",
                "input"
               ] + [
                f"content_pred_{i}" for i in range(CFG.n_splits)
                ] + [
                f"wording_pred_{i}" for i in range(CFG.n_splits)
                ]

In [415]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns_2)
        # print(X_eval_cv.head())
        pred = model.predict(X_eval_cv)
        # print('pred shape'  , pred.shape)
        preds.append(pred)
    
    pred_dict[target] = preds

[LightGBM] [Fatal] The number of features in data (13) is not the same as it was in training data (45).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.


LightGBMError: The number of features in data (13) is not the same as it was in training data (45).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.

: 

In [None]:
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

In [None]:
test

Unnamed: 0,student_id,prompt_id,text,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,...,wording_pred_1,content_pred_1,wording_pred_2,content_pred_2,wording_pred_3,content_pred_3,wording_pred,content_pred,content,wording
0,000000ffffff,abc123,Example text 1,3,Example text 1,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.338309,-1.523791,-1.190337,-1.436432,-1.349174,-1.540069,-1.532378,-1.339146,-1.495245,-1.311898
1,111111eeeeee,def789,Example text 2,3,Example text 2,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.338309,-1.523791,-1.190337,-1.436432,-1.349174,-1.540069,-1.534876,-1.340151,-1.495245,-1.311898
2,222222cccccc,abc123,Example text 3,3,Example text 3,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.343294,-1.520616,-1.190337,-1.436432,-1.349174,-1.540069,-1.533215,-1.345367,-1.494451,-1.313144
3,333333dddddd,def789,Example text 4,3,Example text 4,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.338309,-1.523791,-1.190337,-1.436432,-1.349174,-1.540069,-1.535747,-1.341987,-1.495245,-1.311898


## Create Submission file

In [None]:
sample_submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,0.0,0.0
1,111111eeeeee,0.0,0.0
2,222222cccccc,0.0,0.0
3,333333dddddd,0.0,0.0


In [None]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)

## Summary

CV result is like this.

| | content rmse |wording rmse | mcrmse | LB| |
| -- | -- | -- | -- | -- | -- |
|baseline| 0.494 | 0.630 | 0.562 | 0.509 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-baseline-content-and-wording-models)|
| use title and question field | 0.476| 0.619 | 0.548 | 0.508 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-w-prompt-title-question-fields) |
| Debertav3 + LGBM | 0.451 | 0.591 | 0.521 | 0.461 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-lgbm-with-feature-engineering) |
| Debertav3 + LGBM with spell autocorrect | 0.448 | 0.581 | 0.514 | 0.459 |nogawanogawa's original code
| Debertav3 + LGBM with spell autocorrect and tuning | 0.442 | 0.566 | 0.504 | 0.453 | this notebook |

The CV values improved slightly, and the LB value is improved.