In [1]:
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl" 
!pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2
Processing /kaggle/input/autocorrect/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25l- \ done
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l- \ done
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622383 sha256=84c1c776407c0e7aaf4b6e1e27623f05bee088365c87461590040d4dbc21ef37
  Stored in directory: /root/.cache/pip/wheels/db/69/42/0fb0421d2fe70d195a04665edc760cfe5fd341d7bb8d8e0aaa
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


In [2]:
from typing import List
import numpy as np
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re 

from autocorrect import Speller
from spellchecker import SpellChecker
 
import xgboost as xgb  
import lightgbm as lgb

warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

In [3]:
!cp -r /kaggle/input/rouge-score/rouge_score-0.1.2 /kaggle/working/
!pip install /kaggle/working/rouge_score-0.1.2/

Processing ./rouge_score-0.1.2
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=7d0ed42b57dd23e705b0aba6ec45294af96f1e69216d8b3a379cb9602a65865e
  Stored in directory: /root/.cache/pip/wheels/7f/65/32/aad94dce2cf8bf1ffd82591fee13cf08e3795101366d68f144
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [4]:
from rouge_score import rouge_scorer
from itertools import combinations

def sentence_combinations(text):
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    pairs = list(combinations(sentences, 2))
    return pairs

scorer_rogue = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)

def get_rogue_f1s(text_t,text_p):
    scores = scorer_rogue.score(text_t,text_p)
    rg1 = scores['rouge1'][2]
    rg2 = scores['rouge2'][2]
    rgL = scores['rougeL'][2]
    return (rg1,rg2,rgL)
    

def get_avg_self_rogues(text):
    rg1s = []
    rg2s = []
    rgLs = []
    for sp in sentence_combinations(text):
        scores = scorer_rogue.score(sp[0],sp[1])
        rg1s.append(scores['rouge1'][2])
        rg2s.append(scores['rouge2'][2])
        rgLs.append(scores['rougeL'][2])
    return ( np.mean(rg1s),np.mean(rg2s),np.mean(rgLs) )


import nltk
from nltk.tokenize import sent_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def self_bleu(text):
    sentences = sent_tokenize(text)
    scores = []
    smoothie = SmoothingFunction().method4
    for idx, candidate in enumerate(sentences):
        references = [s for i, s in enumerate(sentences) if i != idx]
        scores.append(sentence_bleu(references, candidate, smoothing_function=smoothie))
    average_score = sum(scores) / len(scores)
    return average_score

In [5]:
load_data = '/kaggle/input/commonlit-evaluate-student-summaries/' 


In [6]:
prompts_train = pd.read_csv(load_data + 'prompts_train.csv') 
summaries_train = pd.read_csv(load_data + 'summaries_train.csv') 

In [7]:
prompts_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
summaries_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

In [8]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=42)

In [9]:
class CFG:
    model_name="debertav3base"
    learning_rate=0.000015   #0.000015
    weight_decay=0.02        #0.02
    hidden_dropout_prob=0.007
    attention_probs_dropout_prob=0.007
    num_train_epochs=4 #4
    n_splits=4
    batch_size=12
    random_seed=42
    save_steps=100
    max_length=512

In [10]:
model_name="debertav3base"
from transformers import AutoTokenizer


In [11]:
class Preprocessor:
    def __init__(self, 
                model_name: str,
                ) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(word_tokenize(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(word_tokenize(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: word_tokenize(x)
        )
        
        # Add prompt tokens into spelling checker dictionary
        prompts["prompt_tokens"].apply(
            lambda x: self.add_spelling_dictionary(x)
        )
        
#         from IPython.core.debugger import Pdb; Pdb().set_trace()
        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )
        
        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        # input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=CFG.model_name)

In [12]:
train = preprocessor.run(prompts_train, summaries_train, mode="train")
test = preprocessor.run(prompts_test, summaries_test, mode="test")

train.head()

100%|██████████| 7165/7165 [06:43<00:00, 17.78it/s]
100%|██████████| 7165/7165 [00:01<00:00, 6527.00it/s]
100%|██████████| 7165/7165 [00:00<00:00, 9589.17it/s]
100%|██████████| 7165/7165 [00:01<00:00, 5094.12it/s]
100%|██████████| 7165/7165 [00:01<00:00, 4526.85it/s]
100%|██████████| 7165/7165 [00:00<00:00, 86826.42it/s]
100%|██████████| 4/4 [00:00<00:00, 4123.18it/s]
100%|██████████| 4/4 [00:00<00:00, 6396.19it/s]
100%|██████████| 4/4 [00:00<00:00, 2347.78it/s]
100%|██████████| 4/4 [00:00<00:00, 3451.39it/s]
100%|██████████| 4/4 [00:00<00:00, 3088.59it/s]
100%|██████████| 4/4 [00:00<00:00, 2953.74it/s]


Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,14,4,0.063492,0,0.0,0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,18,22,0.415094,10,0.192308,0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,22,52,0.19403,23,0.086142,2
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,The highest class was Pharaohs these people we...,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,6,6,0.222222,5,0.192308,0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,The Third Wave developed rapidly because the ...,29,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,23,27,0.116883,5,0.021739,4


In [13]:
gkf = GroupKFold(n_splits=CFG.n_splits)

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,14,4,0.063492,0,0.0,0,3.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,18,22,0.415094,10,0.192308,0,2.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,22,52,0.19403,23,0.086142,2,1.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,The highest class was Pharaohs these people we...,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,6,6,0.222222,5,0.192308,0,1.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,The Third Wave developed rapidly because the ...,29,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,23,27,0.116883,5,0.021739,4,3.0


# Model function

In [14]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)
    
    return (content_score + wording_score)/2

# Deberta Regressor

In [15]:
class ContentScoreRegressor:
    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"/kaggle/input/{model_name}")
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )


    def tokenize_function(self, examples: pd.DataFrame):
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """fine-tuning"""
        
        sep = self.tokenizer.sep_token
        train_df[self.input_col] = (
                    train_df["prompt_title"] + sep 
                    + train_df["prompt_question"] + sep 
                    + train_df["fixed_summary_text"]
                  )

        valid_df[self.input_col] = (
                    valid_df["prompt_title"] + sep 
                    + valid_df["prompt_question"] + sep 
                    + valid_df["fixed_summary_text"]
                  )
        
        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"/kaggle/input/{self.model_name}", 
            config=self.model_config
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()
        
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """predict content score"""
        
        sep = self.tokenizer.sep_token
        in_text = (
                    test_df["prompt_title"] + sep 
                    + test_df["prompt_question"] + sep 
                    + test_df["fixed_summary_text"]
                  )
        test_df[self.input_col] = in_text

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # e.g. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 

        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [16]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        target:str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: int,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):

    # delete old model files
    if os.path.exists(model_name):
        shutil.rmtree(model_name)
    
    os.mkdir(model_name)
        
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data, 
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """predict oof data"""
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"
        
        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=valid_data, 
            fold=fold
        )
        
        train_df.loc[valid_data.index, f"{target}_pred"] = pred

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """predict using mean folds"""

    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=test_df, 
            fold=fold
        )
        
        test_df[f"{target}_pred_{fold}"] = pred
    
    test_df[f"{target}"] = test_df[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

    return test_df

In [17]:
for target in ["content", "wording"]:
    train_by_fold(
        train,
        model_name=CFG.model_name,
        save_each_model=False,
        target=target,
        learning_rate=CFG.learning_rate,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        weight_decay=CFG.weight_decay,
        num_train_epochs=CFG.num_train_epochs,
        n_splits=CFG.n_splits,
        batch_size=CFG.batch_size,
        save_steps=CFG.save_steps,
        max_length=CFG.max_length
    )
    
    
    train = validate(
        train,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

    rmse = mean_squared_error(train[target], train[f"{target}_pred"], squared=False)
    print(f"cv {target} rmse: {rmse}")

    test = predict(
        test,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

fold 0:


Step,Training Loss,Validation Loss,Rmse
100,No log,0.231067,0.480695
200,No log,0.258009,0.507946
300,No log,0.239572,0.489461
400,No log,0.494818,0.703433
500,0.273600,0.482845,0.69487
600,0.273600,0.193543,0.439935
700,0.273600,0.278226,0.527471
800,0.273600,0.343185,0.58582
900,0.273600,0.301421,0.549018
1000,0.150400,0.356003,0.59666


fold 1:


Step,Training Loss,Validation Loss,Rmse
100,No log,0.330423,0.574824
200,No log,0.308816,0.555712
300,No log,0.2816,0.53066
400,No log,0.267217,0.51693
500,0.264700,0.285263,0.5341
600,0.264700,0.275161,0.524558
700,0.264700,0.295855,0.543925
800,0.264700,0.282145,0.531173
900,0.264700,0.257316,0.507263
1000,0.147700,0.277902,0.527164


fold 2:


Step,Training Loss,Validation Loss,Rmse
100,No log,0.270935,0.520514
200,No log,0.23194,0.481601
300,No log,0.229905,0.479484
400,No log,0.264982,0.514764
500,0.266900,0.241694,0.491624
600,0.266900,0.236737,0.486556
700,0.266900,0.283323,0.532281
800,0.266900,0.239058,0.488936
900,0.266900,0.381927,0.618003
1000,0.149000,0.449012,0.670083


fold 3:


Step,Training Loss,Validation Loss,Rmse
100,No log,0.4135,0.64304
200,No log,0.644407,0.802749
300,No log,0.368069,0.606687
400,No log,0.303894,0.551266
500,0.249200,0.354082,0.595048
600,0.249200,0.619232,0.786913
700,0.249200,0.484761,0.696248
800,0.249200,0.332791,0.57688
900,0.249200,0.376372,0.613492
1000,0.144300,0.46043,0.67855


fold 0:


fold 1:


fold 2:


fold 3:


cv content rmse: 0.4884072888272623
fold 0:


fold 1:


fold 2:


fold 3:


fold 0:


Step,Training Loss,Validation Loss,Rmse
100,No log,0.599047,0.773981
200,No log,0.392314,0.62635
300,No log,0.376464,0.613566
400,No log,0.331142,0.575449
500,0.469300,0.335705,0.5794
600,0.469300,0.336641,0.580208
700,0.469300,0.304208,0.551551
800,0.469300,0.321962,0.567417
900,0.469300,0.307394,0.554431
1000,0.261500,0.305302,0.552541


fold 1:


Step,Training Loss,Validation Loss,Rmse
100,No log,0.661426,0.813281
200,No log,0.612108,0.782373
300,No log,0.654053,0.808736
400,No log,0.650458,0.80651
500,0.397500,0.728739,0.853662
600,0.397500,0.745287,0.8633
700,0.397500,0.845682,0.91961
800,0.397500,0.645473,0.803413
900,0.397500,0.745909,0.86366
1000,0.231200,0.792565,0.890261


fold 2:


Step,Training Loss,Validation Loss,Rmse
100,No log,0.590249,0.768277
200,No log,0.469994,0.685561
300,No log,0.47139,0.686578
400,No log,0.389535,0.624127
500,0.470300,0.391704,0.625863
600,0.470300,0.517236,0.719191
700,0.470300,0.34674,0.588846
800,0.470300,0.352928,0.594077
900,0.470300,0.316299,0.562405
1000,0.290000,0.297514,0.545449


fold 3:


Step,Training Loss,Validation Loss,Rmse
100,No log,0.725243,0.851612
200,No log,0.555207,0.745122
300,No log,0.552666,0.743415
400,No log,0.469641,0.685304
500,0.411600,0.588652,0.767237
600,0.411600,0.459256,0.677684
700,0.411600,0.575464,0.758594
800,0.411600,0.453972,0.673775
900,0.411600,0.535899,0.732051
1000,0.255600,0.433938,0.65874


fold 0:


fold 1:


fold 2:


fold 3:


cv wording rmse: 0.6266705883625747
fold 0:


fold 1:


fold 2:


fold 3:


In [18]:
CFG.model_name

'debertav3base'

In [19]:
#model_dir

In [20]:
#!zip -r working.zip /kaggle/working/

In [21]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tqdm import tqdm
import os
import zipfile
from zipfile import ZipFile
import shutil
import pickle
import nltk
from nltk.tokenize import sent_tokenize
import shap

import sys
sys.path.append('/kaggle/input/sentence-transformers-222/sentence-transformers')
from sentence_transformers import SentenceTransformer, models
st_model = SentenceTransformer('/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2')

!pip install "/kaggle/input/language-tool-python/language_tool_python-2.7.1-py3-none-any.whl"

# create download path
def get_language_tool_cache_path():
    # Get download path from environment or use default.
    download_path = os.environ.get(
        'LTP_PATH',
        os.path.join(os.path.expanduser("~"), ".cache", "language_tool_python")
    )
    # Make download path, if it doesn't exist.
    os.makedirs(download_path, exist_ok=True)
    return download_path

lt_path = get_language_tool_cache_path()
lt_path

#cant move files directly from input to cache, so we zip it to output and unzip again


def get_all_file_paths(directory):
  
    # initializing empty file paths list
    file_paths = []
  
    # crawling through directory and subdirectories
    for root, directories, files in os.walk(directory):
        for filename in files:
            # join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)
  
    # returning all file paths
    return file_paths        
  
def main():
    # path to folder which needs to be zipped
    directory = '/kaggle/input/languagetool57imroze/LanguageTool-5.7'
  
    # calling function to get all file paths in the directory
    file_paths = get_all_file_paths(directory)

    # writing files to a zipfile
    with ZipFile('./lt.zip','w') as zip:
        # writing each file one by one
        for file in file_paths:
            zip.write(file)
  
    print('All files zipped successfully!')        
  
main()


 
zip_file = "./lt.zip"
 
try:
    with zipfile.ZipFile(zip_file) as z:
        z.extractall()
        print("Extracted all")
except:
    print("Invalid file")
    
#move to cache
!mv {'/kaggle/input/languagetool57imroze/LanguageTool-5.7'} {lt_path} 
os.listdir('/root/.cache/language_tool_python/')

os.remove('/kaggle/working/lt.zip')

import language_tool_python
tool = language_tool_python.LanguageTool('en-US')

!pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"
!pip install "/kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl"
!pip install "/kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl"

Processing /kaggle/input/language-tool-python/language_tool_python-2.7.1-py3-none-any.whl
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.7.1
All files zipped successfully!
Extracted all
mv: cannot remove '/kaggle/input/languagetool57imroze/LanguageTool-5.7/testrules.sh': Read-only file system
mv: cannot remove '/kaggle/input/languagetool57imroze/LanguageTool-5.7/META-INF/maven/org.languagetool/language-de/pom.properties': Read-only file system
mv: cannot remove '/kaggle/input/languagetool57imroze/LanguageTool-5.7/META-INF/maven/org.languagetool/language-de/pom.xml': Read-only file system
mv: cannot remove '/kaggle/input/languagetool57imroze/LanguageTool-5.7/META-INF/maven/org.languagetool/language-zh/pom.properties': Read-only file system
mv: cannot remove '/kaggle/input/languagetool57imroze/LanguageTool-5.7/META-INF/maven/org.languagetool/language-zh/pom.xml': Read-only file system
mv: cannot remove '/kaggle/input/languagetool

In [22]:
df_train_p=pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv")
df_train_s=pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")

df_train_s=pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")

from textblob import TextBlob

def spell_correction_tb(text):
    text = TextBlob(text)
    result = text.correct()
    return str(result)

from autocorrect import Speller
from spellchecker import SpellChecker
from typing import List

speller = Speller(lang='en')
spellchecker = SpellChecker()

def spelling(text):
    wordlist=text.split()
    amount_miss = len(list(spellchecker.unknown(wordlist)))
    return amount_miss

def add_spelling_dictionary(tokens: List[str]) -> List[str]:
    """dictionary update for pyspell checker and autocorrect"""
    spellchecker.word_frequency.load_words(tokens)
    speller.nlp_data.update({token:1000 for token in tokens})
    
import spacy
nlp = spacy.load("en_core_web_sm")

def get_coherence_complexities(text):
    doc = nlp(text)
    # Calculate the average sentence length
    #num_sentences = len(list(doc.sents))
    num_sentences = len(sent_tokenize(text.replace(".",". ")))
    num_words = len(doc)
    avg_sentence_length = num_words / num_sentences
    # Calculate the coherence of discourse
    num_connectives = len([token for token in doc if token.dep_ == "mark"])
    try:
        #coherence = num_connectives / (num_sentences - 1)
        coherence = num_connectives / (num_sentences)
    except ZeroDivisionError:
        coherence = 0
    # Calculate the lexical and grammatical complexity
    num_named_entities = len(doc.ents)
    num_dependent_clauses = len([token for token in doc if token.dep_ == "mark" or token.dep_ == "advcl"])
    num_compound_nouns = len([token for token in doc if token.pos_ == "NOUN" and token.dep_ == "compound"])
    num_proper_nouns = len([token for token in doc if token.pos_ == "PROPN"])
    num_adjectives = len([token for token in doc if token.pos_ == "ADJ"])
    num_adverbs = len([token for token in doc if token.pos_ == "ADV"])
    num_prepositions = len([token for token in doc if token.pos_ == "ADP"])
    num_conjunctions = len([token for token in doc if token.pos_ == "CCONJ"])
    num_pronouns = len([token for token in doc if token.pos_ == "PRON"])
    lexical_complexity = (num_named_entities + num_dependent_clauses + num_compound_nouns + num_proper_nouns + num_adjectives + num_adverbs) / num_words
    grammatical_complexity = (num_prepositions + num_conjunctions + num_pronouns) / num_words
    
    return (coherence,lexical_complexity,grammatical_complexity)

import textstat

def text_complexity(text):
    score=textstat.flesch_reading_ease(text)
    if(score>=0 and score<=29):
        #print("Very Confusing")
        return ([score,"Very Confusing"])
    elif(score>=30 and score<=49):
        #print("Difficult")
        return ([score,"Difficult"])
    elif(score>=50 and score<=59):
        #print("Fairly Difficult")
        return ([score,"Fairly Difficult"])
    elif(score>=60 and score<=69):
        #print("Standard")
        return ([score,"Standard"])
    elif(score>=70 and score<=79):
        #print("Fairly Easy")
        return ([score,"Fairly Easy"])
    elif(score>=80 and score<=89):
        #print("Easy")
        return ([score,"Easy"])
    elif(score>=90 and score<=100):
        #print("Very Easy")
        return ([score,"Very Easy"])
    return ([0,'Very Confusing'])


def spelling_grammar_mistakes(text):
    spelling_counts=0
    grammar_counts=0
    matches = tool.check(str(text))
    
    for i in range(len(matches)):
        #print('matches',i,matches[i])
        if(matches[i].ruleId=='MORFOLOGIK_RULE_EN_US'):
            """
            cntxt = list(matches[i].context)
            cntxt[matches[i].offset]='>'
            s = ''
            for v in cntxt:
                s+=v
            print(s)
            """
            spelling_counts+=1
        else:
            grammar_counts+=1
            
    return {'spelling_errors':spelling_counts,'grammar_errors':grammar_counts}

In [23]:


def extract_answer_features(input_answer):
    input_answer = input_answer.lower()
    tokenized_sentences = sent_tokenize(input_answer.replace(".",". "))
    #print(tokenized_sentences)
    num_sentences = len(tokenized_sentences)
    senten_lens = [ len(cur_sent.split(' ')) for cur_sent in tokenized_sentences ]
    #print(senten_lens)
    res = spelling_grammar_mistakes(input_answer)
    spelling_errors = float(res['spelling_errors']) / float(num_sentences)
    grammar_errors = float(res['grammar_errors']) / float(num_sentences)
    #input_answer_FTBSL = spell_correction_tb(input_answer)
    input_answer_FTBSL = speller(input_answer)
    #print(input_answer_FTBSL)
    answer_ts_complexity = text_complexity(input_answer_FTBSL)
    answer_ts_difficwords = textstat.difficult_words(input_answer_FTBSL)
    coherence,lexical_complexity,grammatical_complexity = get_coherence_complexities(input_answer_FTBSL)
    answer_len = len(input_answer_FTBSL.split(' '))
    feat_dict = dict()
    feat_dict['coherence']=coherence
    feat_dict['syl_complexity']=answer_ts_complexity[0]
    feat_dict['lex_complexity']=lexical_complexity
    feat_dict['gramm_complexity']=grammatical_complexity
    feat_dict['diffic_words']=answer_ts_difficwords / float(num_sentences)
    feat_dict['spelling_errors']=spelling_errors / float(num_sentences)
    feat_dict['grammar_errors']=grammar_errors / float(num_sentences)
    feat_dict['num_sentences']=num_sentences
    feat_dict['answer_len']=answer_len
    feat_dict['senten_len_max']=max(senten_lens)
    feat_dict['senten_len_min']=min(senten_lens)
    feat_dict['senten_len_avg']=np.mean(senten_lens)
    
    return feat_dict


def extract_answer_features2(input_answer,prompt_question,prompt_text):
    input_answer = input_answer.lower()
    tokenized_sentences = sent_tokenize(input_answer.replace(".",". ").replace("  ","."))
    #print(tokenized_sentences)
    num_sentences = len(tokenized_sentences)
    senten_lens = [ len(cur_sent.split(' ')) for cur_sent in tokenized_sentences ]
    #print(senten_lens)
    res = spelling_grammar_mistakes(input_answer)
    spelling_errors = float(res['spelling_errors']) / float(num_sentences)
    grammar_errors = float(res['grammar_errors']) / float(num_sentences)
    #input_answer_FTBSL = spell_correction_tb(input_answer)
    input_answer_FTBSL = speller(input_answer)
    #print(input_answer_FTBSL)
    answer_ts_complexity = text_complexity(input_answer_FTBSL)
    answer_ts_difficwords = textstat.difficult_words(input_answer_FTBSL)
    coherence,lexical_complexity,grammatical_complexity = get_coherence_complexities(input_answer_FTBSL)
    answer_len = len(input_answer_FTBSL.split(' '))
    
    ans_words_list = word_tokenize(input_answer_FTBSL.replace(".","").replace(",","").replace("  ",""))
    prompt_words_list = word_tokenize(prompt_text.replace(".","").replace(",","").replace("  ",""))
    ans_words_set = set(ans_words_list)
    prompt_words_set = set(prompt_words_list)
    
    self_novelty = len(ans_words_set)/len(ans_words_list)
    novelty = len(ans_words_set.difference(prompt_words_set))
    
    rg1 = -10
    rg2 = -10
    rgL = -10
    try:
        rg1,rg2,rgL = get_rogue_f1s(prompt_text,input_answer_FTBSL)
    except:
        pass
    
    try:
        self_bleu_score = self_bleu(input_answer_FTBSL)
    except:
        self_bleu_score = -10
        
    rg1S = -10
    rg2S = -10
    rgLS = -10
    try:
        rg1S,rg2S,rgLS = get_avg_self_rogues(input_answer_FTBSL)
    except:
        pass
    
    len_ratio = len(prompt_text)/len(input_answer)
    
    feat_dict = dict()
    feat_dict['coherence']=coherence
    feat_dict['syl_complexity']=answer_ts_complexity[0]
    feat_dict['lex_complexity']=lexical_complexity
    feat_dict['gramm_complexity']=grammatical_complexity
    feat_dict['diffic_words']=answer_ts_difficwords / float(num_sentences)
    feat_dict['spelling_errors']=spelling_errors / float(num_sentences)
    feat_dict['grammar_errors']=grammar_errors / float(num_sentences)
    feat_dict['num_sentences']=num_sentences
    feat_dict['answer_len']=answer_len
    feat_dict['senten_len_max']=max(senten_lens)
    feat_dict['senten_len_min']=min(senten_lens)
    feat_dict['senten_len_avg']=np.mean(senten_lens)
    feat_dict['senten_len_std']=np.std(senten_lens)
    
    feat_dict['self_novelty']=self_novelty
    feat_dict['novelty']=novelty
    feat_dict['rg1']=rg1
    feat_dict['rg2']=rg2
    feat_dict['rgL']=rgL
    feat_dict['rg1S']=rg1S
    feat_dict['rg2S']=rg2S
    feat_dict['rgLS']=rgLS
    feat_dict['self_bleu']=self_bleu_score
    feat_dict['len_ratio']=len_ratio
    
    return feat_dict


# 'self_bleu','self_novelty','novelty','rg1','rgL','len_ratio'

In [24]:
#df_train_s = pd.read_csv('/kaggle/input/summaries-nlp-features/summaries_nlp_feats.csv')
df_train_s = pd.read_csv('/kaggle/input/summaries-nlp-features-v2/summaries_nlp_feats3.csv')

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

In [26]:
import lightgbm as lgb

df = df_train_s


X = df[['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',
        'spelling_errors', 'grammar_errors', 'num_sentences', 'answer_len',
        'senten_len_max', 'senten_len_min', 'senten_len_avg']]

"""
X = df[['syl_complexity', 'lex_complexity', 'gramm_complexity','spelling_errors', 'grammar_errors']]
"""

#X = df[['syl_complexity', 'lex_complexity', 'gramm_complexity', 'grammar_errors', 'answer_len','senten_len_avg']]


y = df['wording']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_test, label=y_test)

params = {
    'boosting_type': 'gbdt',
    'random_state': 42,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05, #0.06
    'max_depth': 5,  #5
    'lambda_l1': 0.0,
    'lambda_l2': 0.005 #0.006
}

evaluation_results = {}
model_wording = lgb.train(params,
                  num_boost_round=10000,
                    #categorical_feature = categorical_features,
                  valid_names=['train', 'valid'],
                  train_set=dtrain,
                  valid_sets=dval,
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=40, verbose=False),
                       lgb.log_evaluation(100),
                      lgb.callback.record_evaluation(evaluation_results)
                    ],
                  )

# Predictions
y_pred = model_wording.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(i,f"Mean Squared Error: {mse:.3f}")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2119
[LightGBM] [Info] Number of data points in the train set: 6806, number of used features: 12
[LightGBM] [Info] Start training from score -0.061884
[100]	train's rmse: 0.762775
[200]	train's rmse: 0.756996
3 Mean Squared Error: 0.569


In [27]:
df = df_train_s
X = df[['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',
        'spelling_errors', 'grammar_errors', 'num_sentences',
        'senten_len_max', 'senten_len_min', 'senten_len_avg']]
y = df['content']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_test, label=y_test)

params = {
    'boosting_type': 'gbdt',
    'random_state': 42,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05, #0.048 0.06
    'max_depth': 3,  #4
    'lambda_l1': 0.0,
    'lambda_l2': 0.01 #0.011
}

evaluation_results = {}
model_content = lgb.train(params,
                  num_boost_round=10000,
                    #categorical_feature = categorical_features,
                  valid_names=['train', 'valid'],
                  train_set=dtrain,
                  valid_sets=dval,
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=40, verbose=False),
                       lgb.log_evaluation(100),
                      lgb.callback.record_evaluation(evaluation_results)
                    ],
                  )

# Predictions
y_pred = model_content.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(i,f"Mean Squared Error: {mse:.3f}")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1865
[LightGBM] [Info] Number of data points in the train set: 6806, number of used features: 11
[LightGBM] [Info] Start training from score -0.011564
[100]	train's rmse: 0.543438
[200]	train's rmse: 0.535647
3 Mean Squared Error: 0.285


In [28]:
prompts_test

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,prompt_length,prompt_tokens
0,abc123,Summarize...,Example Title 1,Heading\nText...,3,"[Heading, Text, ...]"
1,def789,Summarize...,Example Title 2,Heading\nText...,3,"[Heading, Text, ...]"


In [29]:
prompt_dict = dict()

for i in range(len(prompts_test)):
    pid = prompts_test['prompt_id'].iloc[i]
    pttl = prompts_test['prompt_question'].iloc[i]
    ptxt = prompts_test['prompt_text'].iloc[i]
    prompt_dict[pid] = [pttl,ptxt]

In [30]:
list(prompt_dict.keys())

['abc123', 'def789']

In [31]:
sample_submission = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")
test_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")

ss = 'He is goeng home. He will riturn soon.'
ss2 = 'He is goeng home. He will not riturn soon.'
feat_dict = extract_answer_features2(ss,ss2,ss2)

feat_dict_data = { k:[] for k in list(feat_dict.keys())  }
for i in tqdm(range( len(test_df) )) :
    text = test_df['text'].iloc[i]
    #feat_dict = extract_answer_features(text)
    pid = test_df['prompt_id'].iloc[i]
    prompt_question = prompt_dict[pid][0]
    prompt_text = prompt_dict[pid][1]
    feat_dict = extract_answer_features2(text,prompt_question,prompt_text)
    for k in list(feat_dict.keys()):
        feat_dict_data[k].append( feat_dict[k] )
        
for k in list(feat_dict_data.keys()):
    test_df[k] = feat_dict_data[k]

100%|██████████| 4/4 [00:00<00:00, 12.00it/s]


In [32]:
content_preds = []
for i in tqdm(range(len(test_df))):
    feats = []
    for k in ['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',
        'spelling_errors', 'grammar_errors', 'num_sentences',
        'senten_len_max', 'senten_len_min', 'senten_len_avg']:
        feats.append( float(test_df[k].iloc[i])  )
    pred = float(model_content.predict( np.array([feats]) )[0])
    content_preds.append(pred)

100%|██████████| 4/4 [00:00<00:00, 1457.24it/s]


In [33]:
wording_preds = []
for i in tqdm(range(len(test_df))):
    feats = []
    for k in ['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',
        'spelling_errors', 'grammar_errors', 'num_sentences', 'answer_len',
        'senten_len_max', 'senten_len_min', 'senten_len_avg']:
        feats.append( float(test_df[k].iloc[i])  )
    pred = float(model_wording.predict( np.array([feats]) )[0])
    wording_preds.append(pred)

100%|██████████| 4/4 [00:00<00:00, 1815.72it/s]


In [34]:
test_df['content_preds'] = content_preds
test_df['wording_preds'] = wording_preds
sample_submission['content'] = test_df['content_preds'].values
sample_submission['wording'] = test_df['wording_preds'].values

In [35]:
sample_submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.497897,-0.812235
1,111111eeeeee,-1.497897,-0.812235
2,222222cccccc,-1.497897,-0.812235
3,333333dddddd,-1.497897,-0.812235


In [36]:
!rm -r {lt_path}

In [37]:
try:
    shutil.rmtree('/kaggle/working/kaggle/input/languagetool57imroze/LanguageTool-5.7')
    shutil.rmtree('/kaggle/working/kaggle/input')
except:
    print('exception 1')

In [38]:
try:
    shutil.rmtree('/kaggle/working/rouge_score-0.1.2')
except:
    print('exception 2')

In [39]:
try:
    shutil.rmtree('/kaggle/working/rouge_score-0.1.2')
except:
    print('exception 3')

exception 3


In [40]:
try:
    shutil.rmtree('/kaggle/working/debertav3base')
except:
    print('exception 4')

# LGB MODEL

In [41]:
train.head(3)

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,...,prompt_length,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,fold,content_pred,wording_pred
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,...,660,14,4,0.063492,0,0.0,0,3.0,-0.03002,0.686377
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,...,1076,18,22,0.415094,10,0.192308,0,2.0,-0.722237,-0.250455
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,...,625,22,52,0.19403,23,0.086142,2,1.0,2.43931,1.904997


In [42]:
test.head(3)

Unnamed: 0,student_id,prompt_id,text,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,...,content_pred_0,content_pred_1,content_pred_2,content_pred_3,content,wording_pred_0,wording_pred_1,wording_pred_2,wording_pred_3,wording
0,000000ffffff,abc123,Example text 1,3,Example text 1,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.521437,-1.609047,-1.402371,-1.300265,-1.45828,-1.459463,-1.425489,-1.339818,-1.385726,-1.402624
1,111111eeeeee,def789,Example text 2,3,Example text 2,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.523067,-1.611609,-1.403302,-1.294887,-1.458216,-1.468567,-1.424382,-1.352623,-1.389704,-1.408819
2,222222cccccc,abc123,Example text 3,3,Example text 3,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.541403,-1.616524,-1.407259,-1.313284,-1.469617,-1.454117,-1.441295,-1.352539,-1.390533,-1.409621


In [43]:
df_train_s.head(3)

Unnamed: 0,student_id,prompt_id,text,content,wording,coherence,syl_complexity,lex_complexity,gramm_complexity,diffic_words,...,self_novelty,novelty,rg1,rg2,rgL,rg1S,rg2S,rgLS,self_bleu,len_ratio
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,0.5,18.02,0.28125,0.15625,11.0,...,0.803279,26,0.114804,0.024242,0.072508,0.19758,0.0,0.163741,0.459545,10.306358
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,0.0,61.0,0.145455,0.290909,3.0,...,0.673077,2,0.095238,0.044791,0.073858,0.307692,0.08,0.192308,0.264532,21.04918
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,0.166667,0.0,0.152727,0.210909,38.0,...,0.5875,74,0.407547,0.100883,0.206289,0.157057,0.005565,0.122904,0.644615,2.441606


In [44]:
test_df.head(3)

Unnamed: 0,student_id,prompt_id,text,coherence,syl_complexity,lex_complexity,gramm_complexity,diffic_words,spelling_errors,grammar_errors,...,rg1,rg2,rgL,rg1S,rg2S,rgLS,self_bleu,len_ratio,content_preds,wording_preds
0,000000ffffff,abc123,Example text 1,0.0,0,0.333333,0.0,1.0,0.0,1.0,...,0.4,0.0,0.4,,,,-10,1.071429,-1.497897,-0.812235
1,111111eeeeee,def789,Example text 2,0.0,0,0.333333,0.0,1.0,0.0,1.0,...,0.4,0.0,0.4,,,,-10,1.071429,-1.497897,-0.812235
2,222222cccccc,abc123,Example text 3,0.0,0,0.333333,0.0,1.0,0.0,1.0,...,0.4,0.0,0.4,,,,-10,1.071429,-1.497897,-0.812235


In [45]:
#cols2use = ['coherence', 'syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words', 'spelling_errors', 'grammar_errors', 'num_sentences', 'answer_len', 'senten_len_max', 'senten_len_min', 'senten_len_avg']

In [46]:
#cols2use = ['syl_complexity', 'lex_complexity', 'gramm_complexity', 'grammar_errors','diffic_words', 'spelling_errors']

In [47]:
#cols2use = ['syl_complexity', 'lex_complexity', 'gramm_complexity', 'grammar_errors','diffic_words', 'spelling_errors', 'self_bleu','self_novelty','novelty','rg1','rgL']

In [48]:
cols2use = ['syl_complexity', 'lex_complexity', 'gramm_complexity', 'grammar_errors','diffic_words', 'spelling_errors', 'self_bleu','self_novelty']

In [49]:
#,'len_ratio'

In [50]:
for colname in cols2use:
    train[colname] = df_train_s[colname].values
    test[colname] = test_df[colname].values

In [51]:
"""
content_preds = []
for i in tqdm(range(len(train))):
    feats = []
    for k in ['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',
        'spelling_errors', 'grammar_errors', 'num_sentences',
        'senten_len_max', 'senten_len_min', 'senten_len_avg']:
        feats.append( float(train[k].iloc[i])  )
    pred = float(model_content.predict( np.array([feats]) )[0])
    content_preds.append(pred)
    
    
wording_preds = []
for i in tqdm(range(len(train))):
    feats = []
    for k in ['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',
        'spelling_errors', 'grammar_errors', 'num_sentences', 'answer_len',
        'senten_len_max', 'senten_len_min', 'senten_len_avg']:
        feats.append( float(train[k].iloc[i])  )
    pred = float(model_wording.predict( np.array([feats]) )[0])
    wording_preds.append(pred)
    
    
train['content_preds2'] = content_preds
train['wording_preds2'] = wording_preds


content_preds = []
for i in tqdm(range(len(test))):
    feats = []
    for k in ['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',
        'spelling_errors', 'grammar_errors', 'num_sentences',
        'senten_len_max', 'senten_len_min', 'senten_len_avg']:
        feats.append( float(test[k].iloc[i])  )
    pred = float(model_content.predict( np.array([feats]) )[0])
    content_preds.append(pred)
    
    
wording_preds = []
for i in tqdm(range(len(test))):
    feats = []
    for k in ['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',
        'spelling_errors', 'grammar_errors', 'num_sentences', 'answer_len',
        'senten_len_max', 'senten_len_min', 'senten_len_avg']:
        feats.append( float(test[k].iloc[i])  )
    pred = float(model_wording.predict( np.array([feats]) )[0])
    wording_preds.append(pred)
    
    
test['content_preds2'] = content_preds
test['wording_preds2'] = wording_preds
"""

"\ncontent_preds = []\nfor i in tqdm(range(len(train))):\n    feats = []\n    for k in ['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',\n        'spelling_errors', 'grammar_errors', 'num_sentences',\n        'senten_len_max', 'senten_len_min', 'senten_len_avg']:\n        feats.append( float(train[k].iloc[i])  )\n    pred = float(model_content.predict( np.array([feats]) )[0])\n    content_preds.append(pred)\n    \n    \nwording_preds = []\nfor i in tqdm(range(len(train))):\n    feats = []\n    for k in ['coherence','syl_complexity', 'lex_complexity', 'gramm_complexity', 'diffic_words',\n        'spelling_errors', 'grammar_errors', 'num_sentences', 'answer_len',\n        'senten_len_max', 'senten_len_min', 'senten_len_avg']:\n        feats.append( float(train[k].iloc[i])  )\n    pred = float(model_wording.predict( np.array([feats]) )[0])\n    wording_preds.append(pred)\n    \n    \ntrain['content_preds2'] = content_preds\ntrain['wording_preds2'] = word

In [52]:
#train['wording2'] = df_train_s['wording'].values

In [53]:
test

Unnamed: 0,student_id,prompt_id,text,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,...,wording_pred_3,wording,syl_complexity,lex_complexity,gramm_complexity,grammar_errors,diffic_words,spelling_errors,self_bleu,self_novelty
0,000000ffffff,abc123,Example text 1,3,Example text 1,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.385726,-1.402624,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
1,111111eeeeee,def789,Example text 2,3,Example text 2,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.389704,-1.408819,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
2,222222cccccc,abc123,Example text 3,3,Example text 3,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.390533,-1.409621,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
3,333333dddddd,def789,Example text 4,3,Example text 4,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.403584,-1.419664,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0


In [54]:
targets = ["content", "wording"]

drop_columns = ["fold", "student_id", "prompt_id", "text", "fixed_summary_text",
                "prompt_question", "prompt_title", 
                "prompt_text"
               ] + targets

In [55]:
model_dict = {}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):

        X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train[train["fold"] != fold][target]

        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        params = {
            'boosting_type': 'gbdt',
            'random_state': 42,
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.048, #0.048
            'max_depth': 5,  #5
            'lambda_l1': 0.0,
            'lambda_l2': 0.011 #0.011
        }

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                               lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        models.append(model)
    
    model_dict[target] = models

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3285
[LightGBM] [Info] Number of data points in the train set: 5108, number of used features: 19
[LightGBM] [Info] Start training from score 0.017606
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[45]	train's rmse: 0.413402
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3172
[LightGBM] [Info] Number of data points in the train set: 5156, number of used features: 19
[LightGBM] [Info] Start training from score -0.039959
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.476346
Early stopping, best iteration is:
[124]	train's rmse: 0.474646
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3201
[LightGBM] [Info] Number of data points in the train set: 5169, number of used features: 19

# CV Score

In [56]:
model_dict

{'content': [<lightgbm.basic.Booster at 0x7d6a7dc39f00>,
  <lightgbm.basic.Booster at 0x7d6a7dc38a60>,
  <lightgbm.basic.Booster at 0x7d6a7dc3a860>,
  <lightgbm.basic.Booster at 0x7d6a7dc3a6e0>],
 'wording': [<lightgbm.basic.Booster at 0x7d6a7dc39ae0>,
  <lightgbm.basic.Booster at 0x7d6a7dc3a740>,
  <lightgbm.basic.Booster at 0x7d6a7dc38430>,
  <lightgbm.basic.Booster at 0x7d6a7dc38550>]}

In [57]:
train.head(3)

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,...,content_pred,wording_pred,syl_complexity,lex_complexity,gramm_complexity,grammar_errors,diffic_words,spelling_errors,self_bleu,self_novelty
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,...,-0.03002,0.686377,18.02,0.28125,0.15625,4.0,11.0,3.0,0.459545,0.803279
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,...,-0.722237,-0.250455,61.0,0.145455,0.290909,4.0,3.0,0.0,0.264532,0.673077
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...",32,"In complete sentences, summarize the structure...",Egyptian Social Structure,...,2.43931,1.904997,0.0,0.152727,0.210909,13.0,38.0,5.0,0.644615,0.5875


In [58]:
test.head(3)

Unnamed: 0,student_id,prompt_id,text,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,...,wording_pred_3,wording,syl_complexity,lex_complexity,gramm_complexity,grammar_errors,diffic_words,spelling_errors,self_bleu,self_novelty
0,000000ffffff,abc123,Example text 1,3,Example text 1,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.385726,-1.402624,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
1,111111eeeeee,def789,Example text 2,3,Example text 2,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.389704,-1.408819,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
2,222222cccccc,abc123,Example text 3,3,Example text 3,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.390533,-1.409621,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0


In [59]:
# cv
rmses = []

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.45061900053973186
wording_rmse : 0.5746319145023806
mcrmse : 0.5126254575210563


In [60]:
"""
content_rmse : 0.44722283732964696
wording_rmse : 0.5685644558649501
mcrmse : 0.5078936465972985
"""

'\ncontent_rmse : 0.44722283732964696\nwording_rmse : 0.5685644558649501\nmcrmse : 0.5078936465972985\n'

# Predict

In [61]:
drop_columns = [
                #"fold", 
                "student_id", "prompt_id", "text", "fixed_summary_text",
                "prompt_question", "prompt_title", 
                "prompt_text",
                "input"
               ] + [
                f"content_pred_{i}" for i in range(CFG.n_splits)
                ] + [
                f"wording_pred_{i}" for i in range(CFG.n_splits)
                ]

In [62]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns)

        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict[target] = preds

In [63]:
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)


















# Create a submission file

In [64]:
test

Unnamed: 0,student_id,prompt_id,text,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,...,wording_pred_3,wording,syl_complexity,lex_complexity,gramm_complexity,grammar_errors,diffic_words,spelling_errors,self_bleu,self_novelty
0,000000ffffff,abc123,Example text 1,3,Example text 1,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.30346,-1.274935,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
1,111111eeeeee,def789,Example text 2,3,Example text 2,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.30346,-1.274935,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
2,222222cccccc,abc123,Example text 3,3,Example text 3,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.30346,-1.274935,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
3,333333dddddd,def789,Example text 4,3,Example text 4,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.30346,-1.274935,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0


In [65]:
sample_submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.497897,-0.812235
1,111111eeeeee,-1.497897,-0.812235
2,222222cccccc,-1.497897,-0.812235
3,333333dddddd,-1.497897,-0.812235


In [66]:
"""
for i in range(len(test)):
    test['wording'].iloc[i] = sample_submission['wording'].iloc[i]
"""

"\nfor i in range(len(test)):\n    test['wording'].iloc[i] = sample_submission['wording'].iloc[i]\n"

In [67]:
test

Unnamed: 0,student_id,prompt_id,text,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,...,wording_pred_3,wording,syl_complexity,lex_complexity,gramm_complexity,grammar_errors,diffic_words,spelling_errors,self_bleu,self_novelty
0,000000ffffff,abc123,Example text 1,3,Example text 1,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.30346,-1.274935,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
1,111111eeeeee,def789,Example text 2,3,Example text 2,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.30346,-1.274935,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
2,222222cccccc,abc123,Example text 3,3,Example text 3,0,Summarize...,Example Title 1,Heading\nText...,3,...,-1.30346,-1.274935,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0
3,333333dddddd,def789,Example text 4,3,Example text 4,0,Summarize...,Example Title 2,Heading\nText...,3,...,-1.30346,-1.274935,0,0.333333,0.0,1.0,1.0,0.0,-10,1.0


In [68]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False) 
display(pd.read_csv('submission.csv'))

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.418118,-1.274935
1,111111eeeeee,-1.418118,-1.274935
2,222222cccccc,-1.418118,-1.274935
3,333333dddddd,-1.418118,-1.274935
