In [1]:
import random
import os
import warnings
warnings.filterwarnings('ignore')
from dataclasses import dataclass

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AddedToken,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

from datasets import Dataset, DatasetDict

# Seed every random number generator the notebook relies on so that repeated
# runs are comparable.
seed = 42

random.seed(seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects subprocesses — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Also seed every visible GPU, not only the current device.
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
# cuDNN benchmark mode auto-tunes kernel selection and may pick different
# kernels between runs, which works against the deterministic flag above,
# so it is switched off.
torch.backends.cudnn.benchmark = False

2024-06-09 00:50:39.147122: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-09 00:50:39.147244: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-09 00:50:39.246155: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
@dataclass
class Config:
    """Central place for every tunable value used by the notebook.

    None of the attributes carry a type annotation, so they are plain class
    attributes shared by all instances rather than dataclass fields.
    """

    # Stage switches: which of the training / inference cells actually run.
    i_want_to_train = True
    i_want_to_test = True

    # Candidate backbones; `_c` selects one of them by position.
    _checkpoints = [
        'google-bert/bert-base-uncased',  # 0
        'google-bert/bert-base-cased',    # 1
        'microsoft/deberta-v3-large',     # 2
        'microsoft/deberta-v3-base',      # 3
        'microsoft/deberta-v3-small',     # 4
        'microsoft/deberta-v3-xsmall',    # 5
    ]
    _c = 3

    # Local: weights come from an attached input directory whose folder name
    # is the hub id with '/' replaced by '__'.
    checkpoint = f"/kaggle/input/init-aes2/{_checkpoints[_c].replace('/', '__')}"

    # Online
    # checkpoint = _checkpoints[_c]

    # Prefix for the fine-tuned fold models (callers append 'fold_<n>').
    checkpoint_ = f"/kaggle/working/{_checkpoints[_c].split('/')[1]}"

    # Input CSV files.
    train_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
    test_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'

    # Tokenisation and cross-validation.
    max_length = 1024
    n_splits = 5

    # Optimisation hyper-parameters.
    num_train_epochs = 3
    learning_rate = 1e-5
    warmup_ratio = 0.0
    per_device_train_batch_size = 4
    per_device_eval_batch_size = 4
    weight_decay = 0.01
    lr_scheduler_type = 'linear'
    strategy = 'epoch'  # 'steps'
    logging_steps = 100


cfg = Config()

In [3]:
import re

def removeHTML(text):
    """Return ``text`` with every ``<...>`` span replaced by a single space.

    The match is non-greedy and does not cross line breaks, because ``.``
    does not match a newline without the DOTALL flag.
    """
    return re.sub(r'<.*?>', r' ', text)

# Clean up the text
def text_cleaning(text):
    """Normalise punctuation runs, doubled quotes and non-breaking spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Collapse runs of periods / commas down to a single occurrence.
    text = re.sub(r"\.+", ".", text)
    text = re.sub(r"\,+", ",", text)
    # Two consecutive single quotes stand in for one double quote.
    text = text.replace("\'\'", '"')
    # The four-character escape text `\xa0` (backslash, x, a, 0) ...
    text = text.replace("\\xa0", ' ')
    # ... and the real non-breaking space character U+00A0, which the
    # replacement above never matches.
    text = text.replace("\xa0", ' ')
    text = text.strip()
    return text

# Entry point that runs the full text-processing pipeline
def preprocessor(text):
    """Strip tag-like spans from ``text`` and then apply ``text_cleaning``."""
    return text_cleaning(removeHTML(text))

In [4]:
def data_preprocessing(path, tokenizer, max_length):
    """Load the labelled CSV at ``path`` and turn it into a tokenised dataset.

    The ``score`` column is shifted down by one and stored as a float32
    ``label`` column, ``full_text`` is passed through ``preprocessor`` and then
    tokenised with truncation at ``max_length``.

    Args:
        path: Location of a CSV with ``essay_id``, ``full_text`` and ``score``.
        tokenizer: Callable tokenizer applied to batches of ``full_text``.
        max_length: Truncation limit handed to the tokenizer.

    Returns:
        The tokenised ``Dataset`` without the ``essay_id``, ``full_text`` and
        ``score`` columns.
    """
    frame = pd.read_csv(path)
    frame['label'] = (frame['score'] - 1).astype("float32")
    frame['full_text'] = frame['full_text'].apply(preprocessor)

    def _tokenize(batch):
        # Called by `map` with a batch of rows because batched=True.
        return tokenizer(batch['full_text'], truncation = True, max_length = max_length)

    encoded = Dataset.from_pandas(frame).map(_tokenize, batched = True)

    # Drop the raw columns that are no longer needed after tokenisation.
    return encoded.remove_columns(['essay_id', 'full_text', 'score'])

def data_preprocessing_test(path, tokenizer, max_length):
    """Load the unlabelled CSV at ``path`` and tokenise it for inference.

    Args:
        path: Location of a CSV with ``essay_id`` and ``full_text`` columns.
        tokenizer: Callable tokenizer applied to batches of ``full_text``.
        max_length: Truncation limit handed to the tokenizer.

    Returns:
        A ``(dataset, data)`` tuple: the tokenised ``Dataset`` without the
        ``essay_id`` and ``full_text`` columns, and the pandas frame it was
        built from (with ``full_text`` already cleaned).
    """
    frame = pd.read_csv(path)
    frame['full_text'] = frame['full_text'].apply(preprocessor)

    def _tokenize(batch):
        # Called by `map` with a batch of rows because batched=True.
        return tokenizer(batch['full_text'], truncation = True, max_length = max_length)

    encoded = Dataset.from_pandas(frame).map(_tokenize, batched = True)

    # Drop the raw columns that are no longer needed after tokenisation.
    encoded = encoded.remove_columns(['essay_id', 'full_text'])
    return encoded, frame

def evaluate_fn(eval_pred):
    """Compute the quadratic-weighted kappa for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``.

    Returns:
        A dict with the single key ``"quad_kappa"``.
    """
    # Clamp the raw outputs to the 0..5 label range and snap to whole numbers
    # before comparing them with the labels.
    rounded = eval_pred.predictions.clip(0, 5).round(0)
    kappa = cohen_kappa_score(y1=eval_pred.label_ids, y2=rounded, weights="quadratic")
    return {"quad_kappa": kappa}


In [5]:
# Shared tokenizer for the training run. Newlines and double spaces are
# registered as extra tokens (un-normalised), in that order.
tokenizer = AutoTokenizer.from_pretrained(cfg.checkpoint)
tokenizer.add_tokens([
    AddedToken('\n', normalized = False),
    AddedToken(" "*2, normalized = False),
])
# Pads each batch using the tokenizer above.
data_collator = DataCollatorWithPadding(tokenizer)

def train():
    """Fine-tune one single-output model per stratified fold and save each one.

    The training CSV is tokenised once, split into ``cfg.n_splits`` stratified
    folds on the label column, and a fresh model is loaded from
    ``cfg.checkpoint`` for every fold. After training, the model and the
    tokenizer are written to ``cfg.checkpoint_ + 'fold_<n>'``.

    Relies on the module-level ``cfg``, ``tokenizer`` and ``data_collator``.
    """
    import gc  # local import: only needed for the per-fold cleanup below

    dataset = data_preprocessing(cfg.train_path, tokenizer, cfg.max_length)
    kfold = StratifiedKFold(n_splits=cfg.n_splits, shuffle=True, random_state=42)
    # Only the labels matter for stratification, so a dummy X is passed.
    splits = kfold.split(np.zeros(dataset.num_rows), dataset['label'])

    for fold, (train_idx, test_idx) in enumerate(splits):
        train_dataset = dataset.select(train_idx)
        eval_dataset = dataset.select(test_idx)
        model = AutoModelForSequenceClassification.from_pretrained(cfg.checkpoint,num_labels=1,
                                            hidden_dropout_prob = 0,
                                            attention_probs_dropout_prob = 0,
                                            cache_dir='./cache')

        # Extra tokens were added to the tokenizer. If their ids would fall
        # outside the embedding matrix, grow it; otherwise leave the
        # pretrained weights untouched.
        if len(tokenizer) > model.get_input_embeddings().num_embeddings:
            model.resize_token_embeddings(len(tokenizer))

        training_args = TrainingArguments(
            # Shared by all folds; later folds reuse the same checkpoint dir.
            output_dir = 'results',
            learning_rate = cfg.learning_rate,
            warmup_ratio = cfg.warmup_ratio,
            num_train_epochs = cfg.num_train_epochs,
            per_device_train_batch_size = cfg.per_device_train_batch_size,
            per_device_eval_batch_size = cfg.per_device_eval_batch_size,
            fp16 = True,
            lr_scheduler_type = cfg.lr_scheduler_type,
            weight_decay = cfg.weight_decay,
            # f-string so that every fold really gets its own log directory.
            logging_dir = f'logs/fold_{fold}',
            logging_steps = cfg.logging_steps,
            eval_strategy = cfg.strategy,
            save_strategy = cfg.strategy,
            # NOTE(review): no metric_for_best_model is set, so the "best"
            # checkpoint is chosen by the Trainer's default criterion,
            # not by quad_kappa — confirm this is intended.
            load_best_model_at_end = True,
            report_to = 'none',
            optim='adamw_torch'
        )

        trainer = Trainer(
            model = model,
            args = training_args,
            train_dataset = train_dataset,
            eval_dataset = eval_dataset,
            tokenizer = tokenizer,
            data_collator = data_collator,
            compute_metrics = evaluate_fn,
        )
        print("Training fold ", fold, "...")
        trainer.train()
        trainer.save_model(cfg.checkpoint_ + f'fold_{fold}')
        tokenizer.save_pretrained(cfg.checkpoint_ + f'fold_{fold}')

        # Release this fold's model and optimizer state before the next fold
        # loads a fresh model, so memory use does not grow fold over fold.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()
        

In [6]:
def test():
    """Predict the test set with every fold model and write ``submission.csv``.

    For each of the ``cfg.n_splits`` folds, the model and tokenizer saved under
    ``cfg.checkpoint_ + 'fold_<n>'`` are loaded and run on the test CSV. The
    per-fold outputs are averaged, clipped to 0..5, rounded, and shifted up by
    one to undo the label offset applied during training.
    """
    import gc  # local import: only needed for the per-fold cleanup below

    # Explicit inference settings so the evaluation batch size follows the
    # config and no reporting integration is attached.
    inference_args = TrainingArguments(
        output_dir = 'results',
        per_device_eval_batch_size = cfg.per_device_eval_batch_size,
        report_to = 'none',
    )

    all_preds = []
    for n in range(cfg.n_splits):
        model = AutoModelForSequenceClassification.from_pretrained(cfg.checkpoint_ + f'fold_{n}', local_files_only = True)
        tokenizer = AutoTokenizer.from_pretrained(cfg.checkpoint_ + f'fold_{n}', local_files_only = True)
        data_collator = DataCollatorWithPadding(tokenizer)

        # Tokenised with this fold's own saved tokenizer.
        dataset, df = data_preprocessing_test(cfg.test_path, tokenizer, cfg.max_length)

        trainer = Trainer(
            model = model,
            args = inference_args,
            tokenizer = tokenizer,
            data_collator = data_collator,
        )

        fold_preds = trainer.predict(dataset).predictions
        all_preds.append(fold_preds)

        # Drop this fold's model before loading the next one.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()

    # Average across folds, then map back to the original score scale.
    preds = np.mean(all_preds, axis = 0).clip(0, 5).round(0).reshape(-1,) + 1

    submission = pd.DataFrame({
    'essay_id': df['essay_id'].values,
    'score': preds.astype('int')
    })

    display(submission)
    submission.to_csv('submission.csv', index = False)

In [7]:
# Run the k-fold fine-tuning only when it is switched on in the config.
if cfg.i_want_to_train:
    # Release cached CUDA memory before training starts.
    torch.cuda.empty_cache()
    train()

Map:   0%|          | 0/17307 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/init-aes2/microsoft__deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold  0 ...


Epoch,Training Loss,Validation Loss,Quad Kappa
1,0.3642,0.31913,0.77534
2,0.2746,0.292317,0.805738
3,0.2219,0.27324,0.833589


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/init-aes2/microsoft__deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold  1 ...


Epoch,Training Loss,Validation Loss,Quad Kappa
1,0.4086,0.372352,0.784291
2,0.2515,0.272572,0.833029
3,0.1959,0.265628,0.840505


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/init-aes2/microsoft__deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold  2 ...


Epoch,Training Loss,Validation Loss,Quad Kappa
1,0.3291,0.341107,0.784351
2,0.2964,0.313927,0.798267
3,0.2376,0.284492,0.823459


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/init-aes2/microsoft__deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold  3 ...


Epoch,Training Loss,Validation Loss,Quad Kappa
1,0.3321,0.359282,0.7759
2,0.2974,0.294791,0.819457
3,0.2112,0.283701,0.826127


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/init-aes2/microsoft__deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold  4 ...


Epoch,Training Loss,Validation Loss,Quad Kappa
1,0.3537,0.472789,0.70483
2,0.2906,0.288312,0.822021
3,0.1897,0.280221,0.829067


In [8]:
# Run fold-ensemble inference and write the submission file only when it is
# switched on in the config.
if cfg.i_want_to_test:
    test()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Unnamed: 0,essay_id,score
0,000d118,2
1,000fe60,3
2,001ab80,4
