In [70]:
import argparse
import gc
import getpass
import os
import random
import re
import shutil

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchmetrics
import transformers
import wandb
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import ElectraModel, ElectraTokenizer

# Seed every random number generator this notebook can touch so that a
# Restart & Run All reproduces the same results. NumPy is seeded as well:
# it is imported above and its global RNG is otherwise left unseeded.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)



In [71]:
class Dataset(torch.utils.data.Dataset):
    """Sentence-pair dataset backed by a pandas DataFrame.

    Each item joins the ``sentence_1`` and ``sentence_2`` columns of one row
    with the tokenizer's separator token and tokenizes the result, padded and
    truncated to ``max_length``.

    Args:
        inputs: DataFrame holding ``sentence_1`` and ``sentence_2`` columns,
            plus a ``label`` column when ``mode == 'train'``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., max_length=..., truncation=...)``
            that returns a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded/truncated to.
        mode: ``'train'`` adds ``labels`` to each item; any other value omits it.
    """

    def __init__(self, inputs, tokenizer, max_length, mode = 'train'):
        self.inputs = inputs
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_columns = ['sentence_1', 'sentence_2']
        self.mode = mode

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the tokenized pair at positional index ``idx`` as a dict of tensors."""
        row = self.inputs.iloc[idx]

        # Use the tokenizer's own separator rather than a hard-coded '[SEP]':
        # for vocabularies where '[SEP]' is not a special token the literal
        # would be split into ordinary sub-word pieces. Fall back to '[SEP]'
        # when the tokenizer does not define one.
        separator = getattr(self.tokenizer, 'sep_token', None) or '[SEP]'
        text = separator.join([row[col] for col in self.text_columns])

        encoded = self.tokenizer(text,
                                 padding='max_length',
                                 max_length=self.max_length,
                                 truncation=True)

        item = {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }
        # token_type_ids are deliberately not forwarded; MyModel.forward does not accept them.
        if self.mode == 'train':
            item['labels'] = row['label']
        return item
    

    


In [72]:
def compute_metrics(model_preds):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        model_preds: Two-element iterable ``(predictions, labels)``.

    Returns:
        ``{'pearson': <float>}``.
    """
    predictions, targets = model_preds
    as_float_vector = lambda values: torch.tensor(values, dtype = torch.float32).squeeze(-1)
    metric = torchmetrics.PearsonCorrCoef()
    score = metric(as_float_vector(predictions), as_float_vector(targets))
    return {'pearson' : score.item()}

In [73]:
def make_dataset(train_route = None, tokenizer = None, max_length = None, valid_route = None, test_route = None, k = None, mode = 'train'):
    """Build ``Dataset`` objects from CSV files for the requested mode.

    Args:
        train_route: ``'train'`` mode: path of the training CSV.
            ``'k_fold_train'`` mode: a path or an iterable of paths whose
            rows are concatenated before splitting.
        tokenizer: Tokenizer handed to every ``Dataset``.
        max_length: Padding/truncation length handed to every ``Dataset``.
        valid_route: Path of the validation CSV (``'train'`` mode only).
        test_route: Path of the test CSV (``'test'`` mode only).
        k: Number of folds (``'k_fold_train'`` mode only).
        mode: ``'train'``, ``'k_fold_train'`` or ``'test'``.

    Returns:
        ``'train'``: ``(train_dataset, valid_dataset)``.
        ``'k_fold_train'``: ``(train_list, valid_list)``, two lists of length ``k``.
        ``'test'``: a single unlabeled ``Dataset``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode == 'train':
        train_data = pd.read_csv(train_route)
        valid_data = pd.read_csv(valid_route)
        train_dataset = Dataset(train_data, tokenizer = tokenizer, max_length = max_length)
        valid_dataset = Dataset(valid_data, tokenizer = tokenizer, max_length = max_length)
        return train_dataset, valid_dataset

    if mode == 'k_fold_train':
        # Accept a single path as well as a list; iterating a bare string
        # would otherwise try to read one "file" per character.
        if isinstance(train_route, (str, os.PathLike)):
            routes = [train_route]
        else:
            routes = list(train_route)
        # Read everything first and concatenate once instead of growing a
        # DataFrame inside the loop. Rows are only ever accessed positionally,
        # so resetting the index is safe.
        df = pd.concat([pd.read_csv(r) for r in routes], ignore_index = True)

        kfold = KFold(n_splits = k, shuffle = True, random_state = 42)
        train_list = []
        valid_list = []
        for train_idx, val_idx in kfold.split(df):
            train_list.append(Dataset(df.iloc[train_idx], tokenizer = tokenizer, max_length = max_length))
            valid_list.append(Dataset(df.iloc[val_idx], tokenizer = tokenizer, max_length = max_length))
        return train_list, valid_list

    if mode == 'test':
        test_data = pd.read_csv(test_route)
        test_dataset = Dataset(test_data, tokenizer = tokenizer, max_length = max_length, mode = 'test')
        return test_dataset

    # Previously an unknown mode fell through and returned None, which only
    # surfaced later as an unrelated unpacking or attribute error.
    raise ValueError(f"Unknown mode {mode!r}; expected 'train', 'k_fold_train' or 'test'.")
        

In [74]:
class MyModel(nn.Module):
    """Transformer encoder followed by a two-stage 1-D CNN regression head.

    The encoder's last hidden states are convolved along the sequence axis,
    pooled down to a single 128-dimensional vector and projected to one
    scalar per example.

    Args:
        model_name: Hugging Face model identifier of the encoder.
    """

    def __init__(self, model_name):
        super(MyModel, self).__init__()
        if model_name == "snunlp/KR-ELECTRA-discriminator":
            print('found KR-ELECTRA')
            self.model = ElectraModel.from_pretrained("snunlp/KR-ELECTRA-discriminator")
        else:
            self.model = transformers.AutoModel.from_pretrained(
                model_name,
                trust_remote_code=True
            )

        # Size the first convolution from the encoder's configuration instead
        # of assuming 768, so backbones with a different hidden size work.
        # 768 stays as the fallback for configs that do not expose
        # `hidden_size`.
        hidden_size = getattr(self.model.config, 'hidden_size', 768)

        self.cnn_block1 = nn.Sequential(
            nn.Conv1d(in_channels=hidden_size, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            nn.MaxPool1d(kernel_size=2)
        )

        self.cnn_block2 = nn.Sequential(
            nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            nn.AdaptiveAvgPool1d(1)
        )

        self.output_layer = nn.Linear(128, 1)
        self.loss_fn = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels = None):
        """Run the encoder and CNN head.

        Args:
            input_ids: Token ids, shape (B, L).
            attention_mask: Attention mask, shape (B, L). It is passed to the
                encoder only; the CNN head pools over all L positions.
            labels: Optional regression targets with B elements.

        Returns:
            ``{'output': (B,) tensor}`` and, when ``labels`` is given, also
            ``'loss'`` holding the scalar MSE.
        """
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Conv1d expects channels first: (B, L, H) -> (B, H, L).
        features = encoded.last_hidden_state.permute(0, 2, 1)
        features = self.cnn_block1(features)   # (B, H, L) -> (B, 256, L) -> (B, 256, L // 2)
        features = self.cnn_block2(features)   # (B, 256, L // 2) -> (B, 128, L // 2) -> (B, 128, 1)
        features = features.flatten(start_dim=1)            # (B, 128)
        output = self.output_layer(features).squeeze(-1)    # (B,)

        if labels is not None:
            # Align the label shape with the prediction so that (B, 1) labels
            # cannot silently broadcast against (B,) into a (B, B) loss.
            loss = self.loss_fn(output, labels.float().reshape(output.shape))
            return {'output' : output, 'loss' : loss}
        else:
            return {'output' : output}



In [75]:
def get_model(model_name):
    """Create the model, its tokenizer and a padding collator for ``model_name``.

    Returns:
        ``(model, tokenizer, datacollator)``.
    """
    network = MyModel(model_name)
    tok = transformers.AutoTokenizer.from_pretrained(model_name)
    collator = DataCollatorWithPadding(
        tokenizer = tok,
        padding = True,
        return_tensors = 'pt',
    )
    return network, tok, collator

In [76]:
def _build_trainer(model, tokenizer, data_collator, training_args, train_dataset, eval_dataset):
    """Assemble a Trainer with the settings shared by both training modes."""
    return Trainer(model = model,
                   tokenizer = tokenizer,
                   args = training_args,
                   train_dataset = train_dataset,
                   eval_dataset = eval_dataset,
                   compute_metrics = compute_metrics,
                   data_collator = data_collator)


def train(model_name, args, training_args, kfold = False):
    """Fine-tune ``model_name`` and return its predictions on the test set.

    Args:
        model_name: Hugging Face model identifier.
        args: Namespace providing ``data_routes``, ``max_length``, ``kf`` and
            whatever ``test`` reads.
        training_args: ``TrainingArguments`` shared by every Trainer built here.
        kfold: When True, run ``args.kf``-fold cross-validation over
            ``args.data_routes['train_routes']``; otherwise train once on the
            first train route and validate on ``args.data_routes['valid_route']``.

    Returns:
        The object returned by ``test``. In k-fold mode its ``predictions``
        field is replaced by the element-wise mean over all folds; the other
        fields come from the last fold.

    Side effects:
        Saves the trained model under ``results/best_model_{model_name}``.
        In k-fold mode each fold is also saved in a ``fold_{i}`` subdirectory,
        and the top-level directory receives the last fold's model.
    """
    model, tokenizer, data_collator = get_model(model_name)
    if kfold:
        train_l, eval_l = make_dataset(train_route = args.data_routes['train_routes'],
                                       k = args.kf,
                                       tokenizer = tokenizer,
                                       max_length = args.max_length,
                                       mode = 'k_fold_train')

        trainer = None
        fold_output = None
        fold_predictions = []
        for i in range(args.kf):
            print(f'------fold : {i} --- model_name : {model_name}')
            if i > 0:
                # Start every fold from fresh pretrained weights. Reusing the
                # previous fold's model would validate on rows it was already
                # trained on, inflating metrics and best-checkpoint selection.
                trainer = None
                model = None
                gc.collect()
                torch.cuda.empty_cache()
                model = MyModel(model_name)

            trainer = _build_trainer(model, tokenizer, data_collator, training_args,
                                     train_l[i], eval_l[i])
            trainer.train()
            trainer.save_model(f'results/best_model_{model_name}/fold_{i}')

            fold_output = test(trainer, args)
            fold_predictions.append(np.asarray(fold_output.predictions))

        # Kept for compatibility with the previous layout: the last fold's
        # model is still written to the top-level directory.
        trainer.save_model(f'results/best_model_{model_name}')

        # Ensemble the folds by averaging their test predictions.
        averaged = np.mean(np.stack(fold_predictions), axis = 0)
        return fold_output._replace(predictions = averaged)
    else:
        train_dataset, valid_dataset = make_dataset(train_route = args.data_routes['train_routes'][0],
                                                    valid_route = args.data_routes['valid_route'],
                                                    tokenizer = tokenizer,
                                                    max_length = args.max_length,
                                                    mode = 'train')
        trainer = _build_trainer(model, tokenizer, data_collator, training_args,
                                 train_dataset, valid_dataset)
        print(f'------- model_name : {model_name}')
        trainer.train()
        trainer.save_model(f'results/best_model_{model_name}')
        preds = test(trainer, args)
        return preds

            


In [77]:
def test(trainer, args):
    """Run ``trainer`` on the test CSV and return the prediction output.

    Args:
        trainer: A Trainer whose tokenizer is used to build the test dataset.
        args: Namespace providing ``test_route`` and ``max_length``.

    Returns:
        Whatever ``trainer.predict`` returns for the unlabeled test dataset.
    """
    # Newer transformers releases expose the tokenizer as `processing_class`;
    # fall back to the older `tokenizer` attribute when it is absent.
    tokenizer = getattr(trainer, 'processing_class', None)
    if tokenizer is None:
        tokenizer = trainer.tokenizer

    # The keyword must be `tokenizer`: the previous `tokenzier=` spelling
    # raised TypeError, and the unused `pd.DataFrmae()` placeholder raised
    # AttributeError before any prediction could run.
    test_dataset = make_dataset(test_route = args.test_route,
                                tokenizer = tokenizer,
                                max_length = args.max_length,
                                mode = 'test')

    preds = trainer.predict(test_dataset)
    return preds

In [None]:
# Backbones to fine-tune; uncomment entries to train several in one run.
# Every entry ends with a comma so that uncommenting a neighbour can never
# trigger implicit string concatenation of two adjacent literals.
model_list = [
    'klue/roberta-small',
    # "klue/roberta-large",
    # 'monologg/koelectra-base-v3-discriminator',
    # "snunlp/KR-ELECTRA-discriminator",
    # 'Alibaba-NLP/gte-multilingual-base',
    # 'klue/roberta-base',
    # 'snunlp/KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS',
    # 'klue/bert-base',
]
data_routes = {'train_routes' : ['/content/final_data.csv',],
               'valid_route' : '/content/text_dev.csv',
               'test_route' : '/content/test.csv'}

# argparse is used only as a typed container for defaults: `parse_args(args=[])`
# below ignores the real command line, so every option takes its default.
parser = argparse.ArgumentParser()
parser.add_argument('--default_model_name', default = 'klue/roberta-small')
# nargs='+' keeps the value a list of names; `type = list` would have split a
# command-line string into single characters.
parser.add_argument('--model_list', default = model_list, nargs = '+', type = str)
parser.add_argument('--batch_size', default = 32, type = int)
parser.add_argument('--max_epoch', default = 3, type = int)
parser.add_argument('--max_length', default = 160, type = int)
parser.add_argument('--kf', default = 4, type = int)
# NOTE(review): `type = dict` cannot parse a command-line string; this option
# is only usable through its default.
parser.add_argument('--data_routes', default = data_routes, type = dict)
parser.add_argument('--weight_decay', default = 0.01, type = float)
parser.add_argument('--eval_strategy', default = 'epoch', type = str)
parser.add_argument('--save_strategy', default = 'epoch', type = str)
parser.add_argument('--logging_dir', default = './logs', type = str)
parser.add_argument('--logging_steps', default = 30, type = int)
# The default previously referenced an undefined name `test_route`, which
# raised NameError; the path lives in `data_routes`.
parser.add_argument('--test_route', default = data_routes['test_route'], type = str)

args = parser.parse_args(args=[])

training_args = TrainingArguments(
    output_dir = "./results/default",
    eval_strategy = args.eval_strategy,
    save_strategy = args.save_strategy,
    per_device_train_batch_size = args.batch_size,
    per_device_eval_batch_size = args.batch_size,
    num_train_epochs = args.max_epoch,
    weight_decay = args.weight_decay,
    logging_dir = args.logging_dir,
    logging_steps = args.logging_steps,
    report_to = "wandb",
    run_name = "default",
    # Reload the checkpoint with the best validation Pearson score after training.
    load_best_model_at_end = True,
    metric_for_best_model = 'pearson'
)



In [79]:
def start_wandb(project = "yonruka"):
    """Clear cached Weights & Biases credentials and start a run.

    The API key is taken from the ``WANDB_API_KEY`` environment variable; if
    it is unset the user is prompted for it. It must never be written into
    the notebook.

    SECURITY: an earlier revision hard-coded a key on this line. That key is
    in the file's history and should be revoked.

    Args:
        project: W&B project name passed to ``wandb.init``.
    """
    # Pure-Python equivalent of `rm -rf` on the cached wandb state; missing
    # paths are ignored just as `rm -rf` ignores them.
    for stale_dir in ('/root/.cache/wandb', '/root/.config/wandb'):
        shutil.rmtree(stale_dir, ignore_errors = True)
    try:
        os.remove('/root/.netrc')
    except OSError:
        # Best effort, like `rm -rf`: a missing or inaccessible file is not fatal.
        pass

    if not os.environ.get("WANDB_API_KEY"):
        os.environ["WANDB_API_KEY"] = getpass.getpass("W&B API key: ")
    wandb.init(project = project)

In [None]:
start_wandb()

# Train each configured backbone on the single train/validation split and keep
# its test-set predictions keyed by model name. Pass kfold = True to train(...)
# to use cross-validation instead.
preds = {}
for model_name in model_list:
    preds.update({model_name: train(model_name, args, training_args, kfold = False)})

print(preds)