In [None]:
# Required imports
import json
import os
import torch
import torch.nn as nn
from tqdm.notebook import trange, tqdm
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
from sklearn.metrics import mean_squared_error

# Constants
# Integer ids kept as module-level constants. They are not referenced by any
# other cell visible in this notebook; presumably they mirror the tokenizer's
# special-token ids -- TODO confirm against the tokenizer in use.
S_OPEN_TOKEN, PADDING_TOKEN, S_CLOSE_TOKEN = 0, 1, 2

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('device: ', device)

device:  cuda


In [None]:
# Utility functions
def jsonload(fname, encoding="utf-8"):
    """Parse the JSON document stored in ``fname``.

    Args:
        fname: Path of the JSON file to read.
        encoding: Text encoding used to open the file.

    Returns:
        The Python object produced by ``json.load`` for the file content.
    """
    with open(fname, encoding=encoding) as handle:
        return json.load(handle)

def jsondump(j, fname):
    """Serialise ``j`` as JSON into ``fname``, overwriting any existing file.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).

    Args:
        j: JSON-serialisable object.
        fname: Destination path.
    """
    serialised = json.dumps(j, ensure_ascii=False)
    with open(fname, mode="w", encoding="UTF8") as out:
        out.write(serialised)

def jsonlload(fname, encoding="utf-8"):
    """Load a JSON Lines file into a list of Python objects.

    Each non-empty line is parsed independently with ``json.loads``. Lines
    that are empty or contain only whitespace (for example a trailing newline
    at the end of the file) are skipped instead of raising a decode error.

    Args:
        fname: Path of the ``.jsonl`` file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(fname, encoding=encoding) as f:
        # Iterate lazily instead of materialising every line with readlines().
        for line in f:
            line = line.strip()
            if not line:
                continue
            json_list.append(json.loads(line))
    return json_list

def jsonldump(j_list, fname):
    """Write ``j_list`` to ``fname`` in JSON Lines format.

    Every element is serialised on its own line with ``ensure_ascii=False``.
    The file is opened in a ``with`` block so it is flushed and closed even
    if serialisation of an element fails part-way through.

    Args:
        j_list: Iterable of JSON-serialisable objects.
        fname: Destination path; an existing file is overwritten.
    """
    with open(fname, "w", encoding='utf-8') as f:
        for json_data in j_list:
            f.write(json.dumps(json_data, ensure_ascii=False) + '\n')

# Argument settings
class Args:
    """Settings container for the training run (plain class attributes)."""

    # --- data files ---
    train_data = "nikluge-2022-nli-train.jsonl"
    dev_data = "nikluge-2022-nli-dev.jsonl"
    test_data = "nikluge-2022-nli-test.jsonl"
    pred_data = "result.jsonl"

    # --- output locations ---
    model_path = "saved_models/"  # directory that receives the per-epoch checkpoints
    output_dir = "output/"

    # --- model ---
    base_model = "lighthouse/mdeberta-v3-base-kor-further"
    max_len = 256  # tokenizer padding / truncation length
    classifier_hidden_size = 768  # width of the regression head's dense layer
    classifier_dropout_prob = 0.1

    # --- optimisation ---
    batch_size = 8
    learning_rate = 3e-5
    eps = 1e-8  # passed to the optimizer as ``eps``
    num_train_epochs = 20

    # --- stage switches (only ``do_eval`` is read by the functions in this notebook) ---
    do_train = True
    do_eval = True
    do_test = True
    do_demo = False


args = Args()

# Model definitions
class SimpleRegression(nn.Module):
    """Regression head: dropout -> dense -> tanh -> dropout -> 1-unit linear.

    Only the vector at sequence position 0 of the incoming features is used.
    """

    def __init__(self, args):
        """Build the head from ``args.classifier_hidden_size`` and
        ``args.classifier_dropout_prob``."""
        super().__init__()
        hidden = args.classifier_hidden_size
        self.dense = nn.Linear(hidden, hidden)
        self.dropout = nn.Dropout(args.classifier_dropout_prob)
        self.output = nn.Linear(hidden, 1)

    def forward(self, features):
        """Map ``features`` (indexed as ``[:, 0, :]``) to one value per row.

        Returns:
            Tensor whose last dimension has size 1.
        """
        first_token = features[:, 0, :]
        projected = self.dense(self.dropout(first_token))
        activated = self.dropout(torch.tanh(projected))
        return self.output(activated)

class NaturalLanguageInference(nn.Module):
    """Pretrained encoder followed by a ``SimpleRegression`` head.

    The encoder is whatever ``AutoModel`` loads for ``args.base_model``. The
    attribute names ``xlm_roberta`` and ``FFRegression`` are kept unchanged
    because they determine the parameter names stored in saved state dicts.
    """

    def __init__(self, args, len_tokenizer):
        """Load the encoder, resize its token embeddings to ``len_tokenizer``
        and attach the regression head."""
        super().__init__()
        encoder = AutoModel.from_pretrained(args.base_model)
        encoder.resize_token_embeddings(len_tokenizer)
        self.xlm_roberta = encoder
        self.FFRegression = SimpleRegression(args)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and regression head.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Attention-mask tensor forwarded to the encoder.
            labels: Optional regression targets; when given, an MSE loss
                against the flattened predictions is computed.

        Returns:
            ``(loss, logits)`` where ``loss`` is ``None`` without labels and
            ``logits`` is the head output flattened to one dimension.
        """
        encoded = self.xlm_roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=None,
        )
        # The first element of the encoder output is fed to the head.
        logits = self.FFRegression(encoded[0]).view(-1)
        if labels is None:
            return None, logits
        criterion = nn.MSELoss()
        return criterion(logits, labels.view(-1)), logits

def tokenize_and_align_labels(tokenizer, form, value, max_len):
    """Tokenize one context/prompt pair and bundle it with its target value.

    Args:
        tokenizer: Callable invoked as ``tokenizer(context, prompt,
            padding='max_length', max_length=max_len, truncation=True)``;
            its result must provide ``'input_ids'`` and ``'attention_mask'``.
        form: Mapping with the keys ``'context'`` and ``'prompt'``.
        value: Target value; converted with ``float``.
        max_len: Length used for padding and truncation.

    Returns:
        Dict with the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'value'``, each holding a one-element list.
    """
    encoded = tokenizer(
        form['context'],
        form['prompt'],
        padding='max_length',
        max_length=max_len,
        truncation=True,
    )
    return {
        'input_ids': [encoded['input_ids']],
        'attention_mask': [encoded['attention_mask']],
        'value': [float(value)],
    }

def get_dataset(raw_data, tokenizer, max_len):
    """Turn raw records into a ``TensorDataset`` of (input ids, mask, target).

    Args:
        raw_data: Iterable of records, each with an ``'input'`` entry (passed
            on as the context/prompt form) and an ``'output'`` entry (target).
        tokenizer: Tokenizer forwarded to ``tokenize_and_align_labels``.
        max_len: Padding / truncation length forwarded to the tokenizer.

    Returns:
        ``TensorDataset`` holding three tensors built from the per-record
        input ids, attention masks and float targets.
    """
    encoded_examples = [
        tokenize_and_align_labels(tokenizer, example['input'], example['output'], max_len)
        for example in raw_data
    ]

    all_input_ids = [row for enc in encoded_examples for row in enc['input_ids']]
    all_masks = [row for enc in encoded_examples for row in enc['attention_mask']]
    all_targets = [target for enc in encoded_examples for target in enc['value']]

    return TensorDataset(
        torch.tensor(all_input_ids),
        torch.tensor(all_masks),
        torch.tensor(all_targets),
    )

def evaluation(y_true, y_pred):
    """Print and return the mean squared error between targets and predictions.

    Both sequences are converted element-wise with ``float`` first, so numeric
    strings and zero-dimensional tensors are accepted alongside plain numbers.

    Args:
        y_true: Sequence of ground-truth values.
        y_pred: Sequence of predicted values, aligned with ``y_true``.

    Returns:
        The mean squared error computed by ``mean_squared_error``.
    """
    # After the float conversion no element can still be a string, so the
    # former per-index isinstance/str fix-up loop was dead code (and raised
    # IndexError when y_pred was shorter than y_true).
    y_true = [float(v) for v in y_true]
    y_pred = [float(v) for v in y_pred]
    mse = mean_squared_error(y_true, y_pred)
    print('mean_squared_error: ', mse)
    return mse

# Training function
def train_nli_expression_classifier(args):
    """Fine-tune the regression model and save a checkpoint after every epoch.

    Reads the train and dev JSONL files named in ``args``, tokenizes them with
    the tokenizer of ``args.base_model``, trains for ``args.num_train_epochs``
    epochs with AdamW and a linear schedule without warm-up, and, when
    ``args.do_eval`` is true, prints the dev-set mean squared error after each
    epoch. A state dict is written to
    ``<args.model_path>/saved_model_epoch_<n>.pt`` at the end of every epoch.

    Args:
        args: Settings object providing ``model_path``, ``train_data``,
            ``dev_data``, ``base_model``, ``max_len``, ``batch_size``,
            ``learning_rate``, ``eps``, ``num_train_epochs`` and ``do_eval``
            (plus the fields read by ``NaturalLanguageInference``).
    """
    os.makedirs(args.model_path, exist_ok=True)
    print('train_nli_expression_classifier')
    print('model would be saved at ', args.model_path)
    print('loading train data')
    train_data = jsonlload(args.train_data)
    dev_data = jsonlload(args.dev_data)
    print('tokenizing train data')
    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    train_dataloader = DataLoader(get_dataset(train_data, tokenizer, args.max_len), shuffle=True, batch_size=args.batch_size)
    # The dev metric does not depend on sample order, so no shuffling is needed.
    dev_dataloader = DataLoader(get_dataset(dev_data, tokenizer, args.max_len), shuffle=False, batch_size=args.batch_size)
    print('loading model')
    model = NaturalLanguageInference(args, len(tokenizer))
    model.to(device)
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        # Parameters whose name contains one of these substrings are excluded
        # from weight decay. 'gamma'/'beta' are kept from the original list;
        # 'LayerNorm.weight' is added on the assumption that the encoder names
        # its layer-norm modules ``LayerNorm`` -- TODO confirm for other models.
        no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
        # The optimizer reads the per-group option named 'weight_decay'; the
        # previous key 'weight_decay_rate' was silently ignored, which left
        # weight decay at 0 for every parameter.
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    else:
        # Head-only training: the model's head attribute is ``FFRegression``
        # (there is no ``classifier`` attribute on NaturalLanguageInference).
        param_optimizer = list(model.FFRegression.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.eps)
    epochs = args.num_train_epochs
    max_grad_norm = 1.0
    total_steps = epochs * len(train_dataloader)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    for epoch_index in trange(epochs, desc="Epoch"):
        epoch_step = epoch_index + 1  # 1-based epoch number used in logs and file names
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            model.zero_grad()
            loss, _ = model(b_input_ids, b_input_mask, b_labels)
            loss.backward()
            total_loss += loss.item()
            # Clip before the optimizer step to bound the update size.
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_loss / len(train_dataloader)
        print("Epoch: ", epoch_step)
        print("Average train loss: {}".format(avg_train_loss))
        if args.do_eval:
            model.eval()
            pred_list = []
            label_list = []
            for batch in dev_dataloader:
                b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
                with torch.no_grad():
                    _, logits = model(b_input_ids, b_input_mask, b_labels)
                # Move whole batches to the CPU as plain floats instead of
                # collecting one device tensor per sample.
                pred_list.extend(logits.detach().cpu().tolist())
                label_list.extend(b_labels.detach().cpu().tolist())
            evaluation(label_list, pred_list)
        # os.path.join keeps the path valid whether or not model_path ends
        # with a separator.
        model_saved_path = os.path.join(args.model_path, 'saved_model_epoch_' + str(epoch_step) + '.pt')
        torch.save(model.state_dict(), model_saved_path)
    print("training is done")

# Test function
def test_nli_expression_classifier(args):
    """Score a prediction file against the ground-truth file.

    Loads ``args.test_data`` (ground truth) and ``args.pred_data``
    (predictions), pairs the records by ``'id'`` and passes the paired
    ``'output'`` values to ``evaluation``.

    Returns:
        ``None`` after a successful evaluation, or a dict with a single
        ``"error"`` message when a record lacks ``'output'``, a ground-truth
        id occurs twice, or a predicted id is unknown.
    """
    gold_rows = jsonlload(args.test_data)
    submitted_rows = jsonlload(args.pred_data)

    # Index the ground truth by id, validating each record on the way.
    gold_by_id = {}
    for row in gold_rows:
        if 'output' not in row:
            return {"error": "정답 데이터에 'output' 키가 존재하지 않습니다"}
        row_id = row['id']
        if row_id in gold_by_id:
            return {"error": "정답 데이터에 중복된 id를 가지는 경우 존재"}
        gold_by_id[row_id] = row['output']

    # Pair every submitted prediction with its ground-truth value.
    gold_values, predicted_values = [], []
    for row in submitted_rows:
        if 'output' not in row:
            return {"error": "제출 파일에 'output' 키가 존재하지 않습니다"}
        row_id = row['id']
        if row_id not in gold_by_id:
            return {"error": "제출 파일과 정답 파일의 id가 일치하지 않음"}
        gold_values.append(gold_by_id[row_id])
        predicted_values.append(row['output'])

    evaluation(gold_values, predicted_values)

# Demo function
def demo_nli_expression_classifier(args):
    """Predict a value for every test record and write them to ``result.jsonl``.

    Loads the state dict stored at ``args.model_path`` into a freshly built
    ``NaturalLanguageInference`` model, runs it on each record of
    ``args.test_data`` one at a time, stores the prediction as a string under
    the record's ``'output'`` key and dumps all records to
    ``args.output_dir + 'result.jsonl'``.

    Args:
        args: Settings object providing ``output_dir``, ``base_model``,
            ``test_data``, ``model_path`` and ``max_len`` (plus the fields
            read by ``NaturalLanguageInference``).
    """
    out_dir = args.output_dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    examples = jsonlload(args.test_data)

    model = NaturalLanguageInference(args, len(tokenizer))
    state = torch.load(args.model_path, map_location=device)
    model.load_state_dict(state)
    model.to(device)
    model.eval()

    for example in tqdm(examples):
        pair = example['input']
        encoding = tokenizer(
            pair['context'],
            pair['prompt'],
            padding='max_length',
            max_length=args.max_len,
            truncation=True,
        )
        # Wrap in an outer list to form a batch of one.
        input_ids = torch.tensor([encoding['input_ids']]).to(device)
        attention_mask = torch.tensor([encoding['attention_mask']]).to(device)
        with torch.no_grad():
            scores = model(input_ids, attention_mask)[1]
        example['output'] = str(float(scores[0]))

    jsonldump(examples, out_dir + 'result.jsonl')


In [None]:
# Ask PyTorch's CUDA caching allocator to release cached, currently unused GPU
# memory before the training run starts.
torch.cuda.empty_cache()

In [None]:
# Run training with the current `args`; one checkpoint is written under
# args.model_path after every epoch.
train_nli_expression_classifier(args)

train_nli_expression_classifier
model would be saved at  saved_models/
loading train data
tokenizing train data
loading model




Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch:  1
Average train loss: 4.633679839787562
mean_squared_error:  2.6790805190397715
Epoch:  2
Average train loss: 1.5862352455765503
mean_squared_error:  1.8182636820050235
Epoch:  3
Average train loss: 0.7216120903580887
mean_squared_error:  0.9453062079551405
Epoch:  4
Average train loss: 0.4332518442148003
mean_squared_error:  1.1793344079644579
Epoch:  5
Average train loss: 0.3398050110560754
mean_squared_error:  1.1780301193342821
Epoch:  6
Average train loss: 0.22545801807009713
mean_squared_error:  1.177315991252269
Epoch:  7
Average train loss: 0.1777924462761833
mean_squared_error:  1.0148907812573318
Epoch:  8
Average train loss: 0.13911540784682688
mean_squared_error:  1.4886860007663663
Epoch:  9
Average train loss: 0.10687497413578284
mean_squared_error:  0.9420891544773532
Epoch:  10
Average train loss: 0.08544087992950204
mean_squared_error:  0.891468267841909
Epoch:  11
Average train loss: 0.09211260849064555
mean_squared_error:  0.9550487336887206
Epoch:  12
Averag

In [None]:
class Args:
    """Settings container for the inference (demo) run.

    Redefines ``Args`` with ``model_path`` pointing at a single saved
    checkpoint file instead of the checkpoint directory.
    """

    # --- data files ---
    train_data = "nikluge-2022-nli-train.jsonl"
    dev_data = "nikluge-2022-nli-dev.jsonl"
    test_data = "nikluge-2022-nli-test.jsonl"
    pred_data = "result.jsonl"

    # --- checkpoint to load and output location ---
    model_path = "/content/saved_models/saved_model_epoch_9.pt"
    output_dir = "output/"

    # --- model ---
    base_model = "lighthouse/mdeberta-v3-base-kor-further"
    max_len = 256  # tokenizer padding / truncation length
    classifier_hidden_size = 768
    classifier_dropout_prob = 0.1

    # --- optimisation ---
    batch_size = 8
    learning_rate = 3e-5
    eps = 1e-8
    num_train_epochs = 3

    # --- stage switches ---
    do_train = False
    do_eval = False
    do_test = False
    do_demo = True


args = Args()

In [None]:
# Generate predictions for the test file with the checkpoint named in
# args.model_path; results are written to args.output_dir + 'result.jsonl'.
demo_nli_expression_classifier(args)

  0%|          | 0/180 [00:00<?, ?it/s]

In [None]:
class Args:
    """Settings container for scoring a prediction file against the test file."""

    # Stage switch and model name; neither is read by
    # test_nli_expression_classifier itself.
    do_test = True
    base_model = "xlm-roberta-base"

    # Ground-truth file and the prediction file to be scored.
    test_data = "nikluge-2022-nli-test.jsonl"
    pred_data = "/content/output/result.jsonl"


args = Args()

In [None]:
# Score the predictions against the test file. The function returns an error
# dict instead of a metric when a ground-truth record has no 'output' key.
test_nli_expression_classifier(args)

{'error': "정답 데이터에 'output' 키가 존재하지 않습니다"}