In [1]:
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler
import random
from transformers import AdamW, AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.nn import Parameter
import math
from torch.optim import Adam
from sklearn.model_selection import KFold
import urllib.request
from typing import List
from functools import partial
from sklearn.model_selection import StratifiedShuffleSplit, GroupKFold, KFold


import torchmetrics
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [2]:
# Locations of the competition CSVs on the read-only Kaggle input mount.
train_csv_path = '/kaggle/input/feedback-prize-effectiveness/train.csv'
test_csv_path = '/kaggle/input/feedback-prize-effectiveness/test.csv'

# `df` is the frame the training dataset is built from; `test` is the test split.
df = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [3]:
# Preview only the first rows instead of rendering the whole frame in the output.
test.head()

In [4]:
# One shared seed for every RNG this notebook uses: Python's `random`, NumPy,
# PyTorch (CPU) and PyTorch on all CUDA devices.
seed_num = 22
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_num)

In [5]:
# Pick the compute device once; everything below moves tensors/models to `device`.
_has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if _has_cuda else "cpu")

if not _has_cuda:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [6]:
# Grouped K-fold splitter; the groups themselves are supplied at split() time.
n_folds = 5
gkf = GroupKFold(n_splits=n_folds)

In [7]:
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

In [8]:
# tokenizer.encode('hi good asdf wer qweasd', 'hi',truncation=True, max_length=6, padding='max_length', truncation_strategy='only_first')

In [9]:
class FeedbackDataset(Dataset):
    """Torch dataset that tokenizes (discourse_text, discourse_type) pairs.

    Each item is a LongTensor of token ids padded/truncated to ``max_length``.
    When ``is_train`` is true the item is ``(input_ids, label)`` where ``label``
    is a 1-element LongTensor holding the class id of ``discourse_effectiveness``;
    otherwise only ``input_ids`` is returned.

    Args:
        data: DataFrame with ``discourse_text`` and ``discourse_type`` columns
            (plus ``discourse_effectiveness`` when ``is_train`` is true).
        is_train: whether labels are read and returned.
        max_length: fixed token length of every returned sequence.
        tokenizer_name: Hugging Face identifier passed to ``AutoTokenizer``.
    """

    # String label -> integer class id.
    LABEL_TO_ID = {"Ineffective": 0, "Adequate": 1, "Effective": 2}

    def __init__(self, data, is_train=True, max_length=512,
                 tokenizer_name="microsoft/deberta-v3-large"):
        super().__init__()
        self.max_length = max_length
        self.data = data
        # NOTE(review): the model defined in this notebook loads "deberta-v3-base"
        # while the tokenizer defaults to the "large" checkpoint — confirm the two
        # share a vocabulary, or pass a matching tokenizer_name.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.is_train = is_train

    def labeling(self, label):
        """Return the integer class id for a label string (KeyError if unknown)."""
        return self.LABEL_TO_ID[label]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Positional access: samplers hand out positions 0..len-1, which only match
        # index labels when the frame happens to carry a default RangeIndex.
        sentence1 = self.data['discourse_text'].iloc[idx]
        sentence2 = self.data['discourse_type'].iloc[idx]

        # The strategy is passed through `truncation` itself: combined with
        # `truncation=True`, the legacy `truncation_strategy` keyword is not honoured,
        # so the requested 'only_first' behaviour was never actually selected.
        input_ids = self.tokenizer.encode(
            sentence1,
            sentence2,
            truncation='only_first',
            max_length=self.max_length,
            padding='max_length',
        )
        input_ids = torch.LongTensor(input_ids)

        if not self.is_train:
            return input_ids

        label = self.labeling(self.data['discourse_effectiveness'].iloc[idx])
        return input_ids, torch.LongTensor([label])

In [10]:
class FeedbackModel(nn.Module):
    """Pretrained transformer encoder + masked mean pooling + linear head + BatchNorm.

    Args:
        model_name: Hugging Face identifier passed to ``AutoModel.from_pretrained``.
        num_classes: number of output logits.

    ``forward(input_ids)`` returns a tensor with ``num_classes`` values per input row.
    """

    def __init__(self, model_name="microsoft/deberta-v3-base", num_classes=3):
        super().__init__()
        self.bert_model = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert_model.config.hidden_size, num_classes)
        self.bn = nn.BatchNorm1d(num_classes)
        self._init_params()

    def _init_params(self):
        """Xavier-normal weights / zero bias for the head; identity BatchNorm affine."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids):
        """Compute class logits for a batch of token ids.

        The attention mask is derived from ``input_ids`` by treating the padding
        id as "no token". Assumes ``input_ids`` is (batch, seq_len) — the pooling
        below sums over dim 1.
        """
        # Use the encoder's own padding id when it declares one; fall back to 0.
        pad_id = getattr(self.bert_model.config, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0
        attention_mask = (input_ids != pad_id).long()

        hidden = self.bert_model(input_ids, attention_mask=attention_mask).last_hidden_state

        # Masked mean over the sequence dimension. The count is clamped to 1 so a
        # row made only of padding pools to zeros instead of 0/0 = NaN.
        token_counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        pooled = torch.sum(hidden * attention_mask.unsqueeze(-1), dim=1) / token_counts

        return self.bn(self.fc(pooled))

In [11]:
# train['discourse_effectiveness'].value_counts()

In [12]:
# sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed_num)

In [13]:
# indices = list(range(len(df)))
# train_idx, valid_idx = next(sss.split(indices, df['discourse_effectiveness'].tolist()))

In [14]:
# train_df = df.iloc[train_idx]
# valid_df = df.iloc[valid_idx]
# train_df.reset_index(inplace=True, drop=True)
# valid_df.reset_index(inplace=True, drop=True)

In [15]:
# train_ds = FeedbackDataset(train_df)
# valid_ds = FeedbackDataset(valid_df)

In [16]:
# Training hyperparameters: samples per optimisation step and passes over each fold.
batch_size, epochs = 8, 3
# train_dataLoader = DataLoader(train_ds, batch_size=batch_size)
# valid_dataLoader = DataLoader(valid_ds, batch_size=batch_size)

In [17]:
class FocalLoss(nn.Module):
    """Focal loss built on per-sample cross entropy.

    For each sample: ``alpha * (1 - p_t) ** gamma * CE`` with ``p_t = exp(-CE)``.

    Args:
        alpha: constant multiplier applied to every sample's loss.
        gamma: exponent of the ``(1 - p_t)`` modulating factor.
        logits: stored on the instance but not consulted by ``forward``.
        reduce: when true ``forward`` returns the mean over samples, otherwise
            the unreduced per-sample tensor.
    """

    def __init__(self, alpha=1, gamma=2, logits=False, reduce=True):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma
        self.logits, self.reduce = logits, reduce
        # Unreduced cross entropy so each sample can be re-weighted individually.
        self.criterion = nn.CrossEntropyLoss(reduction='none')

    def forward(self, inputs, targets):
        per_sample_ce = self.criterion(inputs, targets)
        # exp(-CE) recovers the probability assigned to the target class.
        target_prob = torch.exp(-per_sample_ce)
        focal = self.alpha * (1 - target_prob) ** self.gamma * per_sample_ce
        return torch.mean(focal) if self.reduce else focal

In [18]:
# criterion = nn.CrossEntropyLoss()
# Training/validation loss: focal loss with alpha=1, gamma=2 (its defaults), mean-reduced.
criterion = FocalLoss(alpha=1, gamma=2)

# Metric object called once per batch by cal_accuracy.
# NOTE(review): some torchmetrics releases require a `task=` argument here — confirm
# against the installed version.
train_acc = torchmetrics.Accuracy()
def cal_accuracy(X, Y):
    """Return the accuracy of predictions ``X`` against targets ``Y`` for one batch.

    Args:
        X: tensor of per-class scores; the predicted class is the argmax over
            the last dimension.
        Y: target class ids as a tensor or array-like (e.g. a NumPy array); any
            shape is accepted and flattened to one id per sample.

    Returns:
        Whatever the module-level ``train_acc`` metric returns for this batch.
    """
    # Softmax is monotonic, so the argmax of the raw scores picks the same class.
    predict_labels = torch.argmax(X, dim=-1)
    # Use the targets that were passed in rather than a same-named notebook global,
    # so the function is correct wherever it is called from.
    targets = torch.as_tensor(Y).reshape(-1)
    return train_acc(predict_labels.detach().cpu(), targets.detach().cpu())

In [19]:
# One dataset over the full training frame; the per-fold samplers pick rows from it.
train_dataset = FeedbackDataset(data=df, is_train=True)

In [20]:
# Grouped cross-validation: rows sharing an essay_id always land in the same fold.
# For every fold a fresh model is trained for `epochs` epochs and the weights with
# the lowest average validation loss are written to checkpoint-<fold>.pt.
for fold, (train_idx, valid_idx) in enumerate(gkf.split(df, groups=df.essay_id)):
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_idx)
    # drop_last: the model ends in BatchNorm1d, which raises in train mode when a
    # batch contains a single sample (possible for the final, partial batch).
    train_dataLoader = DataLoader(train_dataset, batch_size=batch_size,
                                  sampler=train_subsampler, drop_last=True)
    valid_dataLoader = DataLoader(train_dataset, batch_size=batch_size*2, sampler=valid_subsampler)
    best_loss = float('inf')

    # Drop the previous fold's model/optimizer before allocating new ones so two
    # full copies are never resident at the same time.
    model = optimizer = scheduler = None
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    model = FeedbackModel().to(device)
    # No weight decay on biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, betas=(0.9, 0.98), lr=2e-5, eps=1e-8)
    # Linear decay to zero over all optimisation steps of this fold, no warmup.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=len(train_dataLoader) * epochs)
    model.zero_grad()
    print(f'------------fold no---------{fold + 1}----------------------')

    for epoch_i in range(epochs):
        # ---- training ----
        model.train()
        total_loss = 0
        train_accuracy = 0
        nb_train_steps = 0
        for batch in tqdm(train_dataLoader):
            input_ids, label = (t.to(device) for t in batch)
            outputs = model(input_ids)
            # Labels arrive as (batch, 1); the loss expects one class id per sample.
            y = label.view(-1)
            loss = criterion(outputs, y)
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            train_accuracy += cal_accuracy(outputs, y)
            nb_train_steps += 1
        avg_train_loss = total_loss / len(train_dataLoader)
        print('')
        print(epoch_i + 1, f'  Average training loss: {avg_train_loss:.4f}')
        print(f'  Accuracy: {train_accuracy/(nb_train_steps):.4f}')

        # ---- validation ----
        model.eval()
        eval_accuracy = 0
        nb_eval_steps = 0
        valid_loss = 0
        with torch.no_grad():
            for batch in tqdm(valid_dataLoader):
                input_ids, label = (t.to(device) for t in batch)
                outputs = model(input_ids)
                y = label.view(-1)
                loss = criterion(outputs, y)
                valid_loss += loss.item()
                eval_accuracy += cal_accuracy(outputs, y)
                nb_eval_steps += 1
        avg_valid_loss = valid_loss / len(valid_dataLoader)
        valid_accuracy = eval_accuracy/(nb_eval_steps)

        # Keep only the best epoch of this fold (lowest mean validation loss).
        if best_loss > avg_valid_loss:
            best_loss = avg_valid_loss
            torch.save(
                {
                    "model": "FeedbackModel",
                    "model_state_dict": model.state_dict(),
                    "description": f"FeedbackModel 체크포인트-{fold + 1}",
                },
                f"/kaggle/working/checkpoint-{fold + 1}.pt",
            )
            print(f'model{fold + 1} saved')
        print(epoch_i + 1, f'  Average valid loss: {avg_valid_loss:.4f}')
        print(f'  Accuracy: {valid_accuracy:.4f}')

In [None]:
# best_loss = 10
# best_acc = 0
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, betas=(0.9, 0.98), lr=2e-5, eps=1e-8)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
#                                                 num_training_steps=len(train_dataLoader) * epochs)
# model.zero_grad()
# for epoch_i in range(0, epochs):
#     total_loss = 0
#     total_acc = 0
#     nb_train_steps = 0
#     train_accuracy = 0
#     model.train()
#     for batch in tqdm(train_dataLoader):
#         batch = tuple(t.to(device) for t in batch)
#         input_ids, label = batch
#         outputs = model(input_ids)
#         y = label.view(-1)
#         loss = criterion(outputs, y)
#         total_loss += loss.item()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         optimizer.step()
#         optimizer.zero_grad()
#         scheduler.step()
#         logits = outputs
#         tmp_train_accuracy = cal_accuracy(logits, label.cpu().numpy())
#         train_accuracy += tmp_train_accuracy
#         nb_train_steps += 1
#         if nb_train_steps % 100 == 0 and not nb_train_steps == 0:
#             print('step : {:>5,} of {:>5,} loss: {:.5f}'.format(nb_train_steps, len(train_dataLoader), loss.item()))
#     avg_train_loss = total_loss / len(train_dataLoader)
#     print('')
#     print(epoch_i + 1, f'  Average training loss: {avg_train_loss:.4f}')
#     print(f'  Accuracy: {train_accuracy/(nb_train_steps):.4f}')
#     model.eval()
#     eval_loss, eval_accuracy = 0, 0
#     nb_eval_steps, nb_eval_examples = 0, 0
#     valid_loss = 0
#     for batch in tqdm(valid_dataLoader):
#         batch = tuple(t.to(device) for t in batch)
#         input_ids, label = batch
#         with torch.no_grad():     
#             outputs = model(input_ids)
#         y = label.view(-1)
#         loss = criterion(outputs, y)
#         valid_loss += loss.item()
#         logits = outputs
#         tmp_eval_accuracy = cal_accuracy(logits, label.cpu().numpy())
#         eval_accuracy += tmp_eval_accuracy
#         nb_eval_steps += 1
#     avg_valid_loss = valid_loss / len(valid_dataLoader)
#     valid_accuracy = eval_accuracy/(nb_eval_steps)
#     if best_acc < valid_accuracy:
#         best_acc = valid_accuracy
#         torch.save(
#             {
#                 "model": "FeedbackModel",
#                 "model_state_dict": model.state_dict(),
#                 "description": f"FeedbackModel 체크포인트-{epoch_i}",
#             },
#             f"/kaggle/working/checkpoint-{epoch_i}.pt",
#         )
#     print(epoch_i + 1, f'  Average valid loss: {avg_valid_loss:.4f}')
#     print(f'  Accuracy: {valid_accuracy:.4f}')