In [1]:
# !pip install -q bitsandbytes-cuda110

In [2]:
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler
import random
from transformers import AdamW, AutoTokenizer, AutoModel, DataCollatorWithPadding, AutoConfig
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.nn import Parameter
import math
from torch.optim import Adam, lr_scheduler
from sklearn.model_selection import KFold
import urllib.request
from typing import List
from functools import partial
from sklearn.model_selection import StratifiedShuffleSplit, GroupKFold, KFold
from torch.cuda.amp import autocast, GradScaler
from transformers.models.deberta.modeling_deberta import ContextPooler
from nltk.corpus import stopwords
import string

import torchmetrics
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [3]:
# Load the competition's discourse-level train and test tables from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/train.csv')
# NOTE(review): `test` is only displayed in the visible cells; no inference code here consumes it.
test = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/test.csv')

In [4]:
# Display the raw training table (notebook rich repr) as a quick sanity check.
df

In [5]:
# Display the raw test table (notebook rich repr) as a quick sanity check.
test

In [6]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value.
seed_num = 22
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_num)

# Number of batches whose gradients are combined before one optimizer step.
accumulation_steps = 1

In [7]:
# Pick the compute device once: CUDA when available, otherwise CPU, and report the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [8]:
# Directory holding one `<essay_id>.txt` file per training essay (joined with the id in get_essay).
TRAIN_DIR = "../input/feedback-prize-effectiveness/train"
# Directory of the test essays. NOTE(review): not referenced by any other visible cell.
TEST_DIR = "../input/feedback-prize-effectiveness/test"

In [9]:
def get_essay(essay_id):
    """Return the full text of one training essay.

    Args:
        essay_id: Essay identifier; the file read is ``<TRAIN_DIR>/<essay_id>.txt``.

    Returns:
        The entire file content as a single string.

    Raises:
        OSError: Propagated from ``open`` when the file is missing or unreadable.
    """
    essay_path = os.path.join(TRAIN_DIR, f"{essay_id}.txt")
    # Use a context manager so the file handle is closed even if read() raises;
    # this function is called many times and must not accumulate open descriptors.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

In [10]:
# essay_id can repeat across rows (it is later used as the GroupKFold group key), so
# read each essay file once and broadcast its text to every row of that essay
# instead of re-reading the same file for every discourse row.
essay_text_by_id = {essay_id: get_essay(essay_id) for essay_id in df['essay_id'].unique()}
df['essay_text'] = df['essay_id'].map(essay_text_by_id)

In [11]:
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

In [12]:
# English stop-word list from NLTK, stored as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop stop words from ``text``.

    The input is coerced to ``str`` and split on whitespace; tokens that are in
    ``STOPWORDS`` (exact, case-sensitive match) are removed and the remaining
    tokens are re-joined with single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_tokens)

In [13]:
def remove_urls(text):
    """Delete ``http://``/``https://`` URLs and ``www.`` addresses from ``text``.

    A match runs up to the next whitespace character; the surrounding
    whitespace itself is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [14]:
def remove_html(text):
    """Remove every ``<...>`` span (shortest match, no newline inside) from ``text``."""
    return re.sub('<.*?>', r'', text)

In [15]:
def decontraction(phrase):
    """Expand common English contractions in ``phrase``.

    The substitutions are applied one after another in the order listed, so the
    two whole-word special cases (``won't``, ``can't``) are handled before the
    generic ``n't`` suffix rule.
    """
    substitutions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [16]:
# df['essay_text'] = df['essay_text'].apply(remove_punctuations)
# df['discourse_text'] = df['discourse_text'].apply(remove_punctuations)
# df['essay_text'] = df['essay_text'].apply(lambda x: str(x).lower())
# df['essay_text'] = df['essay_text'].apply(lambda x: re.sub('\s+',  ' ', x))
# df['essay_text'] = df['essay_text'].apply(lambda x: decontraction(x))
# df['discourse_text'] = df['discourse_text'].apply(lambda x: str(x).lower())
# df['discourse_text'] = df['discourse_text'].apply(lambda x: re.sub('\s+',  ' ', x))
# df['discourse_text'] = df['discourse_text'].apply(lambda x: decontraction(x))

In [17]:
# Display the training table again; it now carries the `essay_text` column added above.
df

In [18]:
# Group-aware 5-fold splitter; the group key (essay_id) is supplied at split time in the
# training loop so that rows sharing a group never land on both sides of a split.
gkf = GroupKFold(n_splits=5)

In [19]:
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

In [20]:
# tokenizer.encode('hi good asdf wer qweasd', 'hi',truncation=True, max_length=6, padding='max_length', truncation_strategy='only_first')

In [21]:
# class FeedBackDataset(Dataset):
#     def __init__(self, df, tokenizer, max_length):
#         self.df = df
#         self.max_len = max_length
#         self.tokenizer = tokenizer
#         self.discourse = df['discourse_text'].values
#         self.essay = df['essay_text'].values
#         self.targets = df['discourse_effectiveness'].values
        
#     def __len__(self):
#         return len(self.df)
    
#     def __getitem__(self, index):
#         discourse = self.discourse[index]
#         essay = self.essay[index]
#         text = discourse + " " + self.tokenizer.sep_token + " " + essay
#         inputs = self.tokenizer.encode_plus(
#                         text,
#                         truncation=True,
#                         add_special_tokens=True,
#                         max_length=self.max_len
#                     )
        
#         return {
#             'input_ids': inputs['input_ids'],
#             'attention_mask': inputs['attention_mask'],
#             'target': self.targets[index]
#         }

In [22]:
class FeedbackDataset(Dataset):
    """Torch dataset that encodes one discourse row into transformer token ids.

    Every item is built from the string
    ``discourse_type <sep> discourse_text <sep> essay_text`` (``<sep>`` being the
    tokenizer's separator token) and truncated to ``max_length`` tokens.

    Args:
        data: DataFrame with the columns ``discourse_type``, ``discourse_text``
            and ``essay_text``; ``discourse_effectiveness`` is also required
            when ``is_train`` is true.
        is_train: When true, each item additionally carries its integer label.
        max_length: Maximum number of tokens kept per item.
        tokenizer_path: Location handed to ``AutoTokenizer.from_pretrained``.
    """

    # Effectiveness class name -> integer target id.
    LABEL_TO_ID = {"Ineffective": 0, "Adequate": 1, "Effective": 2}

    def __init__(self, data, is_train=True, max_length=512,
                 tokenizer_path="../input/deberta-v3-base/deberta-v3-base"):
        super().__init__()
        self.max_length = max_length
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
        self.is_train = is_train

    def labeling(self, label):
        """Map an effectiveness class name to its integer id (KeyError if unknown)."""
        return self.LABEL_TO_ID[label]

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def _value(self, column, idx):
        """Return the value of ``column`` at *position* ``idx``.

        Samplers hand out positions in ``range(len(self))``, so the lookup must be
        positional: a label-based ``self.data[column][idx]`` silently depends on
        the frame having a default RangeIndex and fails (or returns the wrong
        row) for a filtered or re-indexed frame.
        """
        return self.data[column].iloc[idx]

    def __getitem__(self, idx):
        """Return ``{'input_ids': LongTensor[seq]}`` plus ``'label': LongTensor[1]`` when training."""
        discourse_type = self._value('discourse_type', idx)
        discourse_text = self._value('discourse_text', idx)
        essay = self._value('essay_text', idx)

        sep = self.tokenizer.sep_token
        text = discourse_type + " " + sep + " " + discourse_text + " " + sep + " " + essay
        input_ids = self.tokenizer.encode(text, truncation=True, max_length=self.max_length, padding=True)

        item = {'input_ids': torch.LongTensor(input_ids)}
        if self.is_train:
            label = self.labeling(self._value('discourse_effectiveness', idx))
            # Shape (1,) so that a collated batch has shape (batch, 1).
            item['label'] = torch.LongTensor([label])
        return item

In [23]:
class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        ``attention_mask`` is broadcast to the shape of ``last_hidden_state``;
        positions with mask 0 contribute nothing to the sum or the count. The
        count is clamped to at least 1e-9 so a fully masked row yields zeros
        instead of a division by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_sum / token_count

In [24]:
class FeedbackModel(nn.Module):
    """Pretrained transformer encoder + masked mean pooling + 3-way linear head."""

    def __init__(self):
        super().__init__()
        self.bert_model = AutoModel.from_pretrained("../input/deberta-v3-base/deberta-v3-base")
        self.fc = nn.Linear(self.bert_model.config.hidden_size, 3)
        # Not used by forward(); kept so the state_dict keys of saved checkpoints do not change.
        self.bn = nn.BatchNorm1d(3)
        self.dropout = nn.Dropout(0.3)
        self.pooler = MeanPooling()
        # Padding id used to derive an attention mask when the caller passes none.
        # Taken from the encoder config instead of assuming 0; falls back to 0 if unset.
        configured_pad_id = getattr(self.bert_model.config, "pad_token_id", None)
        self.pad_token_id = 0 if configured_pad_id is None else configured_pad_id

    def _init_params(self):
        """Optional re-initialisation of the head (Xavier weights, zero bias, identity BN)."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits of shape (batch, 3).

        Args:
            input_ids: Padded token ids, shape (batch, seq).
            attention_mask: Optional 0/1 mask of the same shape. When omitted it is
                derived as ``input_ids != pad_token_id``.
        """
        if attention_mask is None:
            attention_mask = (input_ids != self.pad_token_id).long()
        encoded = self.bert_model(input_ids, attention_mask=attention_mask, output_hidden_states=False)
        pooled = self.pooler(encoded.last_hidden_state, attention_mask)
        return self.fc(self.dropout(pooled))

In [25]:
# train['discourse_effectiveness'].value_counts()

In [26]:
# sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed_num)

In [27]:
# indices = list(range(len(df)))
# train_idx, valid_idx = next(sss.split(indices, df['discourse_effectiveness'].tolist()))

In [28]:
# train_df = df.iloc[train_idx]
# valid_df = df.iloc[valid_idx]
# train_df.reset_index(inplace=True, drop=True)
# valid_df.reset_index(inplace=True, drop=True)

In [29]:
# train_ds = FeedbackDataset(train_df)
# valid_ds = FeedbackDataset(valid_df)

In [30]:
# Training batch size; the validation loader in the training loop uses twice this value.
batch_size = 8
# Upper bound on epochs per fold (the training loop may stop earlier).
epochs = 3
# train_dataLoader = DataLoader(train_ds, batch_size=batch_size)
# valid_dataLoader = DataLoader(valid_ds, batch_size=batch_size)

In [31]:
def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """Register an optimizer-precision override for a model's embedding weights.

    For each of ``word_embeddings``, ``position_embeddings`` and
    ``token_type_embeddings`` that exists on ``embeddings_path``, the module's
    ``weight`` is registered with the ``bnb`` global optimizer manager using
    ``{'optim_bits': optim_bits}``. Attributes that are absent are skipped.

    Reference:
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    NOTE(review): ``bnb`` is not imported in the visible cells (the
    bitsandbytes install line is commented out), so calling this on a module
    that has any of these attributes raises NameError until it is imported.
    """
    for prefix in ("word", "position", "token_type"):
        attr_name = f"{prefix}_embeddings"
        if not hasattr(embeddings_path, attr_name):
            continue
        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
        )

In [32]:
class FocalLoss(nn.Module):
    """Multi-class focal loss built on per-sample cross-entropy.

    The per-sample loss is ``alpha * (1 - p_t) ** gamma * CE`` with
    ``p_t = exp(-CE)``, which down-weights samples whose true class already
    receives a high probability.

    Args:
        alpha: Constant multiplier applied to every sample's loss.
        gamma: Focusing exponent.
        logits: Stored on the instance; not read by ``forward``.
        reduce: If true, ``forward`` returns the batch mean; otherwise the
            per-sample losses.
    """

    def __init__(self, alpha=1, gamma=2, logits=False, reduce=True):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma
        self.logits, self.reduce = logits, reduce
        self.criterion = nn.CrossEntropyLoss(reduction='none')

    def forward(self, inputs, targets):
        """Compute the focal loss of ``inputs`` (class scores) against ``targets`` (class ids)."""
        per_sample_ce = self.criterion(inputs, targets)
        prob_true_class = torch.exp(-per_sample_ce)
        focal = self.alpha * (1 - prob_true_class) ** self.gamma * per_sample_ce
        return focal.mean() if self.reduce else focal

In [33]:
# Plain cross-entropy: used by the training loop to score the validation batches.
criterion2 = nn.CrossEntropyLoss()
# Focal loss with its default settings: the objective optimised on the training batches.
criterion = FocalLoss()
# criterion = nn.CrossEntropyLoss()

# Single torchmetrics accuracy object; cal_accuracy calls it for both training and validation batches.
train_acc = torchmetrics.Accuracy()
def cal_accuracy(X,Y):
    """Return the accuracy of one batch of class scores against its labels.

    Args:
        X: Class scores (logits) with the class axis last.
        Y: Ground-truth class ids as a tensor or NumPy array of any shape;
            flattened to 1-D before comparison.

    Returns:
        The value produced by calling the module-level ``train_acc`` metric on
        the predicted and true labels of this batch.
    """
    # argmax over raw scores selects the same class as argmax over softmax
    # probabilities, because softmax is monotonic along the class axis.
    predict_labels = torch.argmax(X, dim=-1)
    # Compare against the labels that were actually passed in, rather than
    # relying on a global variable that happens to hold the current batch.
    true_labels = torch.as_tensor(Y).reshape(-1)
    return train_acc(predict_labels.to('cpu'), true_labels.to('cpu'))

In [34]:
# Wrap the whole training frame once; the per-fold train/validation subsets are
# selected later through index samplers rather than by building separate datasets.
train_dataset = FeedbackDataset(df)

In [35]:
# Batch collator that pads variable-length items with the same tokenizer files the dataset uses.
# NOTE(review): the dataset emits the key 'label' while the training loop reads batch['labels'];
# presumably the collator performs that rename — confirm against the DataCollatorWithPadding docs.
collate_fn = DataCollatorWithPadding(tokenizer=AutoTokenizer.from_pretrained("../input/deberta-v3-base/deberta-v3-base", use_fast=True))

In [None]:
# Grouped cross-validation: for every fold a fresh FeedbackModel is fine-tuned and the
# weights with the lowest validation loss are written to /kaggle/working/checkpoint-<fold>.pt.
# Rows of one essay never appear on both sides of a split (groups=df.essay_id).
for fold,(train_idx,valid_idx) in enumerate(gkf.split(train_dataset, groups=df.essay_id)):
    cnt = 0  # consecutive epochs without a validation-loss improvement
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_idx)
    train_dataLoader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler, collate_fn=collate_fn,
                                  num_workers=2, pin_memory=True, drop_last=True)
    # drop_last stays False for validation so that every held-out sample is scored.
    valid_dataLoader = DataLoader(train_dataset, batch_size=batch_size*2, sampler=valid_subsampler, collate_fn=collate_fn,
                                  num_workers=2, pin_memory=True, drop_last=False)
    best_loss = float('inf')
    model = FeedbackModel().to(device)
    optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-6)
    # Stepped once per optimizer step (not per epoch), so the LR follows a cosine with T_max=500 steps.
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=500,
                                               eta_min=1e-6)
    model.zero_grad()
    print(f'------------fold no---------{fold + 1}----------------------')
    scaler = GradScaler()
    for epoch_i in range(0, epochs):
        # ------------------------------ training ------------------------------
        model.train()
        train_accuracy = 0.0
        nb_train_steps = 0
        dataset_size = 0
        running_loss = 0.0
        bar = tqdm(enumerate(train_dataLoader), total=len(train_dataLoader))
        for step, batch in bar:
            input_ids = batch['input_ids'].to(device)
            label = batch['labels'].to(device)
            with torch.cuda.amp.autocast():
                outputs = model(input_ids)
                # `y` is deliberately a top-level name: other cells of this notebook may read it.
                y = label.view(-1)
                loss = criterion(outputs, y)
            # Scale by the accumulation factor so accumulated gradients average instead of sum.
            scaler.scale(loss / accumulation_steps).backward()
            # Step after every `accumulation_steps` batches (1-based count) and flush on the last batch.
            if (step + 1) % accumulation_steps == 0 or step == len(train_dataLoader) - 1:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
            logits = outputs
            tmp_train_accuracy = cal_accuracy(logits, label.to('cpu').numpy())
            train_accuracy += float(tmp_train_accuracy)
            nb_train_steps += 1
            n_samples = y.size(0)
            running_loss += (loss.item() * n_samples)
            dataset_size += n_samples
            epoch_loss = running_loss / dataset_size
            bar.set_postfix(Epoch=epoch_i, Train_Loss=epoch_loss,
                            Train_Acc=train_accuracy / nb_train_steps,
                            LR=optimizer.param_groups[0]['lr'])

        # ----------------------------- validation -----------------------------
        model.eval()
        eval_accuracy = 0.0   # sum of per-sample correctness (batch accuracy * batch size)
        valid_loss = 0.0      # sum of per-sample losses (batch mean * batch size)
        dataset_size = 0
        bar = tqdm(enumerate(valid_dataLoader), total=len(valid_dataLoader))
        for step, batch in bar:
            input_ids = batch['input_ids'].to(device)
            label = batch['labels'].to(device)
            with torch.no_grad():
                outputs = model(input_ids)
                y = label.view(-1)
                loss = criterion2(outputs, y)
            logits = outputs
            tmp_eval_accuracy = cal_accuracy(logits, label.to('cpu').numpy())
            # Weight by the real batch size: the final batch may be smaller than batch_size*2.
            n_samples = y.size(0)
            valid_loss += loss.item() * n_samples
            eval_accuracy += float(tmp_eval_accuracy) * n_samples
            dataset_size += n_samples
            epoch_loss = valid_loss / dataset_size
            bar.set_postfix(Epoch=epoch_i, Valid_Loss=epoch_loss,
                            LR=optimizer.param_groups[0]['lr'])
        avg_valid_loss = valid_loss / max(dataset_size, 1)
        valid_accuracy = eval_accuracy / max(dataset_size, 1)

        # ------------------- checkpointing / early stopping -------------------
        if best_loss > avg_valid_loss:
            cnt = 0
            best_loss = avg_valid_loss
            torch.save(
                {
                    "model": "FeedbackModel",
                    "model_state_dict": model.state_dict(),
                    "description": f"FeedbackModel 체크포인트-{fold + 1}",
                },
                f"/kaggle/working/checkpoint-{fold + 1}.pt",
            )
            print(f'model{fold + 1} saved')
        else:
            cnt += 1
            if cnt == 2:
                print(f'early stop {fold+1} fold, {epoch_i} epcoh_i')
                print(epoch_i + 1, f'  Average valid loss: {avg_valid_loss:.4f}')
                print(f'  Accuracy: {valid_accuracy:.4f}')
                break
        print(epoch_i + 1, f'  Average valid loss: {avg_valid_loss:.4f}')
        print(f'  Accuracy: {valid_accuracy:.4f}')

    # Release this fold's model and optimizer state before the next fold allocates its own,
    # so two models never have to fit on the device at once.
    del model, optimizer, scheduler, scaler
    torch.cuda.empty_cache()

In [None]:
# Debugging leftover: shows the last batch bound by the training loop above, so this
# cell only works after that loop has run in the same kernel session.
batch

In [None]:
# best_loss = 10
# best_acc = 0
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, betas=(0.9, 0.98), lr=2e-5, eps=1e-8)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
#                                                 num_training_steps=len(train_dataLoader) * epochs)
# model.zero_grad()
# for epoch_i in range(0, epochs):
#     total_loss = 0
#     total_acc = 0
#     nb_train_steps = 0
#     train_accuracy = 0
#     model.train()
#     for batch in tqdm(train_dataLoader):
#         batch = tuple(t.to(device) for t in batch)
#         input_ids, label = batch
#         outputs = model(input_ids)
#         y = label.view(-1)
#         loss = criterion(outputs, y)
#         total_loss += loss.item()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         optimizer.step()
#         optimizer.zero_grad()
#         scheduler.step()
#         logits = outputs
#         tmp_train_accuracy = cal_accuracy(logits, label.cpu().numpy())
#         train_accuracy += tmp_train_accuracy
#         nb_train_steps += 1
#         if nb_train_steps % 100 == 0 and not nb_train_steps == 0:
#             print('step : {:>5,} of {:>5,} loss: {:.5f}'.format(nb_train_steps, len(train_dataLoader), loss.item()))
#     avg_train_loss = total_loss / len(train_dataLoader)
#     print('')
#     print(epoch_i + 1, f'  Average training loss: {avg_train_loss:.4f}')
#     print(f'  Accuracy: {train_accuracy/(nb_train_steps):.4f}')
#     model.eval()
#     eval_loss, eval_accuracy = 0, 0
#     nb_eval_steps, nb_eval_examples = 0, 0
#     valid_loss = 0
#     for batch in tqdm(valid_dataLoader):
#         batch = tuple(t.to(device) for t in batch)
#         input_ids, label = batch
#         with torch.no_grad():     
#             outputs = model(input_ids)
#         y = label.view(-1)
#         loss = criterion(outputs, y)
#         valid_loss += loss.item()
#         logits = outputs
#         tmp_eval_accuracy = cal_accuracy(logits, label.cpu().numpy())
#         eval_accuracy += tmp_eval_accuracy
#         nb_eval_steps += 1
#     avg_valid_loss = valid_loss / len(valid_dataLoader)
#     valid_accuracy = eval_accuracy/(nb_eval_steps)
#     if best_acc < valid_accuracy:
#         best_acc = valid_accuracy
#         torch.save(
#             {
#                 "model": "FeedbackModel",
#                 "model_state_dict": model.state_dict(),
#                 "description": f"FeedbackModel 체크포인트-{epoch_i}",
#             },
#             f"/kaggle/working/checkpoint-{epoch_i}.pt",
#         )
#     print(epoch_i + 1, f'  Average valid loss: {avg_valid_loss:.4f}')
#     print(f'  Accuracy: {valid_accuracy:.4f}')