# imports

In [None]:
import warnings
warnings.simplefilter('ignore')

import os
import gc
import copy
import time
import random

import numpy as np
import pandas as pd
# Use fully-qualified option keys: the short forms 'max_columns' / 'max_rows'
# are matched as patterns and become ambiguous (display.* vs styler.render.*)
# on newer pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 200)
from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig, get_linear_schedule_with_warmup

# Create the output folders (model checkpoints, out-of-fold predictions,
# test predictions) up front; existing folders are left untouched.
for _out_dir in ('models', 'oofs', 'preds'):
    os.makedirs(_out_dir, exist_ok=True)

# loading data

In [None]:
# Both files are read as JSON Lines (one JSON record per line).
df_train = pd.read_json(path_or_buf='Sohu2022_data/nlp_data/train.txt', lines=True)
df_test = pd.read_json(path_or_buf='Sohu2022_data/nlp_data/test.txt', lines=True)

# Quick sanity check on the number of rows / columns that were loaded.
print(f'{df_train.shape} {df_test.shape}')

In [None]:
# Expand every raw record into one training row per entity.
# row['entity'] is used as a mapping entity -> label; the model input is the
# entity name, a literal ' [SEP] ' marker, and then the record's content.
train_data = [
    {
        'id': f'{row["id"]}_{entity}',
        'text': f'实体: {entity} [SEP] ' + row['content'],
        'label': label,
    }
    for _, row in tqdm(df_train.iterrows())
    for entity, label in row['entity'].items()
]

df_train = pd.DataFrame(train_data)

In [None]:
# Expand every raw test record into one row per entity, using the same text
# template as the training rows. Unlike the training rows, 'id' keeps the
# bare record id, so all entities of one record share the same id value.
test_data = [
    {
        'id': f'{row["id"]}',
        'text': f'实体: {entity} [SEP] ' + row['content'],
    }
    for _, row in tqdm(df_test.iterrows())
    for entity in row['entity']
]

df_test = pd.DataFrame(test_data)

In [None]:
# Shift labels by +2 so they can be used as class indices.
# Presumably the raw labels lie in [-2, 2] -- TODO confirm against the data.
# NOTE: this is not idempotent; re-running the cell shifts the labels again.
df_train['label'] = df_train['label'] + 2
df_train['label'].value_counts()

In [None]:
# Character-length summary of the model inputs, train first and then test.
for _frame in (df_train, df_test):
    display(_frame['text'].map(len).describe())

# config

In [None]:
class Config:
    """Container for every hyper-parameter and runtime setting of this run.

    Instantiating it reads the module-level ``df_train`` (to count the label
    classes) and loads the tokenizer for ``MODEL_PATH``.
    """

    def __init__(self):
        super().__init__()

        # --- general -------------------------------------------------------
        self.SEED = 42
        self.MODEL_PATH = 'hfl/chinese-roberta-wwm-ext'
        # One output unit per distinct label value present in df_train.
        self.NUM_CLASSES = df_train['label'].nunique()

        # --- data ----------------------------------------------------------
        # Per-class loss weights; an empty list selects the unweighted loss.
        self.CLASSES_WEIGHTS = []
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_PATH)
        self.MAX_LENGTH = 512
        self.BATCH_SIZE = 8
        self.ACCUMULATION_STEPS = 1
        self.N_FOLDS = 5

        # --- model / optimisation -------------------------------------------
        self.DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.FULL_FINETUNING = True
        self.LR = 2e-5
        # How many validation passes are scheduled inside one training epoch.
        self.N_VALIDATE_DUR_TRAIN = 3
        self.N_WARMUP = 0
        self.SAVE_BEST_ONLY = True
        self.EPOCHS = 3
        self.USE_FGM = False


CONFIG = Config()


def seed_torch(seed=42):
    """Seed every random number generator used by the pipeline.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so turn it off explicitly.
    torch.backends.cudnn.benchmark = False


# Seed all RNGs with the configured seed. seed_torch() already seeds NumPy,
# so the explicit np.random.seed() call is redundant but harmless.
np.random.seed(CONFIG.SEED)
seed_torch(seed=CONFIG.SEED)

# Restrict the process to GPU 0.
# NOTE(review): Config() has already called torch.cuda.is_available() by the
# time this runs; the variable may need to be set before the first CUDA query
# to take effect -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Module-level device handle used by the training / inference helpers.
device = CONFIG.DEVICE

# dataset

In [None]:
class SentiDataset(Dataset):
    """Torch dataset that tokenizes one text (and optional label) per item.

    Args:
        df: DataFrame with a ``text`` column and, unless ``set_type`` is
            ``'test'``, a ``label`` column.
        indices: Index labels selecting the rows of ``df`` to expose
            (applied with ``df.loc``).
        set_type: Pass ``'test'`` for unlabeled data; any other value makes
            ``__getitem__`` also return ``labels``.
    """

    def __init__(self, df, indices, set_type=None):
        super().__init__()

        df = df.loc[indices]
        self.texts = df['text'].values.tolist()
        self.set_type = set_type
        if self.set_type != 'test':
            self.labels = df['label'].values.tolist()

        self.tokenizer = CONFIG.TOKENIZER
        self.max_length = CONFIG.MAX_LENGTH

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return the tokenized sample at ``index`` as a dict of long tensors."""
        # `padding='max_length'` replaces the deprecated `pad_to_max_length`
        # flag, and calling the tokenizer directly replaces `encode_plus`.
        tokenized = self.tokenizer(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        # Drop only the leading batch axis added by return_tensors='pt';
        # a bare squeeze() would also remove a sequence axis of length 1.
        item = {
            'input_ids': tokenized['input_ids'].squeeze(0).long(),
            'attention_mask': tokenized['attention_mask'].squeeze(0).long(),
        }
        if self.set_type != 'test':
            item['labels'] = torch.tensor(self.labels[index], dtype=torch.long)
        return item

# model

In [None]:
class Model(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    The encoder is loaded from ``CONFIG.MODEL_PATH``; the head maps the
    encoder's hidden size to ``CONFIG.NUM_CLASSES`` logits.
    """

    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained(CONFIG.MODEL_PATH)
        self.drop = nn.Dropout(p=0.2)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, CONFIG.NUM_CLASSES)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids."""
        # With return_dict=False the encoder returns a tuple; the second
        # element is the pooled output that feeds the classifier head.
        _sequence_output, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        return self.out(self.drop(pooled_output))

# train functions

In [None]:
def val_fn(model, valid_dataloader, criterion):
    """Evaluate ``model`` on ``valid_dataloader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        valid_dataloader: Loader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        criterion: Loss function applied to ``(logits, labels)``.

    Returns:
        Tuple ``(avg_val_loss, avg_val_acc)``: the mean of the per-batch
        losses and the fraction of correctly classified samples. Both are
        ``nan`` when the loader is empty.
    """
    val_loss = 0.0
    n_correct = 0
    n_samples = 0
    model.eval()
    with torch.no_grad():
        for batch in tqdm(valid_dataloader,
                          total=len(valid_dataloader),
                          desc='validing'):
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)

            logits = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            val_loss += criterion(logits, b_labels).item()

            preds = logits.argmax(dim=1)
            n_correct += (preds == b_labels).sum().item()
            # Count the real number of samples: the last batch is usually
            # smaller than the configured batch size, so dividing by
            # len(loader) * batch_size would under-report the accuracy.
            n_samples += b_labels.size(0)

    n_batches = len(valid_dataloader)
    avg_val_loss = val_loss / n_batches if n_batches else float('nan')
    avg_val_acc = n_correct / n_samples if n_samples else float('nan')
    print('Val loss:', avg_val_loss, 'Val acc:', avg_val_acc)
    return avg_val_loss, avg_val_acc

In [None]:
def predict_prob(model, dl):
    """Run ``model`` over ``dl`` and collect its outputs as one NumPy array.

    Note that, despite the name, the values returned are the raw model
    outputs (logits); no softmax is applied.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        dl: Loader yielding dicts with ``input_ids`` and ``attention_mask``.

    Returns:
        ``np.ndarray`` with one row of model outputs per sample, in loader
        order.
    """
    model.eval()
    collected = []
    for _, batch in tqdm(enumerate(dl), total=len(dl), desc='infering'):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            batch_logits = model(input_ids=ids, attention_mask=mask)
        collected.extend(batch_logits.cpu().numpy())
    return np.array(collected)

In [None]:
def train_fn(model, train_dataloader, valid_dataloader, criterion, optimizer, scheduler, epoch):
    """Train ``model`` for one pass over ``train_dataloader``.

    Gradients are accumulated over ``CONFIG.ACCUMULATION_STEPS`` batches
    before each optimizer / scheduler step. Validation is run up to
    ``CONFIG.N_VALIDATE_DUR_TRAIN`` times inside the epoch. When
    ``CONFIG.USE_FGM`` is set, an additional adversarial backward pass is
    made per batch using an ``FGM`` helper that must be defined elsewhere.

    Args:
        model: Module returning logits for ``input_ids`` / ``attention_mask``.
        train_dataloader: Loader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        valid_dataloader: Loader passed to ``val_fn`` for in-epoch validation.
        criterion: Loss function applied to ``(logits, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler stepped after every optimizer step.
        epoch: Index of the current epoch (currently unused).
    """
    n_batches = len(train_dataloader)
    accumulation_steps = CONFIG.ACCUMULATION_STEPS

    # We validate CONFIG.N_VALIDATE_DUR_TRAIN times during the training loop.
    # The interval is rounded down to a multiple of 100 steps when possible;
    # for short epochs that rounding would give 0 (i.e. "validate only at
    # step 0"), so fall back to the unrounded interval in that case.
    nv = CONFIG.N_VALIDATE_DUR_TRAIN
    interval = n_batches // nv if nv > 0 else 0
    rounded = interval - (interval % 100)
    if rounded > 0:
        interval = rounded
    validate_at_steps = (
        {interval * x for x in range(1, nv + 1)} if interval > 0 else set()
    )

    if CONFIG.USE_FGM:
        fgm = FGM(model, epsilon=1, emb_name='word_embeddings.')

    train_loss = 0
    for step, batch in tqdm(enumerate(train_dataloader),
                            total=n_batches,
                            desc='training'):
        # val_fn() switches the model to eval mode, so re-enable train mode
        # on every iteration.
        model.train()

        # unpack the batch contents and push them to the device (cuda or cpu).
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        # forward pass
        logits = model(input_ids=b_input_ids, attention_mask=b_attention_mask)

        # calculate loss; log the unscaled value so the reported training
        # loss does not depend on the accumulation setting
        loss = criterion(logits, b_labels)
        train_loss += loss.item()
        loss = loss / accumulation_steps

        # backward pass
        loss.backward()

        # fgm attack
        if CONFIG.USE_FGM:
            fgm.attack()
            logits_adv = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            # scale like the clean loss so both gradients accumulate equally
            loss_adv = criterion(logits_adv, b_labels) / accumulation_steps
            loss_adv.backward()
            fgm.restore()

        # Step at every accumulation boundary and also on the last batch, so
        # leftover gradients are applied instead of leaking into the next epoch.
        is_last_batch = (step + 1) == n_batches
        if (step + 1) % accumulation_steps == 0 or is_last_batch:
            # update weights
            optimizer.step()
            # clear accumulated gradients
            optimizer.zero_grad()
            # update scheduler
            scheduler.step()

        if step in validate_at_steps:
            print(f'-- Step: {step}')
            _ = val_fn(model, valid_dataloader, criterion)

    avg_train_loss = train_loss / max(n_batches, 1)
    print('Training loss:', avg_train_loss)

In [None]:
def metric_fn(y_true, y_pred):
    """Score predictions against the ground truth.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.

    Returns:
        Tuple ``(accuracy, macro-averaged F1)``.
    """
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    )

# kfold training

In [None]:
# Stratified K-fold training: one model per fold, best checkpoint kept on
# disk, out-of-fold predictions pickled per fold.
# NOTE(review): rows built from the same source record can land in different
# folds with a plain stratified split; a grouped split on the record id may
# give a more honest validation score -- confirm.
folds = StratifiedKFold(
    n_splits=CONFIG.N_FOLDS,
    shuffle=True,
    random_state=CONFIG.SEED,  # make the fold assignment reproducible
)
for fold, (tr_ind, val_ind) in enumerate(folds.split(df_train, df_train['label'])):

    start_time = time.time()

    train = df_train.loc[tr_ind]
    valid = df_train.loc[val_ind]

    train_ds = SentiDataset(train, tr_ind)
    valid_ds = SentiDataset(valid, val_ind)
    # Shuffle the training batches; the rows are otherwise fed in file order.
    train_dl = DataLoader(train_ds, batch_size=CONFIG.BATCH_SIZE, shuffle=True)
    # The validation loader must stay unshuffled: its output order is matched
    # positionally against `valid` when the OOF frame is built below.
    valid_dl = DataLoader(valid_ds, batch_size=CONFIG.BATCH_SIZE)

    torch.manual_seed(CONFIG.SEED)
    if len(CONFIG.CLASSES_WEIGHTS) > 0:
        criterion = nn.CrossEntropyLoss(
            weight=torch.tensor(CONFIG.CLASSES_WEIGHTS, dtype=torch.float).to(device)
        )
    else:
        criterion = nn.CrossEntropyLoss()
    model = Model()
    model = model.to(device)

    if CONFIG.FULL_FINETUNING:
        # Fine-tune everything; biases and LayerNorm parameters are excluded
        # from weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = optim.AdamW(optimizer_parameters, lr=CONFIG.LR)
    else:
        # Previously this branch left `optimizer` undefined. Freeze the
        # encoder and train only the classification head.
        for p in model.bert.parameters():
            p.requires_grad = False
        optimizer = optim.AdamW(model.out.parameters(), lr=CONFIG.LR)

    # The scheduler is stepped once per optimizer step, i.e. once per
    # accumulation window, not once per batch.
    steps_per_epoch = -(-len(train_dl) // CONFIG.ACCUMULATION_STEPS)  # ceil division
    num_training_steps = steps_per_epoch * CONFIG.EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=CONFIG.N_WARMUP,
        num_training_steps=num_training_steps
    )

    model_path = f'models/fold{fold}_best_model.pt'
    min_avg_val_loss = float('inf')
    for epoch in range(CONFIG.EPOCHS):
        train_fn(model, train_dl, valid_dl, criterion, optimizer, scheduler, epoch)
        avg_val_loss, _ = val_fn(model, valid_dl, criterion)

        improved = avg_val_loss < min_avg_val_loss
        if improved:
            print(f'--- Best Model. Val loss: {min_avg_val_loss} -> {avg_val_loss}')
            min_avg_val_loss = avg_val_loss
        # With SAVE_BEST_ONLY the checkpoint is written only on improvement;
        # otherwise it is overwritten every epoch so the file always exists
        # for the reload below. Saving the state dict directly avoids holding
        # a deep copy of the whole model in memory.
        if improved or not CONFIG.SAVE_BEST_ONLY:
            torch.save(model.state_dict(), model_path)

    # Restore the saved weights into the existing model instead of building
    # (and re-loading pretrained weights for) a second one.
    model.load_state_dict(torch.load(model_path, map_location=device))
    # Kept for backward compatibility with code that reads `best_model`.
    best_model = model

    valid_probs = predict_prob(model, valid_dl)
    valid_df = valid.copy()
    for i in range(CONFIG.NUM_CLASSES):
        valid_df[f'p{i}'] = valid_probs[:, i]
    valid_df['pred'] = valid_probs.argmax(axis=1)
    valid_df.to_pickle(f'oofs/fold{fold}_oof.pickle')

    acc, f1 = metric_fn(valid['label'], valid_df['pred'])

    used_time = time.time() - start_time

    print(f'fold {fold} score: acc={acc}, f1={f1} used_time: {used_time}')

In [None]:
# Load the per-fold out-of-fold frames written during training and stitch
# them back together in the row order of the original index.
oof = [
    pd.read_pickle(f'oofs/fold{fold}_oof.pickle')
    for fold in range(CONFIG.N_FOLDS)
]
df_oof = pd.concat(oof).sort_index()

In [None]:
# Overall out-of-fold score across all folds.
acc, f1 = metric_fn(y_true=df_train['label'], y_pred=df_oof['pred'])
print(f'OOF acc={acc}, f1={f1}')