In [1]:
import numpy as np
import pandas as pd
import time
import math
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from transformers import BertTokenizer,AutoModel,AdamW,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
from tqdm import tqdm
import copy
import torch.nn as nn
import os
import json
import gc
import random
from collections import Counter
from torch.cuda.amp import GradScaler

# Module-level AMP gradient scaler; train_fn uses it to scale the loss before backward()
# and to drive optimizer.step() / the scale update.
scaler = GradScaler()

In [19]:
class CFG:
    """Run configuration: plain class attributes read as ``CFG.<name>`` by the cells below."""
    input_path = './'  # not referenced by the visible (uncommented) code
    model_path = 'macbert-large_300_'  # used as the checkpoint filename prefix in train_loop; earlier candidates: nghuyong/ernie-2.0-large-en, studio-ousia/luke-large
    scheduler = 'cosine'  # LR schedule selected in train_loop.get_scheduler: 'linear' or 'cosine'
    batch_scheduler = True  # not referenced by the visible code (train_fn always steps the scheduler per batch)
    num_cycles = 0.5  # passed to get_cosine_schedule_with_warmup (previously tried: 1.5)
    num_warmup_steps = 0  # multiplied by the total step count in get_scheduler, i.e. it acts as a warmup fraction
    max_input_length = 300  # tokenizer max_length used by TrainDataset
    epochs = 6  # previously tried: 5
    encoder_lr = 25e-6  # learning rate applied to both optimizer parameter groups
    decoder_lr = 25e-6  # forwarded to get_optimizer_params, which does not use it
    min_lr = 0.5e-6  # not referenced by the visible code
    eps = 1e-6  # AdamW epsilon
    betas = (0.9, 0.999)  # AdamW betas
    weight_decay = 0.01  # weight decay for the parameter group that is not bias/LayerNorm
    num_fold = 5  # not referenced by the visible code (the fold split hard-codes n_splits=5)
    batch_size = 32  # training batch size; validation/inference loaders use twice this
    seed = 1006  # passed to seed_everything
    OUTPUT_DIR = './'  # prefix for the log file and saved checkpoints
    num_workers = 8  # DataLoader worker processes
    device='cuda'  # device that models and batches are moved to
    print_freq = 100  # progress is printed every print_freq steps in train_fn / valid_fn

### Collate function

In [4]:
class Collate:
    """Collate function that pads token sequences to the longest one in the batch.

    Each sample is a dict holding ``input_ids`` and ``attention_mask`` lists
    (plus ``target`` when ``isTrain`` is true). Padding is appended or prepended
    according to ``tokenizer.padding_side``; ids are padded with
    ``tokenizer.pad_token_id`` and masks with 0.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def _pad(self, seq, width, fill):
        """Return ``seq`` extended with ``fill`` up to ``width`` on the tokenizer's padding side."""
        filler = (width - len(seq)) * [fill]
        if self.tokenizer.padding_side == "right":
            return seq + filler
        return filler + seq

    def __call__(self, batch):
        """Build the batch dict.

        Returns a dict with long tensors ``input_ids`` and ``attention_mask``;
        in training mode it also carries the raw ``target`` list and a long
        tensor ``label`` built from it.
        """
        id_rows = [sample["input_ids"] for sample in batch]
        mask_rows = [sample["attention_mask"] for sample in batch]
        targets = [sample["target"] for sample in batch] if self.isTrain else None

        # The common width is driven by the longest id sequence in this batch.
        width = max(len(row) for row in id_rows)
        pad_id = self.tokenizer.pad_token_id

        collated = {
            "input_ids": torch.tensor([self._pad(row, width, pad_id) for row in id_rows],
                                      dtype=torch.long),
            "attention_mask": torch.tensor([self._pad(row, width, 0) for row in mask_rows],
                                           dtype=torch.long),
        }
        if self.isTrain:
            collated["target"] = targets
            collated["label"] = torch.tensor(targets, dtype=torch.long)
        return collated
        
#         return torch.as_tensor(inputs['input_ids'], dtype=torch.long), \
#                torch.as_tensor(inputs['attention_mask'], dtype=torch.long), \
#                torch.as_tensor(label, dtype=torch.long)
    

# Shared tokenizer for the MacBERT-large checkpoint; Collate reads its pad token id and padding side.
tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-macbert-large')
# Training-mode collator (also emits labels); train_loop passes it to both the train and validation loaders.
collate_fn = Collate(tokenizer, isTrain=True)

Downloading:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/660 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]



















































































### Model

In [5]:
import torch
import torch.nn.functional as F


def kl(inputs, targets, reduction="sum"):
    """KL divergence between two sets of logits.

    inputs: tensor of logits, turned into log-probabilities over the last axis.
    targets: tensor of logits, turned into probabilities over the last axis.
    reduction: forwarded unchanged to ``F.kl_div``.
    """
    log_probs = F.log_softmax(inputs, dim=-1)
    target_probs = F.softmax(targets, dim=-1)
    return F.kl_div(log_probs, target_probs, reduction=reduction)


def adv_project(grad, norm_type='inf', eps=1e-6):
    """Normalise a perturbation/gradient tensor along its last axis.

    norm_type 'l2' divides by the L2 norm, 'l1' returns the element-wise sign,
    and any other value divides by the maximum absolute value. ``eps`` is added
    to the divisor in the two dividing branches.
    """
    if norm_type == 'l1':
        return grad.sign()
    if norm_type == 'l2':
        scale = torch.norm(grad, dim=-1, keepdim=True)
    else:
        scale = grad.abs().max(-1, keepdim=True)[0]
    return grad / (scale + eps)


def virtual_adversarial_training(model, hidden_status, attention_mask, logits):
    """Virtual adversarial training loss computed on perturbed input embeddings.

    model: nn.Module called with ``inputs_embeds``, ``input_ids=None`` and ``attention_mask``.
    hidden_status: tensor, the embedded representation of the input.
    attention_mask: tensor, BERT attention mask (masks padding).
    logits: tensor, the model output for the clean input.

    Returns the sum of the two directed KL terms between clean and adversarial
    logits, or None when the norm of the perturbation gradient is NaN or infinite
    (the caller has to handle that case).
    """
    embed = hidden_status
    # Initial random perturbation r (small Gaussian noise)
    noise = embed.data.new(embed.size()).normal_(0, 1) * 1e-5
    noise.requires_grad_()
    # x + r
    new_embed = embed.data.detach() + noise
    adv_output = model(inputs_embeds=new_embed,
                       input_ids = None,
#                        token_type_ids=token_type_ids,
                       attention_mask=attention_mask)
    # NOTE(review): index 0 is presumably the logits because no labels are passed — confirm for the model in use
    adv_logits = adv_output[0]
    adv_loss = kl(adv_logits, logits.detach(), reduction="batchmean")
    # Gradient of the KL term with respect to the noise only
    delta_grad, = torch.autograd.grad(adv_loss, noise, only_inputs=True)
    norm = delta_grad.norm()

# Bail out if the gradient norm is NaN or infinite
    if torch.isnan(norm) or torch.isinf(norm):
        return None

    # line 6 inner sum (step numbering presumably refers to the reference algorithm — TODO confirm source)
    noise = noise + delta_grad * 1e-3
    # line 6 projection
    noise = adv_project(noise, norm_type='l2', eps=1e-6)
    new_embed = embed.data.detach() + noise
    new_embed = new_embed.detach()
    # Second forward pass, this time on the adversarial embeddings
    adv_output = model(inputs_embeds=new_embed,
                       input_ids = None,
#                        token_type_ids=token_type_ids,
                       attention_mask=attention_mask)
    adv_logits = adv_output[0]
    # Both KL directions, each with the other side detached; kl() defaults to reduction="sum" here
    adv_loss_f = kl(adv_logits, logits.detach())
    adv_loss_b = kl(logits, adv_logits.detach())
    # Weight: set to 10 for pre-training, 1 for downstream tasks
    adv_loss = (adv_loss_f + adv_loss_b) * 1

    return adv_loss


In [6]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and current CUDA device) RNGs.

    Also exports the seed as PYTHONHASHSEED and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
# Seed all RNGs once, at cell execution time, with the configured seed.
seed_everything(CFG.seed)

class FGM():
    """Fast Gradient Method adversarial training.

    Perturbs the model's embedding parameters in place along their gradient
    direction (``attack``) and puts the original values back (``restore``).
    """
    def __init__(self, model, epsilon=0.25):
        self.model = model
        self.epsilon = epsilon
        # parameter name -> copy of its data taken right before the perturbation
        self.backup = {}

    def attack(self, embed_name='word_embeddings'):
        """Add an adversarial perturbation to the embedding parameters.

        :param embed_name: substring identifying the embedding parameters by name
        :return: None

        Every matching trainable parameter is backed up. The perturbation
        ``epsilon * grad / ||grad||`` is applied only when a gradient exists and
        its norm is non-zero and not NaN.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and embed_name in name:
                self.backup[name] = param.data.clone()
                # No backward pass has populated this parameter yet: nothing to
                # perturb, but the backup is kept so restore() stays consistent.
                if param.grad is None:
                    continue
                norm = torch.norm(param.grad)

                if norm != 0 and not torch.isnan(norm):
                    r_at = self.epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, embed_name='word_embeddings'):
        """Restore the parameters saved by ``attack`` and clear the backup.

        :param embed_name: same substring that was passed to ``attack``
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and embed_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

## 1. Read Data & EDA

In [7]:
def read_jsonfile(file_name):
    """Read a JSON Lines file (one JSON document per line) into a list.

    The file is decoded as UTF-8 explicitly so non-ASCII text does not depend on
    the platform's default encoding, it is streamed line by line instead of being
    loaded whole, and blank lines are skipped rather than failing JSON decoding.

    :param file_name: path of the file to read
    :return: list of decoded objects, in file order
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    data = []
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    return data

# train = pd.DataFrame(read_jsonfile(CFG.input_path + "/train.json"))
# Training data comes from a pre-cleaned CSV; the displayed head shows `sentence`, `label` and `fold` columns.
train = pd.read_csv('cleaned_all.csv', sep=',')
#add = pd.read_csv('/content/drive/MyDrive/ccf/additional_train.csv')
#train = pd.concat([train, add])
# NOTE(review): the only assignment of `test` is commented out below, yet the inference cell reads `test`.
# test = pd.DataFrame(read_jsonfile("./datasets/testA.json"))
# Coerce the fold ids read from the CSV to Python ints.
train['fold'] = train['fold'].apply(lambda x :int(x))
# In-place reset: the previous index is kept as a new `index` column (visible in the displayed head).
train.reset_index(inplace=True)

In [8]:
train.head()

Unnamed: 0,index,sentence,label,fold
0,0,前程无忧8月7日发布第二季度财新浪科技讯美国东部时间7月30日530北京时间7月30日173...,1,0
1,1,网络流言泛滥韩星崔真实能否催生网络真实网络流言泛滥引发诸多事端10月2日凌晨，韩国影星崔真实...,1,0
2,2,工信部合法前提下不限制谷歌Android平中新社北京一月二十七日电记者刘育英工业和信息化部新...,1,0
3,3,i5芯500G硬盘索尼EB18独显本报5880随着酷睿i系列处理器的普及，索尼于2010年3...,1,0
4,4,中外科学家探测到迄今已知最重反物质反氦本报讯记者许琦敏位于纽约长岛的美国布鲁克海文国家实验室...,1,0


In [9]:
# Re-assign every row to a label-stratified validation fold, overwriting the `fold`
# column read from the CSV. The number of folds comes from CFG.num_fold (5) instead
# of a second hard-coded literal. The former `groups=` argument is dropped because
# StratifiedKFold ignores it. No shuffling is requested, so the split is deterministic.
skf = StratifiedKFold(n_splits=CFG.num_fold)
for fold, (_, val_) in enumerate(skf.split(X=train, y=train.label)):
    train.loc[val_, "fold"] = fold

In [10]:
train.head()

Unnamed: 0,index,sentence,label,fold
0,0,前程无忧8月7日发布第二季度财新浪科技讯美国东部时间7月30日530北京时间7月30日173...,1,0
1,1,网络流言泛滥韩星崔真实能否催生网络真实网络流言泛滥引发诸多事端10月2日凌晨，韩国影星崔真实...,1,0
2,2,工信部合法前提下不限制谷歌Android平中新社北京一月二十七日电记者刘育英工业和信息化部新...,1,0
3,3,i5芯500G硬盘索尼EB18独显本报5880随着酷睿i系列处理器的普及，索尼于2010年3...,1,0
4,4,中外科学家探测到迄今已知最重反物质反氦本报讯记者许琦敏位于纽约长岛的美国布鲁克海文国家实验室...,1,0


### 1.1 Label Distribution

## 2. Build model Input and Dataset

In [11]:

# Model input is the raw `sentence` column (single segment; no title/assignee/abstract concatenation).
class TrainDataset(Dataset):
    """Dataset yielding tokenized sentences with their integer label.

    Reads the ``sentence`` and ``label`` columns of ``df``. Each item is
    tokenized on access with truncation and ``padding='max_length'`` at
    ``CFG.max_input_length``.
    """

    def __init__(self,df,tokenizer):
        self.sentence = df['sentence'].values
        self.label = df['label'].values
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, item):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``target`` for row ``item``."""
        target = int(self.label[item])
        encoded = self.tokenizer(
            self.sentence[item],
            truncation=True,
            max_length=CFG.max_input_length,
            padding='max_length',
        )
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'target': target,
        }
#         return torch.as_tensor(inputs['input_ids'], dtype=torch.long), \
#                torch.as_tensor(inputs['attention_mask'], dtype=torch.long), \
#                torch.as_tensor(label, dtype=torch.long)

In [12]:
import torch
import torch.utils.data
import random

class BalancedBatchSampler(torch.utils.data.sampler.Sampler):
    """Sampler that cycles through the classes, yielding one index per class in turn.

    Indices are bucketed by label. Buckets smaller than the largest one are
    topped up with random picks from themselves (oversampling), so every class
    contributes ``balanced_max`` indices per pass.
    """

    def __init__(self, dataset, labels=None):
        self.labels = labels
        self.dataset = dict()
        self.balanced_max = 0
        # Bucket every index under its label while tracking the largest bucket size.
        for idx in range(len(dataset)):
            bucket = self.dataset.setdefault(self._get_label(dataset, idx), list())
            bucket.append(idx)
            self.balanced_max = max(self.balanced_max, len(bucket))

        # Oversample the smaller buckets up to the size of the largest one.
        for bucket in self.dataset.values():
            for _ in range(self.balanced_max - len(bucket)):
                bucket.append(random.choice(bucket))
        self.keys = list(self.dataset.keys())
        self.currentkey = 0
        self.indices = [-1]*len(self.keys)

    def __iter__(self):
        """Yield indices round-robin over the classes until the current class is exhausted."""
        while self.indices[self.currentkey] < self.balanced_max - 1:
            slot = self.currentkey
            self.indices[slot] += 1
            yield self.dataset[self.keys[slot]][self.indices[slot]]
            self.currentkey = (slot + 1) % len(self.keys)
        # Rewind the per-class cursors so the sampler can be iterated again.
        self.indices = [-1]*len(self.keys)

    def _get_label(self, dataset, idx, labels = None):
        """Return the label of ``idx`` from ``self.labels``, or None when no labels were given."""
        if self.labels is None:
            return None
        return self.labels[idx].item()

    def __len__(self):
        return self.balanced_max*len(self.keys)

## 3. Build Model

In [13]:
# m_path = "../input/macbert-domain-pretrain/pretrain_domain_code/checkpoint-8792/pytorch_model.bin"
# c_path = "../input/macbert-domain-pretrain/pretrain_domain_code/checkpoint-8792/config.json"
from transformers.modeling_outputs import SequenceClassifierOutput

class MeanPooling(nn.Module):
    """Attention-mask-aware mean over dimension 1 of the hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over dim 1, counting only positions where the mask is non-zero.

        The mask is broadcast to the hidden-state shape, and the divisor is
        clamped to at least 1e-9 so an all-zero mask does not divide by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        return torch.sum(last_hidden_state * mask, 1) / token_counts

α = 1
def compute_kl_loss(p, q, pad_mask=None):
    """Symmetric KL divergence between two logit tensors.

    Computes KL in both directions element-wise, zeroes the entries selected by
    ``pad_mask`` when one is given (intended for sequence-level tasks), sums
    each direction and returns the average of the two sums.
    """
    def _directed(source, reference):
        # Element-wise KL terms with `reference` as the target distribution.
        per_element = F.kl_div(F.log_softmax(source, dim=-1),
                               F.softmax(reference, dim=-1),
                               reduction='none')
        if pad_mask is not None:
            per_element.masked_fill_(pad_mask, 0.)
        # Summed here; a mean could be used instead depending on the task.
        return per_element.sum()

    return (_directed(p, q) + _directed(q, p)) / 2
    

class FocalLoss(torch.nn.Module):
    """Multi-class Focal loss implementation"""

    def __init__(self, gamma=2, weight=None, ignore_index=-100):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.ignore_index = ignore_index

    def forward(self, input, target):
        """
        input: [N, C]
        target: [N, ]

        Scales each log-probability by (1 - p) ** gamma, then applies NLL loss
        with the configured class weights and ignore index.
        """
        log_probs = F.log_softmax(input, dim=1)
        modulator = (1 - torch.exp(log_probs)) ** self.gamma
        return F.nll_loss(modulator * log_probs, target, self.weight,
                          ignore_index=self.ignore_index)


# Module-level loss instance shared by the model and the training loop.
criterion = FocalLoss()

class Custom_Bert_Mean(nn.Module):
    """Transformer encoder followed by a mean-pooled linear classification head.

    The encoder is built with ``AutoModel.from_pretrained(model_path, config=...)``,
    where the config is loaded from ``config_path`` and forced to return all
    hidden states.
    """

    def __init__(self,model_path,config_path, num_labels=36):
        """
        model_path: passed to ``AutoModel.from_pretrained``.
        config_path: passed to ``AutoConfig.from_pretrained``.
        num_labels: output size of the classification layer (defaults to the
            previously hard-coded 36, so existing callers are unaffected).
        """
        super().__init__()

        config = AutoConfig.from_pretrained(config_path)
        config.output_hidden_states = True
        self.base = AutoModel.from_pretrained(model_path, config=config)
        dim = config.hidden_size
        self.dropout = nn.Dropout(p=0.1)
        self.cls = nn.Linear(dim, num_labels)

    def forward(self, input_ids, attention_mask, labels=None,inputs_embeds = None):
        """Run the encoder and classify the mean of its last hidden state.

        Either ``input_ids`` or ``inputs_embeds`` drives the encoder;
        ``inputs_embeds`` takes precedence when given. When ``labels`` is given,
        the loss is computed with the module-level ``criterion``.

        Returns a ``SequenceClassifierOutput`` carrying the logits, the loss
        (None without labels) and all encoder hidden states.
        """
        # Identity test: comparing a tensor to None with `!=` relies on tensor
        # comparison semantics instead of simply checking for the sentinel.
        if inputs_embeds is not None:
            base_output = self.base(inputs_embeds=inputs_embeds,
                                    attention_mask=attention_mask)
        else:
            base_output = self.base(input_ids=input_ids,
                                    attention_mask=attention_mask)
        hidden_states = base_output.hidden_states
        output = hidden_states[-1] # b, s ,h
        # NOTE(review): this is a plain mean over the sequence axis, so padded
        # positions are averaged in as well; attention_mask is not used for pooling.
        outputs = self.cls(self.dropout(torch.mean(output, dim=1)))
        loss = None
        if labels is not None:
            loss = criterion(outputs, labels)
        return SequenceClassifierOutput(logits=outputs, loss=loss,hidden_states=hidden_states)


# tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-TCBert-110M-Classification-Chinese')

In [14]:
# tokenizer

## 4. Build Train Pipeline

In [15]:
def get_score(preds, gts):
    """Return the accuracy of ``preds`` against the ground-truth labels ``gts``.

    The arguments are handed to scikit-learn in its documented
    ``(y_true, y_pred)`` order. Accuracy itself is symmetric, but the order
    matters as soon as the metric is swapped for an asymmetric one.
    """
    return accuracy_score(gts, preds)
    # return f1_score(gts, preds, average='macro')

In [16]:
def train_fn(train_loader, model, optimizer, epoch, scheduler, device):
    """Run one mixed-precision training epoch.

    train_loader: yields dict batches with ``label``, ``attention_mask`` and ``input_ids`` tensors.
    model: called as ``model(input_ids, mask, labels=label)``; only ``output.logits`` is used.
    optimizer / scheduler: stepped once per batch (the optimizer through the module-level ``scaler``).
    epoch: zero-based epoch index, used only for progress output.
    device: device the batch tensors are moved to.

    Returns the sample-weighted average training loss of the epoch.
    """
    model.train()
    losses = AverageMeter()
    start = time.time()
    for step, batch in enumerate(train_loader):
        label = batch["label"].to(device)
        mask = batch["attention_mask"].to(device)
        input_ids = batch["input_ids"].to(device)

        batch_size = label.size(0)

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast():
            output = model(input_ids, mask, labels=label)

        # The optimised objective is the module-level `criterion` applied to the
        # logits, not `output.loss`. (FGM / VAT adversarial terms were experimented
        # with at this point and are currently disabled.)
        loss = criterion(output.logits, label)

        losses.update(loss.item(), batch_size)

        scaler.scale(loss).backward()
        # The gradients are still multiplied by the loss scale at this point.
        # Unscale them first so the clipping threshold and the reported norm
        # refer to the true gradients.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 500)

        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          # get_last_lr() is the supported way to read the current LR;
                          # get_lr() is meant to be called only from inside step().
                          lr=scheduler.get_last_lr()[0]))
    return losses.avg

def valid_fn(valid_loader, model, device):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Batches are dicts with ``label``, ``attention_mask`` and ``input_ids``.
    The loss is taken from ``output.loss`` and predictions are the arg-max of
    ``output.logits``.

    Returns ``(average loss, predictions, labels)`` with the last two as
    concatenated NumPy arrays.
    """
    losses = AverageMeter()
    model.eval()
    pred_chunks = []
    label_chunks = []
    start = time.time()
    for step, batch in enumerate(valid_loader):
        label = batch["label"].to(device)
        mask = batch["attention_mask"].to(device)
        input_ids = batch["input_ids"].to(device)

        with torch.no_grad():
            output = model(input_ids, mask, labels=label)

        losses.update(output.loss.item(), label.size(0))
        pred_chunks.append(output.logits.argmax(dim=-1).to('cpu').numpy())
        label_chunks.append(label.to('cpu').numpy())

        is_last = step == (len(valid_loader) - 1)
        if step % CFG.print_freq == 0 or is_last:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step + 1) / len(valid_loader))))
    return losses.avg, np.concatenate(pred_chunks), np.concatenate(label_chunks)


def train_loop(fold, model, train_dataset, valid_dataset):
    """Train ``model`` on one fold and checkpoint the best epoch.

    fold: fold identifier, used only in the checkpoint file name.
    model: model to train; it is moved to ``CFG.device``.
    train_dataset / valid_dataset: datasets wrapped in DataLoaders that use the
        module-level ``collate_fn``.

    After every epoch the validation score is computed with ``get_score``.
    Whenever it improves, the model state dict and the validation predictions
    are saved under ``CFG.OUTPUT_DIR``.

    Returns the validation predictions of the best epoch, or None when no epoch
    was run.
    """
    LOGGER.info(f"========== training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,collate_fn=collate_fn,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,collate_fn=collate_fn,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model.to(CFG.device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Split parameters into a weight-decayed group and a no-decay group (bias / LayerNorm).

        ``decoder_lr`` is accepted for signature compatibility but both groups use ``encoder_lr``.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas,weight_decay=1e-2)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the warmup LR schedule selected by ``cfg.scheduler``."""
        # cfg.num_warmup_steps acts as a fraction of the total step count. The
        # product is kept local: writing it back to cfg would compound the value
        # every time train_loop runs (e.g. once per fold).
        num_warmup_steps = int(cfg.num_warmup_steps * num_train_steps)
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
        raise ValueError(f"Unknown scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")

    # The train loader drops the last incomplete batch, so the number of optimizer
    # steps is len(train_loader) per epoch, not len(dataset) / batch_size.
    num_train_steps = len(train_loader) * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # Start below any reachable score so the first epoch is always recorded and
    # best_predictions is bound even when the score never exceeds 0.
    best_score = float('-inf')
    best_predictions = None

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, optimizer, epoch, scheduler, CFG.device)

        # eval
        avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, model, CFG.device)

        # scoring
        score = get_score(predictions, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch + 1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch + 1} - Score: {score:.4f}')

        if best_score < score:
            best_score = score
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch + 1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                       CFG.OUTPUT_DIR + "{}_best{}.pth".format(CFG.model_path.replace('/', '_'),fold))

    # Drop the local references first so the collector and the CUDA cache can
    # actually release what they hold (the caller may still reference the model).
    del scheduler, optimizer, model
    gc.collect()
    torch.cuda.empty_cache()
    return best_predictions


## 5. Build Logger

In [17]:
class AverageMeter(object):
    """Tracks the latest value and a running weighted average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the weighted sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'`` (both shown as integers)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return elapsed time since ``since`` and the estimated remaining time.

    ``percent`` is the completed fraction of the work. The total is
    extrapolated as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [18]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    """Create the module logger that writes bare messages to the console and to ``<filename>.log``.

    ``getLogger(__name__)`` returns the same logger object on every call, so any
    handlers left from a previous call (for example when this notebook cell is
    re-run) are removed and closed first. Otherwise every message would be
    emitted once per stacked handler.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Module-level logger used by train_loop; the lines below record the key hyper-parameters of this run.
LOGGER = get_logger()
LOGGER.info('===============lr_{}==============='.format(CFG.encoder_lr))
LOGGER.info('===============seed_{}==============='.format(CFG.seed))
LOGGER.info('===============total_epochs_{}==============='.format(CFG.epochs))
LOGGER.info('===============num_warmup_steps_{}==============='.format(CFG.num_warmup_steps))



In [None]:

# Model config for the classifier: 5 output labels, with all hidden states returned.
config = AutoConfig.from_pretrained('hfl/chinese-macbert-large') # hfl/chinese-macbert-base
config.num_labels = 5
config.output_hidden_states = True

# Only fold 0 is trained (range(1)); a fresh pretrained model is created for each fold.
for i in range(1):
    fold = i
#     model = Custom_Bert_Mean()
    model = AutoModelForSequenceClassification.from_pretrained('hfl/chinese-macbert-large',config = config)
#     model = nn.DataParallel(model)
    # Rows whose `fold` equals the current fold form the validation set; all others are used for training.
    tr_data = train[train['fold']!=fold].reset_index(drop=True)
    va_data = train[train['fold']==fold].reset_index(drop=True)
#     va_data.to_csv('valid0.csv', index=None)
    tr_dataset = TrainDataset(tr_data,tokenizer)
    va_dataset =TrainDataset(va_data,tokenizer)
    # train_loop returns the validation predictions of the best epoch.
    val_result = train_loop(fold, model,tr_dataset, va_dataset)

Downloading:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

## Infer and Fusion

In [19]:
class TestDataset(Dataset):
    """Inference dataset that joins several text columns with the tokenizer's separator token.

    Reads the ``title``, ``assignee``, ``abstract``, ``start``, ``mid`` and
    ``tail`` columns of ``df``. ``abstract`` is stored but is not part of the
    model input.
    """

    def __init__(self, df, tokenizer):
        self.title = df['title'].values
        self.assignee = df['assignee'].values
        self.abstract = df['abstract'].values
        self.start = df['start'].values
        self.mid = df['mid'].values
        self.tail = df['tail'].values
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token

    def __len__(self):
        return len(self.title)

    def __getitem__(self, item):
        """Return ``(input_ids, attention_mask)`` long tensors for row ``item``."""
        sep = self.sep_token
        input_text = (self.title[item] + sep
                      + self.assignee[item] + sep
                      + self.start[item] + sep
                      + self.mid[item] + sep
                      + self.tail[item])
        encoded = self.tokenizer(input_text, truncation=True, max_length=400, padding='max_length')
        input_ids = torch.as_tensor(encoded['input_ids'], dtype=torch.long)
        attention_mask = torch.as_tensor(encoded['attention_mask'], dtype=torch.long)
        return input_ids, attention_mask

def infer(test_loader, model, device):
    """Predict a class and its probability for every sample in ``test_loader``.

    test_loader: yields ``(input_ids, attention_mask)`` batches.
    model: called with ``input_ids`` / ``attention_mask`` keywords; must expose ``output.logits``.
    device: device the model and the batches are moved to.

    Returns ``(predictions, probs)``: two NumPy arrays holding the arg-max class
    index and its softmax probability for each sample.
    """
    model.to(device)
    model.eval()
    preds = []
    probs = []
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and show a real progress bar. The step index was never used.
    for batch in tqdm(test_loader):
        mask = batch[1].to(device)
        input_ids = batch[0].to(device)
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=mask)
        class_probs = F.softmax(output.logits, dim=-1)
        prob, y_preds = class_probs.max(dim=-1)
        probs.append(prob.to('cpu').numpy())
        preds.append(y_preds.to('cpu').numpy())

    predictions = np.concatenate(preds)
    probs = np.concatenate(probs)
    return predictions, probs

def infer_5folds(test_loader, model, device):
    """Return the full softmax probability matrix for every sample in ``test_loader``.

    test_loader: yields ``(input_ids, attention_mask)`` batches.
    model: called with ``input_ids`` / ``attention_mask`` keywords; must expose ``output.logits``.
    device: device the model and the batches are moved to.

    Returns one NumPy array with the per-class probabilities of all samples,
    concatenated along the first axis (suitable for averaging across models).
    """
    model.to(device)
    model.eval()

    probs = []
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and show a real progress bar. The step index was never used.
    for batch in tqdm(test_loader):
        mask = batch[1].to(device)
        input_ids = batch[0].to(device)
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=mask)
        class_probs = F.softmax(output.logits, dim=-1)
        probs.append(class_probs.to('cpu').numpy())

    predictions = np.concatenate(probs)
    return predictions

# --- Model 1: MacBERT-large -------------------------------------------------
# NOTE(review): `test` is never assigned in the visible notebook (its read is commented out
# in the data-loading cell), so this cell cannot run on a fresh kernel as written.
# NOTE(review): the recorded run of this cell stopped with an OSError while loading `config_path`.
res = []
for fold in range(1):
#     saved_path = f"../input/hflrobertabaselineoutput/macbert_base_best{fold}.pth"
    model_path = "../input/macbert-large/pytorch_model.bin"
    config_path = "../input/macbert-large/config.json"
    model = Custom_Bert_Mean(model_path,config_path)
    print(f"Inference of model {fold} is starting")
#     model = AutoModelForSequenceClassification.from_pretrained('hfl/chinese-macbert-base',num_labels=36)
    tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-macbert-large')
    # NOTE(review): loading the fine-tuned checkpoint is commented out, so the model presumably
    # runs with whatever weights from_pretrained provides — confirm before trusting predictions.
#     model.load_state_dict(torch.load(saved_path)['model'])
    test_dataset = TestDataset(test, tokenizer)
    test_dataloader = DataLoader(test_dataset,
                                batch_size=CFG.batch_size * 2,
                                shuffle=False,
                                num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    # Per-sample class probabilities for this fold
    result_1fold = infer_5folds(test_dataloader, model, CFG.device)
    res.append(result_1fold)
# res = np.mean(res, axis=1)
# res = np.argmax(res, axis=-1)

# Average the probability matrices over the folds (axis 0 indexes folds).
res = np.array(res)
res = np.mean(res,axis=0)

# --- Model 2: TCBert-large (same procedure as above with a different checkpoint/tokenizer) ---
res_1 = []
for fold in range(1):
#     saved_path = f"../input/hflrobertabaselineoutput/macbert_base_best{fold}.pth"
    model_path = "../input/tcbert-large/pytorch_model.bin"
    config_path  ="../input/tcbert-large/config.json"
    model = Custom_Bert_Mean(model_path,config_path)
    print(f"Inference of model {fold} is starting")
#     model = AutoModelForSequenceClassification.from_pretrained('hfl/chinese-macbert-base',num_labels=36)
    tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-TCBert-330M-Classification-Chinese')
#     model.load_state_dict(torch.load(saved_path)['model'])
    test_dataset = TestDataset(test, tokenizer)
    test_dataloader = DataLoader(test_dataset,
                                batch_size=CFG.batch_size * 2,
                                shuffle=False,
                                num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    result_1fold = infer_5folds(test_dataloader, model, CFG.device)
    res_1.append(result_1fold)
# res = np.mean(res, axis=1)
# res = np.argmax(res, axis=-1)

res_1 = np.array(res_1)
res_1 = np.mean(res_1,axis=0)


# Fusion: equal-weight average of the two models' probabilities, then arg-max over classes.
final = np.argmax(0.5 *res + 0.5 * res_1,-1)

test['label'] = final

# Keep only the submission columns and write the CSV.
test = test[['id', 'label']]
test.to_csv('submit_Macbert_base_VAT_mul2_newFusion1.csv', index=None)

OSError: Can't load the configuration of '../input/macbert-large/config.json'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '../input/macbert-large/config.json' is the correct path to a directory containing a config.json file

In [1]:
import jieba

ModuleNotFoundError: No module named 'jieba'