## Wandb

In [1]:
!wandb login ***

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\Mubuky\.netrc


## Import packages

In [2]:
import os
import re
import torch
import wandb
import random
import warnings
import collections
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F

from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification

In [3]:
warnings.filterwarnings("ignore")

## Configurations

In [4]:
dtype = torch.FloatTensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = {
	'seed': 5201314,
	'batch_size': 16,
	'learning_rate': 1e-5,
	'num_workers': 0,
	'save_path': './models/',
#	'output_path': './pred.csv',
	'n_epochs': 3,
    'padding_length': 512,
    'num_classes': 4  # -2, -1, 0, 1
}

In [5]:
wandb.init(
    project = "SA",
    name = "BERT",
    config = config
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: mubuky (acsdf). Use `wandb login --relogin` to force relogin


## Read CSV

In [6]:
data_path = './data/'
df_train = pd.read_csv(data_path + 'train.csv')
df_valid = pd.read_csv(data_path + 'valid.csv')
df_testa = pd.read_csv(data_path + 'testa.csv')

In [7]:
df_train = df_train.loc[ : , ['content', 'service_waiters_attitude']]
df_valid = df_valid.loc[ : , ['content', 'service_waiters_attitude']]
df_testa = df_testa.loc[ : , ['content', 'service_waiters_attitude']]

## Pretreatment

### 繁体转简体

In [8]:
from opencc import OpenCC

cc = OpenCC('t2s')

In [9]:
df_train['content'] = df_train['content'].apply(lambda x:cc.convert(x))
df_valid['content'] = df_valid['content'].apply(lambda x:cc.convert(x))
df_testa['content'] = df_testa['content'].apply(lambda x:cc.convert(x))

### 正则

In [10]:
def regular_sentence(content):
    decimal_regex = re.compile(r"[^a-zA-Z]\d+")

    content = content.replace("\r\n", " ").replace("\n", " ")
    content = decimal_regex.sub(r"", content)

    #return "".join(re.findall('[\u4e00-\u9fa5]+', content, re.S))
    return content

In [11]:
df_train['content'] = df_train['content'].apply(lambda x:regular_sentence(x))
df_valid['content'] = df_valid['content'].apply(lambda x:regular_sentence(x))
df_testa['content'] = df_testa['content'].apply(lambda x:regular_sentence(x))

## Set Seeds

In [12]:
def same_seed(seed):
#    torch.use_deterministic_algorithms(True)
#    torch.backends.cudnn.enabled = False
#    torch.backends.cudnn.benchmark = False
    
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

#    os.environ['PYTHONHASHSEED'] = str(seed)
#    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

same_seed(config['seed'])

#def seed_worker(worker_id):
#    worker_seed = torch.initial_seed() % 2**32
#    numpy.random.seed(worker_seed)
#    random.seed(worker_seed)

#g = torch.Generator()
#g.manual_seed(config['seed'])

## Data

In [13]:
sentences_train = list(df_train['content'])
sentences_valid = list(df_valid['content'])
sentences_testa = list(df_testa['content'])
labels_train = list(df_train['service_waiters_attitude'])
labels_train = [i + 2 for i in labels_train]
labels_valid = list(df_valid['service_waiters_attitude'])
labels_valid = [i + 2 for i in labels_valid]

In [14]:
class SADataset(Data.Dataset):
    def __init__(self, data, labels = None):
        self.data = data
        self.labels = torch.LongTensor(labels)
    def __getitem__(self, index):
        data = self.data[index]
        label = self.labels[index]
        return data, label
    def __len__(self):
        return len(self.data)

In [15]:
train_dataset = SADataset(sentences_train, labels_train)
valid_dataset = SADataset(sentences_valid, labels_valid)

In [16]:
train_loader = Data.DataLoader(
    dataset=train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=config['num_workers'],
    drop_last=True,
#    worker_init_fn=seed_worker,
#    generator=g,
)
valid_loader = Data.DataLoader(
    dataset=valid_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=config['num_workers'],
    drop_last=True,
#    worker_init_fn=seed_worker,
#    generator=g,
)

## Training

In [17]:
def get_f1_score(labels, pred):
    return f1_score(labels, pred, labels=[0, 1, 2, 3], average='macro')

In [18]:
def trainer(train_loader, valid_loader, tokenizer, model, config, device):
    #criterion = AMSoftmax()
    criterion = nn.CrossEntropyLoss()#.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'])
#	scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
#		optimizer,
#		eta_min=config['learning_rate']/50.0,
#		T_0=config['n_epochs']
#	)

    if not os.path.isdir(config['save_path']):
        os.mkdir(config['save_path'])
    
    best_f1 = 0.0
    n_epochs = config['n_epochs']
    
    for epoch in range(n_epochs):

        # train
        model.train()
        acc_record, loss_record, record_count = 0.0, 0.0, 0
        prediction = []
        groundtruth = []
        train_pbar = tqdm(train_loader, position=0, leave=True)
        train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
        
        for data, labels in train_pbar:
            optimizer.zero_grad()

            data = tokenizer(data, max_length = 108, padding = True, truncation = True, return_tensors = 'pt')
            data = data.to(device)
            labels = labels.to(device)

            pred = model(**data, labels = labels)
            pred = pred[1]
            
            loss = criterion(pred, labels)
            pred_flate = pred.argmax(dim = 1)
            acc = torch.mean((pred_flate == labels).float())
            f1 = get_f1_score(labels.tolist(), pred_flate.tolist())
            prediction += pred_flate.tolist()
            groundtruth += labels.tolist()
            
            loss.backward()
            optimizer.step()
            
            record_count += 1
            loss_record += loss.item()
            acc_record += acc.item()
            train_pbar.set_postfix({'loss': loss.item(), 'acc': acc.item(), 'f1': f1})
            wandb.log({"train/acc": acc, "train/loss": loss, "train/f1": f1})
            
        train_acc = acc_record / record_count
        train_loss = loss_record / record_count
        train_f1 = get_f1_score(groundtruth, prediction)
        
        print('TRAIN: epoch:{}, loss:{:.3f}, acc:{:.3f}, f1_score:{:.3f}'.format(epoch + 1, train_loss, train_acc, train_f1))

        # valid
        model.eval()
        
        acc_record, loss_record, record_count = 0.0, 0.0, 0
        prediction = []
        groundtruth = []
        valid_pbar = tqdm(valid_loader, position=0, leave=True)
        valid_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
        for data, labels in valid_pbar:

            data = tokenizer(data, max_length = config['padding_length'], padding = True, truncation = True, return_tensors = 'pt')
            data = data.to(device)
            labels = labels.to(device)

            with torch.no_grad():
                pred = model(**data, labels = labels)
                pred = pred[1]
                
                loss = criterion(pred, labels)
                pred_flate = pred.argmax(dim=1)
                acc = torch.mean((pred_flate == labels).float())
                f1 = get_f1_score(labels.tolist(), pred_flate.tolist())
                prediction += pred_flate.tolist()
                groundtruth += labels.tolist()

            record_count += 1
            loss_record += loss.item()
            acc_record += acc.item()
            valid_pbar.set_postfix({'loss': loss.item(), 'acc': acc.item(), 'f1': f1})
        
        valid_acc = acc_record / record_count
        valid_loss = loss_record / record_count
        valid_f1 = get_f1_score(groundtruth, prediction)

        #scheduler.step()

        print('VALID: epoch:{}, loss:{:.3f}, acc:{:.3f}, f1_score:{:.3f}'.format(epoch + 1, valid_loss, valid_acc, valid_f1))
        
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            torch.save(model.state_dict(), config['save_path'] + 'model.ckpt')
            print('Saving model with f1 {:.5f}'.format(best_f1))


In [19]:
tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-chinese')

In [20]:
bertconfig = BertConfig.from_pretrained('google-bert/bert-base-chinese', num_labels = config['num_classes'])

In [21]:
model = BertForSequenceClassification.from_pretrained('google-bert/bert-base-chinese', config = bertconfig).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
trainer(train_loader, valid_loader, tokenizer, model, config, device)

  0%|          | 0/6562 [00:00<?, ?it/s]

TRAIN: epoch:1, loss:0.941, acc:0.607, f1_score:0.493


  0%|          | 0/937 [00:00<?, ?it/s]

VALID: epoch:1, loss:0.584, acc:0.813, f1_score:0.733
Saving model with f1 0.73267


  0%|          | 0/6562 [00:00<?, ?it/s]

TRAIN: epoch:2, loss:0.880, acc:0.635, f1_score:0.538


  0%|          | 0/937 [00:00<?, ?it/s]

VALID: epoch:2, loss:0.519, acc:0.834, f1_score:0.756
Saving model with f1 0.75631


  0%|          | 0/6562 [00:00<?, ?it/s]

TRAIN: epoch:3, loss:0.825, acc:0.660, f1_score:0.578


  0%|          | 0/937 [00:00<?, ?it/s]

VALID: epoch:3, loss:0.554, acc:0.810, f1_score:0.736


In [23]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/acc,▃▁▅▅▅▃▃▄▁▃▄▅▂▆▄▄▁█▃▂▅▂▅▄▆▅▄▇▃▄▅▄▃▂▄▅▁▄▄▂
train/f1,▃▂▃▁▅▂▂▃▂▂▃▅▁▂▅▂▂█▂▂▃▁▃▄▃▄▂█▂▂▆▄▂▄▄▂▂▃▁▁
train/loss,▆█▅▄▄▇█▃▆▇▅▆▆▃▄▃▆▂▆▆▅▆▄▄▃▆▄▁▅▅▅▄▆▇▅▆█▆▄▆

0,1
train/acc,0.6875
train/f1,0.48397
train/loss,0.79499
