In [None]:
import torch
import torch.nn as nn

config = {
    'train_file_path': '/content/drive/MyDrive/data/train.csv',
    'model_path': '/content/drive/MyDrive/BERT_model',
    'train_val_ratio': 0.1,
    'x_max_seq_len': 512,
    'y_max_seq_len': 64,
    'vocab_size': 30000,
    'batch_size': 8,
    'num_epoches': 3,
    'learning_rate': 1e-3,
    'logging_step': 300,
    'seed': 2021
}

config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu' 

import random
import numpy as np

def seed_everything(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

seed_everything(config['seed'])


In [None]:
from collections import Counter
from tqdm import tqdm
import jieba

def get_vocab(config):
  token_counter = Counter()

  with open(config['train_file_path'], 'r', encoding='utf8') as f:
    lines = f.readlines()
    for line in tqdm(lines, desc='Counting tokens', total=len(lines)):
      labels = line.split(',')[0].strip()
      content = line.split(',')[-1].strip()
      words_cut = list(jieba.cut(labels)) + list(jieba.cut(content))
      token_counter.update(words_cut)

  vocab = set(token for token, _ in token_counter.most_common(config['vocab_size']))
  return vocab

In [None]:
vocab = get_vocab(config)

Counting tokens:   0%|          | 0/50001 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.047 seconds.
Prefix dict has been built successfully.
Counting tokens: 100%|██████████| 50001/50001 [00:17<00:00, 2894.10it/s]


In [None]:
import bz2
def get_embedding(vocab):
  token2embedding = {}
  
  with bz2.open('/content/drive/MyDrive/sgns.weibo.word.bz2') as f:
    token_vector = f.readlines()

    meta_info = token_vector[0].split()
    print(f'{meta_info[0]} tokens in vectors file in total, vector size is {meta_info[1]}')

    for line in tqdm(token_vector[1:]):
      line = line.split()
      token = line[0].decode('utf8')
      vector = line[1:]


      if (token in vocab) and (token not in stop_words):
        token2embedding[token] = [float(num) for num in vector] # 转换数据类型     

    token2id = {token: idx for idx, token in enumerate(token2embedding.keys(),4)}
    id2embedding = {token2id[token]: embedding for token, embedding in token2embedding.items()}

    PAD, UNK, BOS, EOS = '<pad>', '<unk>', '<bos>', '<eos>'
    token2id[PAD] = 0
    token2id[UNK] = 1
    token2id[BOS] = 2
    token2id[EOS] = 3

    id2embedding[0] = [.0] * int(meta_info[1])
    id2embedding[1] = [.0] * int(meta_info[1])
    id2embedding[2] = np.random.random(int(meta_info[1])).tolist()
    id2embedding[3] = np.random.random(int(meta_info[1])).tolist()

    emb_mat = [id2embedding[idx] for idx in range(len(id2embedding))] 

    return torch.tensor(emb_mat, dtype=torch.float), token2id, len(vocab)+4

In [None]:
emb_mat, token2id, config['vocab_size'] = get_embedding(vocab)

b'195202' tokens in vectors file in total, vector size is b'300'


100%|██████████| 195202/195202 [00:04<00:00, 40224.06it/s]


In [None]:
import pandas as pd
from collections import defaultdict
def read_data(config, tokenizer, mode='train'):
  data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')

  # data_df = data_df.head(2000)

  if mode == 'train':
    train_df = defaultdict(list)
    val_df = defaultdict(list)

    num_val = int(config['train_val_ratio'] * len(data_df))
  
  else:
    test_df = defaultdict(list)

  for i, row in tqdm(data_df.iterrows(), desc=f'Preprocesing {mode} data', total=len(data_df)):
    label = row[0] if mode=='train' else 0
    sentence = row[-1]
    #-----------------------#
    inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True, return_attention_mask=True)
    outputs = tokenizer.encode(label,add_special_tokens=True)
    #-----------------------#

    if mode == 'train':
      if i < num_val:
        val_df['input_ids'].append(inputs['input_ids'])
        val_df['token_type_ids'].append(inputs['token_type_ids'])
        val_df['attention_mask'].append(inputs['attention_mask'])
        val_df['labels'].append(outputs)

      else:
        train_df['input_ids'].append(inputs['input_ids'])
        train_df['token_type_ids'].append(inputs['token_type_ids'])
        train_df['attention_mask'].append(inputs['attention_mask'])
        train_df['labels'].append(outputs)


    else:
      test_df['input_ids'].append(inputs['input_ids'])
      test_df['token_type_ids'].append(inputs['token_type_ids'])
      test_df['attention_mask'].append(inputs['attention_mask'])
      test_df['labels'].append(outputs)


  if mode == 'train':

    return train_df, val_df
  
  else:

    return test_df

In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(config['model_path'])

In [None]:
from torch.utils.data import Dataset
class TSDataset(Dataset):
  def __init__(self, data_dict):
    super(TSDataset, self).__init__()
    self.data_dict = data_dict

  def __getitem__(self, idx):

    data = (self.data_dict['input_ids'][idx],
            self.data_dict['token_type_ids'][idx],
            self.data_dict['attention_mask'][idx],
            self.data_dict['labels'][idx]
            )
    return data

  def __len__(self):
    return len(self.data_dict['input_ids'])


In [None]:
class Collator():
  def __init__(self, x_max_seq_len, y_max_seq_len, tokenizer):
    self.x_max_seq_len = x_max_seq_len
    self.y_max_seq_len = y_max_seq_len
    self.tokenizer = tokenizer

  def pad_and_truncate(self, input_ids_list, token_type_ids_list, attention_mask_list, labels_list, x_max_seq_len, y_max_seq_len):
    input_ids = torch.zeros((len(input_ids_list), x_max_seq_len), dtype=torch.long)
    token_type_ids = torch.zeros_like(input_ids)
    attention_mask = torch.zeros_like(input_ids)
    labels = torch.zeros((len(labels_list), y_max_seq_len), dtype=torch.long)

    for i in range(len(input_ids_list)):
      x_seq_len = len(input_ids_list[i])
      y_seq_len = len(labels_list[i])

      if x_seq_len <= x_max_seq_len:
        input_ids[i,:x_seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long)
        token_type_ids[i,:x_seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long)
        attention_mask[i,:x_seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long)

      else:
        input_ids[i] = torch.tensor(input_ids_list[i][:x_max_seq_len-1] + [self.tokenizer.sep_token_id], dtype=torch.long)
        token_type_ids[i] = torch.tensor(token_type_ids_list[i][:x_max_seq_len], dtype=torch.long)
        attention_mask[i] = torch.tensor(attention_mask_list[i][:x_max_seq_len], dtype=torch.long)

      if y_seq_len <= y_max_seq_len:
        labels[i,:y_seq_len] = torch.tensor(labels_list[i], dtype=torch.long)
      else:
        labels[i] = torch.tensor(labels_list[i][:y_max_seq_len-1] + [self.tokenizer.sep_token_id], dtype=torch.long)


    return input_ids, token_type_ids, attention_mask, labels

  def __call__(self, examples):
    input_ids_list, token_type_ids_list, attention_mask_list, labels_list = list(zip(*examples))
    cur_x_max_seq_len = max(len(input_id) for input_id in input_ids_list)
    cur_y_max_seq_len = max(len(label) for label in labels_list)
    x_max_seq_len = min(cur_x_max_seq_len, self.x_max_seq_len)
    y_max_seq_len = min(cur_y_max_seq_len, self.y_max_seq_len)

    input_ids, token_type_ids, attention_mask, labels = self.pad_and_truncate(input_ids_list, token_type_ids_list, attention_mask_list,labels_list, x_max_seq_len, y_max_seq_len)
    
    data_dict = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

    return data_dict
                        

In [None]:
collate_fn = Collator(config['x_max_seq_len'], config['y_max_seq_len'], tokenizer)

In [None]:
import collections
from torch.utils.data import DataLoader
def build_dataloader(config, tokenizer):
  train_df, val_df  = read_data(config, tokenizer, mode='train')

  train_dataset = TSDataset(train_df)
  val_dataset = TSDataset(val_df)

  train_dataloader = DataLoader(dataset=train_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=True, collate_fn=collate_fn)
  val_dataloader = DataLoader(dataset=val_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=False, collate_fn=collate_fn)
 
  return train_dataloader, val_dataloader

In [None]:
train_dataloader, val_dataloader= build_dataloader(config, tokenizer)

Preprocesing train data: 100%|██████████| 50000/50000 [09:03<00:00, 92.02it/s]
  cpuset_checked))


In [None]:
model_config = {
    'embedding_pretrained': emb_mat,
    'freeze_emb': True,
    'hidden_size': 768,
    'dropout': 0.3,
    'num_layers': 4,
    'rnn_type': 'lstm',   
    'emb_size': emb_mat.shape[1],
    'vocab_size': emb_mat.shape[0]
    
}

In [None]:
from transformers import BertModel, BertPreTrainedModel, BertConfig
import torch.nn as nn
class BertAsEncoder(BertPreTrainedModel):
  def __init__(self, config, model_path):
    super(BertAsEncoder, self).__init__(config)
    
    self.bert_config = config

    self.bert = BertModel.from_pretrained(model_path, config=self.bert_config)

    

  def forward(self, input_ids, token_type_ids, attention_mask):
    outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, 
                        attention_mask=attention_mask, return_dict=False)

    
    # pooler_output [bs, hidden_size]
    pooler_output = outputs[1]

    return pooler_output


In [None]:
class Decoder(nn.Module):
  def __init__(self, config):
    super().__init__()
    rnn_mapping = {'lstm': nn.LSTM, 'gru': nn.GRU}
    self.embedding = nn.Embedding.from_pretrained(config['embedding_pretrained'], freeze=config['freeze_emb'])
    self.rnns = rnn_mapping[config['rnn_type']](input_size=config['emb_size'], hidden_size=config['hidden_size'], num_layers=config['num_layers'], batch_first=True)
    self.drop = nn.Dropout(config['dropout'])
    self.fc = nn.Linear(config['hidden_size'], config['vocab_size'])


  def forward(self, x, hidden, cell):
    # x [bs] -> [bs, 1]
    # hidden [num_layers, bs, hidden_size_out]
    # cell [num_layers, bs, hidden_size_out]
    x = x.unsqueeze(1)
    
    # embedded_x [bs, 1, d]
    embedded_x = self.drop(self.embedding(x))
    
    
    # outputs [bs, 1, hidden_size_out]
    # hidden [num_layers, bs, hidden_size_out]
    # cell [num_layers, bs, hidden_size_out]
    outputs, (hidden, cell) = self.rnns(embedded_x, (hidden, cell))

    # outputs [bs, 1, hidden_size_out] -> [bs, hidden_size_out]
    # pred [bs, vocab_size]
    pred = self.fc(outputs.squeeze(1))

    return pred, hidden, cell


In [None]:
from torch._C import TensorType
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, config, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.config = config
        self.device = device
        
    def forward(self, input_ids, token_type_ids, attention_mask, labels, teacher_forcing_ratio = 0.5):
        
        #input_ids = [batch size, x_seq_len]
        #labels = [batch size, y_seq_len]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = labels.shape[0]
        label_len = labels.shape[1]
        vocab_size = self.config['vocab_size']
        
        #tensor to store decoder outputs
        #outputs [ y_seq_len, batch size, vocab_size]
        outputs = torch.zeros(label_len, batch_size, vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        # hidden [bs, hidden_size_out]->[num_layers, bs, hidden_size_out]
        # cell [num_layers, bs, hidden_size_out]
        pooler_output = self.encoder(input_ids, token_type_ids, attention_mask)
        # pooler_output = pooler_output.pooler_output
        
        # hidden = torch.zeros(self.config['num_layers'], pooler_output.shape[0], pooler_output.shape[1]).to(self.device)
        # for i in range(self.config['num_layers']):
        #   hidden[i] = pooler_output
        ones = torch.ones(self.config['num_layers'], pooler_output.shape[0], pooler_output.shape[1]).to(self.device)
        hidden = ones * pooler_output

        cell = torch.zeros_like(hidden)
        
        #first input to the decoder is the <bos> tokens
        # input [bs]
        input = labels[:,0]
        
        for t in range(1, label_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            # output [bs, vocab_size]
            # hidden [num_layers, bs, hidden_size_out]
            # cell [num_layers, bs, hidden_size_out]
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            #top1 [bs]
            top1 = output.argmax(dim=-1)
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = labels[:, t] if teacher_force else top1
            # print(input.shape)

        out = (outputs, )

        if label_len > 1:
          loss_fct = nn.CrossEntropyLoss()
          loss = loss_fct(outputs[1:].view(-1, vocab_size), labels.transpose(0,1).contiguous()[1:].view(-1))
          out = (loss, ) + out
        
        return out

In [None]:
bert_config = BertConfig.from_pretrained(config['model_path'])
enc = BertAsEncoder(bert_config, config['model_path'])
dec = Decoder(model_config)
model = Seq2Seq(enc, dec, model_config, config['device'])

Some weights of the model checkpoint at /content/drive/MyDrive/代码实战/头条新闻分类/BERT_model were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from sklearn.metrics import f1_score
def evaluation(model, config, val_dataloader):
  model.eval()
  preds = []
  labels = []
  val_loss = 0.
  val_iterator = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))

  with torch.no_grad():
    for batch in val_iterator:
      labels.append(batch['labels'])
      batch = {item: value.to(config['device']) for item, value in batch.items()}

      loss, logits = model(**batch)[:2]
      val_loss += loss.item()
      
      # preds.append(logits.argmax(dim=-1).detach().cpu())

  avg_val_loss = val_loss/len(val_dataloader)
  return avg_val_loss


In [None]:
from torch.optim import AdamW
from tqdm import trange
def train(model, config, train_dataloader, val_dataloader):
  
  model.to(config['device'])
  optimizer = AdamW(model.parameters(), lr=config['learning_rate'])
  epoches_iterator = trange(config['num_epoches'])

  global_steps = 0
  train_loss = 0.
  logging_loss = 0.

  for epoch in epoches_iterator:
    train_iterator = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
    model.train()
    
    for batch in train_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}

      loss = model(**batch)[0]

      model.zero_grad()
      loss.backward()
      optimizer.step()

      train_loss += loss
      global_steps +=1

      if global_steps % config['logging_step'] == 0:
        print_train_loss = (train_loss - logging_loss)/ config['logging_step'] 
        logging_loss = train_loss

        avg_val_loss = evaluation(model, config, val_dataloader)
        print(avg_val_loss)
        model.train()

  return model

In [None]:
best_model = train(model, config, train_dataloader, val_dataloader)

  0%|          | 0/3 [00:00<?, ?it/s]
  cpuset_checked))

Training:   0%|          | 1/5625 [00:02<3:55:13,  2.51s/it][A
Training:   0%|          | 2/5625 [00:04<3:32:46,  2.27s/it][A
Training:   0%|          | 3/5625 [00:06<3:25:48,  2.20s/it][A
Training:   0%|          | 4/5625 [00:08<3:20:44,  2.14s/it][A
Training:   0%|          | 5/5625 [00:10<3:15:59,  2.09s/it][A
Training:   0%|          | 6/5625 [00:12<3:13:45,  2.07s/it][A
Training:   0%|          | 7/5625 [00:14<3:13:32,  2.07s/it][A
Training:   0%|          | 8/5625 [00:16<3:10:52,  2.04s/it][A
Training:   0%|          | 9/5625 [00:18<3:09:21,  2.02s/it][A
Training:   0%|          | 10/5625 [00:20<3:09:11,  2.02s/it][A
Training:   0%|          | 11/5625 [00:22<3:08:05,  2.01s/it][A
Training:   0%|          | 12/5625 [00:24<3:09:21,  2.02s/it][A
Training:   0%|          | 13/5625 [00:26<3:09:48,  2.03s/it][A
Training:   0%|          | 14/5625 [00:28<3:08:31,  2.02s/it][A
Training:   0%|          | 15/5625 [00:30

5.612764175415039



Training:   5%|▌         | 301/5625 [17:33<136:05:24, 92.02s/it] [A
Training:   5%|▌         | 302/5625 [17:35<96:08:13, 65.02s/it] [A
Training:   5%|▌         | 303/5625 [17:37<68:12:53, 46.14s/it][A
Training:   5%|▌         | 304/5625 [17:39<48:38:23, 32.91s/it][A
Training:   5%|▌         | 305/5625 [17:41<34:57:45, 23.66s/it][A
Training:   5%|▌         | 306/5625 [17:43<25:23:49, 17.19s/it][A
Training:   5%|▌         | 307/5625 [17:45<18:40:11, 12.64s/it][A
Training:   5%|▌         | 308/5625 [17:48<14:00:37,  9.49s/it][A
Training:   5%|▌         | 309/5625 [17:50<10:41:53,  7.24s/it][A
Training:   6%|▌         | 310/5625 [17:52<8:23:58,  5.69s/it] [A
Training:   6%|▌         | 311/5625 [17:54<6:49:24,  4.62s/it][A
Training:   6%|▌         | 312/5625 [17:56<5:42:02,  3.86s/it][A
Training:   6%|▌         | 313/5625 [17:58<4:56:26,  3.35s/it][A
Training:   6%|▌         | 314/5625 [18:00<4:21:07,  2.95s/it][A
Training:   6%|▌         | 315/5625 [18:02<3:58:29,  2.69s/it]

KeyboardInterrupt: ignored