In [None]:
import torch
import torch.nn as nn

# Global experiment settings shared by the data pipeline and the training loop.
config = {
    # CSV read by get_vocab() and by read_data(mode='train').
    'train_file_path': '/content/drive/MyDrive/data/train.csv',
    # Fraction of the rows (taken from the top of the file) held out for validation.
    'train_val_ratio': 0.1,
    # Upper bounds on the source / target sequence lengths handed to the Collator.
    'x_max_seq_len': 4000,
    'y_max_seq_len': 64,
    # Number of most frequent tokens kept by get_vocab(); this entry is later
    # overwritten with the value returned by get_embedding().
    'vocab_size': 30000,
    # Batch size used for both the training and the validation DataLoader.
    'batch_size': 16,
    # Number of passes over the training data.
    'num_epoches': 3,
    # Learning rate passed to AdamW.
    'learning_rate': 1e-3,
    # Run validation every `logging_step` optimizer steps.
    'logging_step': 300,
    # Seed passed to seed_everything().
    'seed': 2021
}

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu' 

import random
import numpy as np

def seed_everything(seed):
  """Seed every random number generator used in this notebook.

  Seeds, in order: Python's `random`, NumPy's global generator, PyTorch's
  CPU generator and the generators of all CUDA devices.

  Args:
    seed: integer seed shared by all generators.
  """
  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for apply_seed in seeders:
    apply_seed(seed)

seed_everything(config['seed'])


In [None]:
from collections import Counter
from tqdm import tqdm
import jieba

def get_vocab(config):
  """Collect the most frequent jieba tokens of the training file.

  Every line of `config['train_file_path']` is split on ','; the first field
  is treated as the label text and the last field as the content text. Both
  are segmented with jieba and their tokens are counted.

  Args:
    config: dict providing 'train_file_path' and 'vocab_size'.

  Returns:
    A set holding the `config['vocab_size']` most common tokens.
  """
  with open(config['train_file_path'], 'r', encoding='utf8') as f:
    lines = f.readlines()

  token_counter = Counter()
  for line in tqdm(lines, desc='Counting tokens', total=len(lines)):
    fields = line.split(',')
    labels, content = fields[0].strip(), fields[-1].strip()
    # Count label tokens first, then content tokens.
    token_counter.update(jieba.cut(labels))
    token_counter.update(jieba.cut(content))

  most_common = token_counter.most_common(config['vocab_size'])
  return {token for token, _ in most_common}


In [None]:
vocab = get_vocab(config)

Counting tokens:   0%|          | 0/50001 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.042 seconds.
Prefix dict has been built successfully.
Counting tokens: 100%|██████████| 50001/50001 [00:19<00:00, 2628.69it/s]


In [None]:
import bz2
def get_embedding(vocab, embedding_path='/content/drive/MyDrive/sgns.weibo.word.bz2', stop_words=None):
  """Build the embedding matrix and the token -> id mapping from pretrained vectors.

  The vectors file is a bz2-compressed text file whose first line is
  "<num_tokens> <vector_size>" and whose remaining lines are
  "<token> <v1> ... <vN>". Only tokens that are in `vocab` and not in
  `stop_words` are kept. Ids 0-3 are reserved for '<pad>', '<unk>', '<bos>'
  and '<eos>'; kept tokens receive ids from 4 upwards in file order.

  Args:
    vocab: container of tokens (supports `in`) that are allowed in the model.
    embedding_path: path of the bz2-compressed vectors file.
    stop_words: container of tokens to exclude. When None, a notebook-level
      `stop_words` variable is used if one exists, otherwise nothing is excluded.

  Returns:
    A tuple `(emb_mat, token2id, vocab_size)` where `emb_mat` is a float
    tensor with one row per id, `token2id` maps tokens to row indices and
    `vocab_size` is the number of rows of `emb_mat`.
  """
  if stop_words is None:
    # The name was previously read as an undefined global, which raised
    # NameError on a fresh kernel; fall back to "no stop words" in that case.
    stop_words = globals().get('stop_words', frozenset())

  token2embedding = {}

  with bz2.open(embedding_path) as f:
    # bz2.open defaults to binary mode, so the header fields are bytes.
    meta_info = f.readline().split()
    num_tokens, emb_size = int(meta_info[0]), int(meta_info[1])
    print(f'{num_tokens} tokens in vectors file in total, vector size is {emb_size}')

    # Stream the file instead of materialising every line in memory.
    for line in tqdm(f, total=num_tokens):
      fields = line.split()
      # Skip blank or malformed lines so the matrix stays rectangular.
      if len(fields) != emb_size + 1:
        continue

      token = fields[0].decode('utf8')
      if (token in vocab) and (token not in stop_words):
        # Convert the byte strings to Python floats.
        token2embedding[token] = [float(num) for num in fields[1:]]

  PAD, UNK, BOS, EOS = '<pad>', '<unk>', '<bos>', '<eos>'
  special_tokens = (PAD, UNK, BOS, EOS)

  token2id = {token: idx for idx, token in enumerate(token2embedding, len(special_tokens))}
  for idx, token in enumerate(special_tokens):
    token2id[token] = idx

  # <pad> and <unk> are zero vectors; <bos> and <eos> are random vectors.
  rows = [
      [.0] * emb_size,
      [.0] * emb_size,
      np.random.random(emb_size).tolist(),
      np.random.random(emb_size).tolist(),
  ]
  rows.extend(token2embedding.values())
  emb_mat = torch.tensor(rows, dtype=torch.float)

  # Report the real number of ids: tokens missing from the vectors file or
  # filtered as stop words are not part of the matrix.
  return emb_mat, token2id, emb_mat.shape[0]

In [None]:
emb_mat, token2id, config['vocab_size'] = get_embedding(vocab)

b'195202' tokens in vectors file in total, vector size is b'300'


100%|██████████| 195202/195202 [00:05<00:00, 35009.64it/s]


In [None]:
def tokenizer(sent, token2id):
  """Segment `sent` with jieba and map every token to its id.

  Args:
    sent: text to segment.
    token2id: dict mapping tokens to integer ids.

  Returns:
    List of ids; tokens missing from `token2id` are mapped to 1.
  """
  unk_id = 1  # fallback id so out-of-vocabulary tokens do not raise KeyError
  return [token2id.get(token, unk_id) for token in jieba.cut(sent)]

In [None]:
import pandas as pd
from collections import defaultdict
def read_data(config, token2id, mode='train'):
  """Read a CSV file and convert its text columns to lists of token ids.

  The first column is used as the label text and the last column as the
  input text. In 'train' mode the first `train_val_ratio` fraction of the
  rows (by position) becomes the validation split and the rest the training
  split. In any other mode there are no labels, so every example gets an
  empty label id list.

  Args:
    config: dict providing f'{mode}_file_path' and, in 'train' mode,
      'train_val_ratio'.
    token2id: dict mapping tokens to integer ids.
    mode: 'train' to get a train/validation pair, anything else for a
      single unlabeled split.

  Returns:
    `(train_df, val_df)` in 'train' mode, otherwise `test_df`. Each is a
    defaultdict(list) with the keys 'input_ids' and 'labels'.
  """
  data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')
  is_train = mode == 'train'

  # Select columns by position; integer keys on a label-indexed row are
  # deprecated in pandas.
  sentences = data_df.iloc[:, -1].tolist()
  if is_train:
    labels = data_df.iloc[:, 0].tolist()
    num_val = int(config['train_val_ratio'] * len(data_df))
  else:
    labels = [None] * len(sentences)
    num_val = 0

  train_df = defaultdict(list)
  val_df = defaultdict(list)
  test_df = defaultdict(list)

  rows = tqdm(zip(labels, sentences), desc=f'Preprocesing {mode} data', total=len(sentences))
  # `i` is the row position, so the split does not depend on the index labels.
  for i, (label, sentence) in enumerate(rows):
    inputs = tokenizer(sentence, token2id)
    # Unlabeled data has nothing to tokenize; the old integer placeholder
    # could not be segmented by the tokenizer.
    outputs = tokenizer(label, token2id) if is_train else []

    if not is_train:
      target = test_df
    elif i < num_val:
      target = val_df
    else:
      target = train_df

    target['input_ids'].append(inputs)
    target['labels'].append(outputs)

  if is_train:
    return train_df, val_df

  return test_df

In [None]:
train_df, val_df  = read_data(config, token2id, mode='train')

Preprocesing train data: 100%|██████████| 50000/50000 [05:01<00:00, 165.70it/s]


In [None]:
class Collator():
  """Batch collate function: adds <bos>/<eos>, truncates and pads id sequences.

  Args:
    x_max_seq_len: maximum length of an input row, including <bos> and <eos>.
    y_max_seq_len: maximum length of a label row, including <bos> and <eos>.
    token2id: dict providing the ids of '<bos>' and '<eos>' (and optionally
      '<pad>', which defaults to 0).

  Raises:
    ValueError: if a maximum length cannot hold <bos> and <eos>.
  """

  def __init__(self, x_max_seq_len, y_max_seq_len, token2id):
    if x_max_seq_len < 2 or y_max_seq_len < 2:
      raise ValueError('max sequence lengths must be >= 2 to hold <bos> and <eos>')
    self.x_max_seq_len = x_max_seq_len
    self.y_max_seq_len = y_max_seq_len
    self.token2id = token2id

  def _wrap(self, ids, max_seq_len):
    """Return [<bos>] + ids + [<eos>], cutting `ids` so the result fits `max_seq_len`."""
    keep = max(max_seq_len - 2, 0)
    return [self.token2id['<bos>']] + list(ids)[:keep] + [self.token2id['<eos>']]

  def _pad_batch(self, ids_list, max_seq_len):
    """Stack wrapped sequences into a [len(ids_list), max_seq_len] long tensor."""
    pad_id = self.token2id.get('<pad>', 0)
    batch = torch.full((len(ids_list), max_seq_len), pad_id, dtype=torch.long)
    for i, ids in enumerate(ids_list):
      wrapped = self._wrap(ids, max_seq_len)
      batch[i, :len(wrapped)] = torch.tensor(wrapped, dtype=torch.long)
    return batch

  def pad_and_truncate(self, input_ids_list, labels_list, x_max_seq_len, y_max_seq_len):
    """Wrap, truncate and right-pad both sides of a batch.

    Args:
      input_ids_list: sequence of input id lists.
      labels_list: sequence of label id lists.
      x_max_seq_len: width of the returned input tensor.
      y_max_seq_len: width of the returned label tensor.

    Returns:
      `(input_ids, labels)` long tensors of shape [batch, x_max_seq_len]
      and [batch, y_max_seq_len].
    """
    input_ids = self._pad_batch(input_ids_list, x_max_seq_len)
    labels = self._pad_batch(labels_list, y_max_seq_len)
    return input_ids, labels

  def __call__(self, examples):
    """Collate `(input_ids, labels)` pairs into a dict of padded tensors."""
    input_ids_list, labels_list = list(zip(*examples))
    # Reserve two extra positions for <bos> and <eos>; without them the
    # longest sequence of every batch would lose its last two tokens.
    cur_x_max_seq_len = max(len(input_id) for input_id in input_ids_list) + 2
    cur_y_max_seq_len = max(len(label) for label in labels_list) + 2
    x_max_seq_len = min(cur_x_max_seq_len, self.x_max_seq_len)
    y_max_seq_len = min(cur_y_max_seq_len, self.y_max_seq_len)

    input_ids, labels = self.pad_and_truncate(input_ids_list, labels_list, x_max_seq_len, y_max_seq_len)

    data_dict = {
        'input_ids': input_ids,
        'labels': labels
    }

    return data_dict
                        

In [None]:
collate_fn = Collator(config['x_max_seq_len'], config['y_max_seq_len'], token2id)

In [None]:
import collections
from torch.utils.data import DataLoader
def build_dataloader(config):
  """Create the training and validation DataLoaders.

  Reads and tokenizes the training file with read_data(), wraps both splits
  in TSDataset and batches them with the notebook-level `collate_fn`. Relies
  on the globals `token2id` and `collate_fn`.
  NOTE(review): TSDataset is not defined in the visible cells - confirm it is
  available before this cell runs.

  Args:
    config: dict providing 'batch_size' plus the keys read_data() needs.

  Returns:
    `(train_dataloader, val_dataloader)`; only the training loader shuffles.
  """
  train_split, val_split = read_data(config, token2id, mode='train')

  train_dataset = TSDataset(train_split)
  val_dataset = TSDataset(val_split)

  # Settings shared by both loaders.
  shared_kwargs = dict(batch_size=config['batch_size'], num_workers=4, collate_fn=collate_fn)

  train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, **shared_kwargs)
  val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, **shared_kwargs)

  return train_dataloader, val_dataloader

In [None]:
train_dataloader, val_dataloader= build_dataloader(config)

Preprocesing train data: 100%|██████████| 50000/50000 [05:07<00:00, 162.51it/s]
  cpuset_checked))


In [None]:
# Hyper-parameters consumed by Encoder, Decoder and Seq2Seq.
model_config = {
    # Pretrained embedding matrix loaded into nn.Embedding.from_pretrained.
    'embedding_pretrained': emb_mat,
    # When True the embedding weights are not updated during training.
    'freeze_emb': True,
    # Hidden size of the recurrent layers (and input size of the decoder's output layer).
    'hidden_size': 512,
    # Dropout probability applied to the embedded tokens.
    'dropout': 0.3,
    # Number of stacked recurrent layers.
    'num_layers': 4,
    # Key selecting the recurrent module: 'lstm' or 'gru'.
    'rnn_type': 'lstm',   
    # Embedding dimension, taken from the matrix's second axis.
    'emb_size': emb_mat.shape[1],
    # Number of ids, taken from the matrix's first axis.
    'vocab_size': emb_mat.shape[0]
    
}

In [None]:
import torch.nn as nn
class Encoder(nn.Module):
  """Recurrent encoder that turns a batch of token ids into a final RNN state.

  Args:
    config: dict providing 'embedding_pretrained', 'freeze_emb', 'rnn_type'
      ('lstm' or 'gru'), 'emb_size', 'hidden_size', 'num_layers' and 'dropout'.
  """

  def __init__(self, config):
    super().__init__()
    rnn_mapping = {'lstm': nn.LSTM, 'gru': nn.GRU}
    self.embedding = nn.Embedding.from_pretrained(config['embedding_pretrained'], freeze=config['freeze_emb'])
    self.rnns = rnn_mapping[config['rnn_type']](input_size=config['emb_size'], hidden_size=config['hidden_size'], num_layers=config['num_layers'], batch_first=True)
    self.drop = nn.Dropout(config['dropout'])

  def forward(self, x):
    """Encode `x` and return the final recurrent state.

    Args:
      x: long tensor of token ids, [bs, seq_len].

    Returns:
      `(hidden, cell)`, each [num_layers, bs, hidden_size]. A GRU has no
      cell state, so `cell` is None when rnn_type is 'gru'.
    """
    # embedded_x [bs, seq_len, emb_size]
    embedded_x = self.drop(self.embedding(x))

    # nn.LSTM returns (outputs, (h_n, c_n)) while nn.GRU returns (outputs, h_n);
    # unpacking a GRU state as a pair would split the tensor along the layer axis.
    _, state = self.rnns(embedded_x)
    if isinstance(state, tuple):
      hidden, cell = state
    else:
      hidden, cell = state, None

    return hidden, cell



In [None]:
class Decoder(nn.Module):
  """Recurrent decoder that predicts the next token from one input token.

  Args:
    config: dict providing 'embedding_pretrained', 'freeze_emb', 'rnn_type'
      ('lstm' or 'gru'), 'emb_size', 'hidden_size', 'num_layers', 'dropout'
      and 'vocab_size'.
  """

  def __init__(self, config):
    super().__init__()
    rnn_mapping = {'lstm': nn.LSTM, 'gru': nn.GRU}
    self.embedding = nn.Embedding.from_pretrained(config['embedding_pretrained'], freeze=config['freeze_emb'])
    self.rnns = rnn_mapping[config['rnn_type']](input_size=config['emb_size'], hidden_size=config['hidden_size'], num_layers=config['num_layers'], batch_first=True)
    self.drop = nn.Dropout(config['dropout'])
    self.fc = nn.Linear(config['hidden_size'], config['vocab_size'])

  def forward(self, x, hidden, cell):
    """Run a single decoding step.

    Args:
      x: long tensor with the current input token of every sequence, [bs].
      hidden: previous hidden state, [num_layers, bs, hidden_size].
      cell: previous cell state, [num_layers, bs, hidden_size]. Ignored
        (and returned unchanged) when rnn_type is 'gru'.

    Returns:
      `(pred, hidden, cell)` where `pred` holds the vocabulary logits,
      [bs, vocab_size], and `hidden`/`cell` are the updated states.
    """
    # x [bs] -> [bs, 1] so the RNN sees a sequence of length one.
    x = x.unsqueeze(1)

    # embedded_x [bs, 1, emb_size]
    embedded_x = self.drop(self.embedding(x))

    # outputs [bs, 1, hidden_size]
    if isinstance(self.rnns, nn.LSTM):
      outputs, (hidden, cell) = self.rnns(embedded_x, (hidden, cell))
    else:
      # nn.GRU takes and returns a single hidden tensor, not a (h, c) pair.
      outputs, hidden = self.rnns(embedded_x, hidden)

    # outputs [bs, 1, hidden_size] -> [bs, hidden_size] -> pred [bs, vocab_size]
    pred = self.fc(outputs.squeeze(1))

    return pred, hidden, cell


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: module mapping input ids [bs, x_seq_len] to `(hidden, cell)`.
        decoder: module mapping `(token [bs], hidden, cell)` to
            `(logits [bs, vocab_size], hidden, cell)`.
        config: dict providing 'vocab_size'; an optional 'pad_token_id'
            (default 0) names the label id excluded from the loss.
        device: kept as an attribute for callers; tensors created in
            `forward` follow the device of `labels`.
    """

    def __init__(self, encoder, decoder, config, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.config = config
        self.device = device

    def forward(self, input_ids, labels, teacher_forcing_ratio = 0.5):
        """Decode `labels.shape[1] - 1` steps and compute the training loss.

        Args:
            input_ids: long tensor [bs, x_seq_len].
            labels: long tensor [bs, y_seq_len] whose first column is the
                start token fed to the decoder.
            teacher_forcing_ratio: probability, drawn once per step, of
                feeding the ground-truth token instead of the prediction.

        Returns:
            `(loss, outputs)` when y_seq_len > 1, otherwise `(outputs,)`.
            `outputs` is [y_seq_len, bs, vocab_size]; row 0 is never written
            and stays zero.
        """
        batch_size, label_len = labels.shape
        vocab_size = self.config['vocab_size']

        # Allocate on the device the batch lives on rather than on a stored name.
        outputs = torch.zeros(label_len, batch_size, vocab_size, device=labels.device)

        # The final encoder state initialises the decoder state.
        hidden, cell = self.encoder(input_ids)

        # The first decoder input is the start token of every sequence, [bs].
        decoder_input = labels[:, 0]

        for t in range(1, label_len):
            # output [bs, vocab_size]
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = output

            # Decide per step whether to feed the ground truth or the prediction.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(dim=-1)
            decoder_input = labels[:, t] if teacher_force else top1

        out = (outputs, )

        if label_len > 1:
            # Padding positions carry no target, so they are excluded from the
            # mean instead of training the model to emit <pad>.
            pad_token_id = self.config.get('pad_token_id', 0)
            loss_fct = nn.CrossEntropyLoss(ignore_index=pad_token_id)
            # Skip position 0 (the start token) on both sides; both views are
            # flattened time-major so logits and targets stay aligned.
            logits = outputs[1:].reshape(-1, vocab_size)
            targets = labels.transpose(0, 1)[1:].reshape(-1)
            loss = loss_fct(logits, targets)
            out = (loss, ) + out

        return out

In [None]:
enc = Encoder(model_config)
dec = Decoder(model_config)
model = Seq2Seq(enc, dec, model_config, config['device'])

In [None]:
from sklearn.metrics import f1_score
def evaluation(model, config, val_dataloader):
  """Compute the average validation loss of `model`.

  The model is switched to eval mode and left in it; callers that keep
  training must call `model.train()` afterwards.

  Args:
    model: module whose forward accepts the batch keys plus
      `teacher_forcing_ratio` and returns a tuple starting with the loss.
    config: dict providing 'device'.
    val_dataloader: iterable of dict batches of tensors.

  Returns:
    Mean loss over all validation batches, or NaN if there were none.
  """
  model.eval()
  val_loss = 0.
  num_batches = 0
  val_iterator = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))

  with torch.no_grad():
    for batch in val_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}

      # Disable teacher forcing so the score reflects free-running decoding
      # and does not depend on random draws.
      loss = model(**batch, teacher_forcing_ratio=0.0)[0]
      val_loss += loss.item()
      num_batches += 1

  if num_batches == 0:
    # No batches means there is nothing to average.
    return float('nan')

  return val_loss / num_batches


In [None]:
from torch.optim import AdamW
from tqdm import trange
def train(model, config, train_dataloader, val_dataloader):
  """Train `model` with AdamW and report losses periodically.

  Every `config['logging_step']` optimizer steps the average training loss
  since the previous report and the current validation loss are printed.

  Args:
    model: module whose forward accepts the batch keys and returns a tuple
      starting with the loss.
    config: dict providing 'device', 'learning_rate', 'num_epoches' and
      'logging_step'.
    train_dataloader: iterable of dict batches of tensors used for training.
    val_dataloader: loader passed to evaluation().

  Returns:
    The model after the last training step.
  """
  model.to(config['device'])
  optimizer = AdamW(model.parameters(), lr=config['learning_rate'])
  epoches_iterator = trange(config['num_epoches'])

  global_steps = 0
  train_loss = 0.
  logging_loss = 0.

  for epoch in epoches_iterator:
    train_iterator = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
    model.train()

    for batch in train_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}

      loss = model(**batch)[0]

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # Accumulate a Python float; adding the tensor itself would keep a
      # growing chain of autograd history alive across steps.
      train_loss += loss.item()
      global_steps += 1

      if global_steps % config['logging_step'] == 0:
        avg_train_loss = (train_loss - logging_loss) / config['logging_step']
        logging_loss = train_loss

        avg_val_loss = evaluation(model, config, val_dataloader)
        print(f'step {global_steps}: train loss {avg_train_loss:.4f}, val loss {avg_val_loss:.4f}')
        # evaluation() leaves the model in eval mode.
        model.train()

  return model

In [None]:
best_model = train(model, config, train_dataloader, val_dataloader)

  0%|          | 0/3 [00:00<?, ?it/s]
  cpuset_checked))

Training:   0%|          | 1/2813 [00:04<3:23:42,  4.35s/it][A
Training:   0%|          | 2/2813 [00:08<3:28:46,  4.46s/it][A
Training:   0%|          | 3/2813 [00:10<2:35:03,  3.31s/it][A
Training:   0%|          | 4/2813 [00:14<2:46:39,  3.56s/it][A
Training:   0%|          | 5/2813 [00:19<3:11:45,  4.10s/it][A
Training:   0%|          | 6/2813 [00:22<2:48:29,  3.60s/it][A
Training:   0%|          | 7/2813 [00:24<2:26:37,  3.14s/it][A
Training:   0%|          | 8/2813 [00:27<2:17:37,  2.94s/it][A
Training:   0%|          | 9/2813 [00:28<2:00:34,  2.58s/it][A
Training:   0%|          | 10/2813 [00:31<2:00:07,  2.57s/it][A
Training:   0%|          | 11/2813 [00:33<1:53:42,  2.43s/it][A
Training:   0%|          | 12/2813 [00:36<1:54:18,  2.45s/it][A
Training:   0%|          | 13/2813 [00:39<2:07:14,  2.73s/it][A
Training:   0%|          | 14/2813 [00:41<1:54:02,  2.44s/it][A
Training:   1%|          | 15/2813 [00:42

4.885181614385245



Training:  11%|█         | 301/2813 [18:52<50:39:09, 72.59s/it] [A
Training:  11%|█         | 302/2813 [18:53<35:49:37, 51.37s/it][A
Training:  11%|█         | 303/2813 [18:55<25:21:09, 36.36s/it][A
Training:  11%|█         | 304/2813 [18:58<18:18:59, 26.28s/it][A
Training:  11%|█         | 305/2813 [19:00<13:17:01, 19.07s/it][A
Training:  11%|█         | 306/2813 [19:01<9:33:28, 13.72s/it] [A
Training:  11%|█         | 307/2813 [19:05<7:34:32, 10.88s/it][A
Training:  11%|█         | 308/2813 [19:09<6:10:12,  8.87s/it][A
Training:  11%|█         | 309/2813 [19:13<5:07:02,  7.36s/it][A
Training:  11%|█         | 310/2813 [19:17<4:14:56,  6.11s/it][A
Training:  11%|█         | 311/2813 [19:18<3:22:41,  4.86s/it][A
Training:  11%|█         | 312/2813 [19:21<2:55:03,  4.20s/it][A
Training:  11%|█         | 313/2813 [19:25<2:52:16,  4.13s/it][A
Training:  11%|█         | 314/2813 [19:27<2:25:51,  3.50s/it][A
Training:  11%|█         | 315/2813 [19:31<2:30:27,  3.61s/it][A
Tr

KeyboardInterrupt: ignored