In [3]:
# NOTE(review): the version is not pinned. The tokenizer calls later in this
# notebook pass `truncation_strategy=` and `pad_to_max_length=` to encode_plus,
# so consider pinning the transformers release those cells were written against.
!pip install transformers



In [0]:
import os
# Project folder; the relative data/ and model paths used below are resolved
# against it. NOTE(review): looks like a Colab-mounted Google Drive path — confirm.
path = "/content/drive/My Drive/NLP/sentiment_compete"
# Make it the working directory for the rest of the notebook.
os.chdir(path)

In [0]:
import torch
import pandas as pd
import random
import numpy as np
from tqdm import tqdm

# Single seed shared by every random number generator seeded below.
SEED = 1234

# Seed Python's `random`, NumPy and PyTorch with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN algorithms.
torch.backends.cudnn.deterministic = True

In [0]:
# Input CSVs: the cleaned training set, plus the two files that split_data() writes
# (loaded below as the training and dev sets respectively).
train_file_path = 'data/train_weibo_clean.csv'
train09_file_path = 'data/train_weibo_09.csv'
dev01_file_path = 'data/train_weibo_01.csv'
# Pretrained checkpoint files: weights, model config and vocabulary.
bert_model_path = 'robert_base_chinese/pytorch_model.bin'
bert_config_path = 'robert_base_chinese/bert_config.json'
bert_vocab_path = 'robert_base_chinese/vocab.txt'
# Token length passed as max_length to the tokenizer in tokenize_data().
max_seq_len = 140
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
from transformers import BertTokenizer,BertConfig

In [7]:
# output_hidden_states=True is required: the classifier's forward() reads the
# per-layer hidden states from the BertModel output.
bert_config = BertConfig.from_pretrained(bert_config_path, output_hidden_states=True)
# Build the tokenizer from the vocabulary file. The library reports that passing a
# single file path to from_pretrained() is deprecated (see this cell's output).
tokenizer = BertTokenizer.from_pretrained(bert_vocab_path)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [0]:
def split_data(file_path, rate=0.9,
               train_out_path='data/train_weibo_09.csv',
               dev_out_path='data/train_weibo_01.csv'):
  """Shuffle a CSV and write it back out as a train part and a dev part.

  Rows are permuted with `np.random.permutation`, so the result depends on
  NumPy's global random state (seed it beforehand for a repeatable split).

  Args:
    file_path: CSV file to read.
    rate: fraction of rows (0..1) written to the train file; the remaining
      rows go to the dev file.
    train_out_path: destination CSV for the train part.
    dev_out_path: destination CSV for the dev part.

  Raises:
    ValueError: if `rate` is outside [0, 1].
  """
  if not 0.0 <= rate <= 1.0:
    raise ValueError('rate must be between 0 and 1, got {}'.format(rate))
  df = pd.read_csv(file_path)
  shuffled = df.iloc[np.random.permutation(len(df))]
  # Compute the cut point once so both slices are guaranteed to agree.
  split_at = int(len(shuffled) * rate)
  shuffled.iloc[:split_at].to_csv(train_out_path, index=False)
  shuffled.iloc[split_at:].to_csv(dev_out_path, index=False)

In [0]:
 def return_id(str1, truncation_strategy, length):
     """Encode `str1` with the notebook-level `tokenizer` and right-pad to `length`.

     Args:
         str1: text to encode.
         truncation_strategy: forwarded to `tokenizer.encode_plus`.
         length: used both as `max_length` for encoding and as the padded length.

     Returns:
         A list `[input_ids, input_masks, input_segments]`. The mask is 1 for
         encoded tokens and 0 for padding; ids are padded with
         `tokenizer.pad_token_id` and segments with 0.
     """
     encoded = tokenizer.encode_plus(
         str1,
         add_special_tokens=True,
         max_length=length,
         truncation_strategy=truncation_strategy,
     )
     token_ids = encoded["input_ids"]
     # A non-positive pad count yields empty padding lists (list * n for n <= 0).
     pad_count = length - len(token_ids)
     zero_padding = [0] * pad_count

     return [
         token_ids + [tokenizer.pad_token_id] * pad_count,
         [1] * len(token_ids) + zero_padding,
         encoded["token_type_ids"] + zero_padding,
     ]
        
def load_data(file_path, tokenizer, max_seq_len, device):
  """Read a labelled CSV and turn it into model inputs and class-index labels.

  Only rows whose '情感倾向' value is -1, 0 or 1 are kept. Labels are shifted
  by +1 so they become class indices 0, 1, 2.

  Args:
    file_path: CSV with the text column '微博中文内容' and label column '情感倾向'.
    tokenizer: tokenizer forwarded to tokenize_data().
    max_seq_len: token length forwarded to tokenize_data().
    device: torch device on which both returned tensors are created.

  Returns:
    (inputs, outputs): `inputs` is whatever tokenize_data() returns for the kept
    rows; `outputs` is a 1-D long tensor of class indices in the same row order.
  """
  df = pd.read_csv(file_path)
  # Coerce labels to numbers so the filter works whether pandas parsed the
  # column as int, float or str. Unparseable labels become NaN and are dropped.
  sentiment = pd.to_numeric(df['情感倾向'], errors='coerce')
  keep = sentiment.isin([-1, 0, 1])
  df = df.loc[keep, ['微博中文内容', '情感倾向']]
  inputs = tokenize_data(df, tokenizer, '微博中文内容', max_seq_len, device)
  # Convert to a plain array first: after filtering the Series index has gaps,
  # and torch.tensor should not depend on label-based Series lookups.
  labels = sentiment[keep].astype(int).to_numpy() + 1
  outputs = torch.tensor(labels, dtype=torch.long, device=device)
  return inputs, outputs

def tokenize_data(df, tokenizer, column, max_seq_len, device):
  """Encode every value of `df[column]` and stack the results into one tensor.

  Each value is converted with `str()` and encoded via `tokenizer.encode_plus`
  with special tokens, `max_length=max_seq_len`, "longest_first" truncation and
  padding to the maximum length.

  Returns:
    A tensor on `device` laid out as (n_rows, 3, length), where the middle axis
    holds input ids, attention mask and token type ids, in that order.
  """
  ids_rows, mask_rows, type_rows = [], [], []
  for text in tqdm(df[column]):
    encoded = tokenizer.encode_plus(text=str(text),
                    add_special_tokens=True,
                    max_length=max_seq_len,
                    truncation_strategy="longest_first",
                    return_attention_mask=True,
                    pad_to_max_length=True)
    ids_rows.append(encoded['input_ids'])
    mask_rows.append(encoded['attention_mask'])
    type_rows.append(encoded['token_type_ids'])
  # Built as (3, n_rows, length), then swapped so rows come first.
  stacked = torch.tensor(data=[ids_rows, mask_rows, type_rows], device=device)
  return stacked.permute(1, 0, 2)


In [10]:
# Tokenize the training split and build (inputs, labels) tensors on `device`.
train_data = load_data(train09_file_path, tokenizer, max_seq_len, device)

100%|██████████| 89604/89604 [00:44<00:00, 2006.08it/s]


In [11]:
# Same preprocessing for the held-out dev split.
dev_data = load_data(dev01_file_path, tokenizer, max_seq_len, device)

100%|██████████| 9956/9956 [00:04<00:00, 2008.87it/s]


In [0]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel


class RoBertCLSLAST3(nn.Module):
    """BERT encoder followed by a small MLP head that emits 3 class logits.

    The head input is the encoder's pooler output concatenated with the
    first-token (index 0) hidden state averaged over the last three entries of
    the encoder's `hidden_states`.
    """

    def __init__(self, bert_model_path, config, seq_len):
        """Build the encoder and the classification head.

        Args:
            bert_model_path: forwarded to `BertModel.from_pretrained`.
            config: encoder config; `config.hidden_size` sizes the head. It must
                make the encoder return per-layer hidden states, because
                forward() reads them.
            seq_len: accepted for interface compatibility; not used.
        """
        super(RoBertCLSLAST3, self).__init__()
        self.bert_model = BertModel.from_pretrained(bert_model_path, config=config)
        # Input is 2 * hidden_size: pooler output + averaged first-token state.
        self.predict_fc = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(config.hidden_size * 2, 512),
            nn.Tanh(),
            nn.Dropout(p=0.2),
            nn.Linear(512, 3),
        )
        self.predict_fc.apply(self.init_network)
        # Kept as an attribute for compatibility; forward() does not apply it.
        self.dropout = nn.Dropout(0.15)

    def forward(self, inputs):
        """Compute class logits.

        Args:
            inputs: tensor of shape (batch_size, 3, seq_max_len) whose middle
                axis holds input ids, attention mask and token type ids.

        Returns:
            Tensor of shape (batch_size, 3) with unnormalised class scores.
        """
        input_ids = inputs[:, 0]       # (batch_size, seq_max_len)
        attention_mask = inputs[:, 1]  # (batch_size, seq_max_len)
        token_type_ids = inputs[:, 2]  # (batch_size, seq_max_len)
        bert_outputs = self.bert_model(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids)
        # Index by position instead of tuple-unpacking: unpacking a dict-like
        # model output iterates over its field names rather than its tensors,
        # while integer indexing works for both plain tuples and output objects.
        pooler_output = bert_outputs[1]
        hidden_states = bert_outputs[2]
        # get_mean() offers masked mean pooling over tokens as an alternative
        # feature; it is not used here.
        mean_cls = (hidden_states[-1][:, 0]
                    + hidden_states[-2][:, 0]
                    + hidden_states[-3][:, 0]) / 3.0
        concat_input = torch.cat([pooler_output, mean_cls], dim=1)
        return self.predict_fc(concat_input)

    def get_mean(self, attention_mask_pad, hidden_state, seq_lens):
        """Masked mean of `hidden_state` over the sequence axis.

        Args:
            attention_mask_pad: mask of shape (batch_size, 1, seq_len).
            hidden_state: tensor of shape (batch_size, seq_len, hidden_size).
            seq_lens: per-example divisor of shape (batch_size, 1).

        Returns:
            Tensor of shape (batch_size, 1, hidden_size).
        """
        # Multiply in (batch_size, hidden_size, seq_len) layout so the mask broadcasts.
        hidden_state_real = hidden_state.permute(0, 2, 1) * attention_mask_pad
        hidden_state_real = hidden_state_real.permute(0, 2, 1)  # (batch_size, seq_len, hidden_size)
        hidden_state_sum = torch.sum(hidden_state_real, dim=1)  # (batch_size, hidden_size)
        return self.div_with_small_value(hidden_state_sum, seq_lens).unsqueeze(1)

    def div_with_small_value(self, a, b, eps=1.0):
        """Return `a / b`, with every entry of `b` that is <= `eps` replaced by 1.0."""
        b = b * (b > eps).float() + (b <= eps).float() * 1.0
        return a / b

    def init_network(self, module):
        """Xavier-initialise the weight and zero the bias of each `nn.Linear`.

        Also prints the module's class name for every layer it initialises.
        """
        if isinstance(module, nn.Linear):
            print(module.__class__.__name__)
            nn.init.xavier_uniform_(module.weight.data)
            nn.init.constant_(module.bias.data, 0.0)


In [0]:
from torch.utils.data import DataLoader, TensorDataset
def train(train_data, dev_data, model, batch_size, num_epochs, model_save_path, lr=0.0001):
  """Fine-tune `model` with Adam and cross-entropy, checkpointing on best dev loss.

  Every 500 batches the dev set is evaluated and a progress line is printed.
  When the dev loss improves on the best seen so far, the model's state_dict is
  saved to `model_save_path` and the line is marked with '*'. After the last
  epoch the dev set is evaluated once more and the result is printed.

  Args:
    train_data: (inputs, labels) tensors for training.
    dev_data: (inputs, labels) tensors passed to evaluate().
    model: module mapping a batch of inputs to class logits.
    batch_size: batch size for both training and evaluation.
    num_epochs: number of passes over the training data.
    model_save_path: file that receives the best state_dict.
    lr: Adam learning rate.
  """
  import os

  # Create the checkpoint folder up front so the first save cannot fail after
  # hundreds of batches of work.
  save_dir = os.path.dirname(model_save_path)
  if save_dir:
    os.makedirs(save_dir, exist_ok=True)

  train_inputs, train_outputs = train_data
  train_dataset = TensorDataset(train_inputs, train_outputs)
  start_time = time.time()
  optimizer = optim.Adam(model.parameters(), lr=lr)
  criterion = nn.CrossEntropyLoss()
  dev_per_batch = 500  # evaluate on the dev set every this many batches
  dev_best_loss = float('inf')
  total_batch = 0
  model.train()
  for epoch in range(num_epochs):
    print('epoch [{}/{}]'.format(epoch + 1, num_epochs))
    for (inputs, labels) in DataLoader(train_dataset, batch_size=batch_size, shuffle=True):
      total_batch += 1
      model.zero_grad()
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      if total_batch % dev_per_batch == 0:
        # Train accuracy/loss in the log line refer to the current batch only.
        true_labels = labels.data.cpu()
        predicts = torch.max(outputs.data, dim=1)[1].cpu().numpy()
        train_acc = metrics.accuracy_score(true_labels, predicts)
        time_dif = get_time_dif(start_time)
        dev_acc, dev_loss, report, confusion = evaluate(dev_data, model, batch_size)
        # evaluate() switches the model to eval mode; switch back before continuing.
        model.train()
        if dev_best_loss > dev_loss:
          dev_best_loss = dev_loss
          improve = '*'
          torch.save(model.state_dict(), model_save_path)
        else:
          improve = ' '
        msg = 'Epoch:{0:>2} Iter: {1:>6},  Train Loss: {2:>5.2},  Train Acc: {3:>6.2%},' \
                      '  Dev Loss: {4:>5.2},  Dev Acc: {5:>6.2%},  Time: {6} {7}'
        # Report the epoch 1-based so it matches the 'epoch [i/n]' header above.
        print(msg.format(epoch + 1, total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))

  # Report the final evaluation instead of computing it and throwing it away.
  final_acc, final_loss, final_report, final_confusion = evaluate(dev_data, model, batch_size)
  print('Final Dev Loss: {0:>5.2},  Final Dev Acc: {1:>6.2%}'.format(final_loss, final_acc))
  print(final_report)
  print(final_confusion)

In [0]:
def evaluate(dev_data, model, batch_size):
  """Evaluate `model` on `dev_data` without tracking gradients.

  The model is switched to eval mode and left in that mode on return.

  Args:
    dev_data: (inputs, labels) tensors.
    model: module mapping a batch of inputs to class logits.
    batch_size: evaluation batch size.

  Returns:
    (accuracy, mean cross-entropy loss per sample, classification report,
    confusion matrix).
  """
  model.eval()
  # Sum the per-sample losses and divide by the sample count at the end, so a
  # smaller final batch does not skew the average.
  criterion = nn.CrossEntropyLoss(reduction='sum')
  dev_inputs, dev_outputs = dev_data
  dev_dataset = TensorDataset(dev_inputs, dev_outputs)
  loss_total = 0.0
  label_chunks = []
  predict_chunks = []
  with torch.no_grad():
    for (inputs, labels) in DataLoader(dataset=dev_dataset, batch_size=batch_size, shuffle=False):
      outputs = model(inputs)
      loss_total += criterion(outputs, labels).item()
      predict_chunks.append(torch.max(outputs.data, dim=1)[1].cpu().numpy())
      label_chunks.append(labels.data.cpu().numpy())
  # Concatenate once instead of re-allocating the arrays on every batch.
  labels_all = np.concatenate(label_chunks) if label_chunks else np.array([], dtype=int)
  predicts_all = np.concatenate(predict_chunks) if predict_chunks else np.array([], dtype=int)
  acc = metrics.accuracy_score(labels_all, predicts_all)
  report = metrics.classification_report(labels_all, predicts_all, digits=4)
  confusion = metrics.confusion_matrix(labels_all, predicts_all)
  return acc, loss_total / len(dev_inputs), report, confusion

In [0]:
def predict(data_x, ids_all, model, batch_size, output_path, model_path='save_model/bert_model'):
    """Load saved weights, predict a label for every sample, and write a CSV.

    Args:
        data_x: dataset whose items are (inputs, placeholder) pairs; the second
            element of each batch is ignored.
        ids_all: one id per sample, written to the 'id' column.
        model: module mapping a batch of inputs to class logits; its weights
            are overwritten with the state_dict stored at `model_path`.
        batch_size: inference batch size.
        output_path: destination CSV with columns 'id' and 'y'.
        model_path: checkpoint file to load before predicting.

    The predicted class index is shifted by -1 before it is written to 'y'.
    """
    # Load onto the device the model already lives on, so a checkpoint saved on
    # GPU can still be read on a CPU-only machine.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None
    model.load_state_dict(torch.load(model_path, map_location=map_location))
    model.eval()
    start_time = time.time()
    predicts_all = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for inputs, _ in tqdm(DataLoader(dataset=data_x, batch_size=batch_size, shuffle=False)):
            outputs = model(inputs)
            predicts_all.extend(torch.max(outputs.data, dim=1)[1].cpu().numpy() - 1)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
    result_pd = pd.DataFrame(
        {
            'id': ids_all,
            'y': predicts_all
        }
    )
    result_pd.to_csv(output_path, index=False)
    print("finish !")

In [0]:
import time
from datetime import timedelta
def get_time_dif(start_time):
    """Return the time elapsed since `start_time`, rounded to whole seconds.

    Args:
        start_time: a timestamp as returned by `time.time()`.

    Returns:
        A `timedelta` holding the rounded number of elapsed seconds.
    """
    elapsed_seconds = time.time() - start_time
    return timedelta(seconds=int(round(elapsed_seconds)))

In [0]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn import metrics
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

In [17]:
# Build the classifier and move it to `device`. The third argument (140) repeats
# max_seq_len; the class's __init__ accepts it but does not use it.
model = RoBertCLSLAST3(bert_model_path, bert_config, 140).to(device)

Linear
Linear


In [0]:
# Fine-tune with batch size 16 for up to 10 epochs at lr=1e-5. train() saves the
# weights with the lowest dev loss so far to 'save_model/bert_model'.
train(train_data, dev_data, model, 16, 10, 'save_model/bert_model', lr=0.00001)

epoch [1/10]
Epoch: 0 Iter:    500,  Train Loss:  0.73,  Train Acc: 56.25%,  Dev Loss:  0.63,  Dev Acc: 72.88%,  Time: 0:06:18 *
Epoch: 0 Iter:   1000,  Train Loss:   1.2,  Train Acc: 62.50%,  Dev Loss:  0.59,  Dev Acc: 74.21%,  Time: 0:15:21 *
Epoch: 0 Iter:   1500,  Train Loss:  0.61,  Train Acc: 75.00%,  Dev Loss:  0.58,  Dev Acc: 74.71%,  Time: 0:24:12 *
Epoch: 0 Iter:   2000,  Train Loss:  0.37,  Train Acc: 81.25%,  Dev Loss:  0.57,  Dev Acc: 74.85%,  Time: 0:33:03 *
Epoch: 0 Iter:   2500,  Train Loss:  0.89,  Train Acc: 62.50%,  Dev Loss:  0.58,  Dev Acc: 74.89%,  Time: 0:41:54  
Epoch: 0 Iter:   3000,  Train Loss:  0.37,  Train Acc: 81.25%,  Dev Loss:  0.57,  Dev Acc: 75.26%,  Time: 0:50:40 *
Epoch: 0 Iter:   3500,  Train Loss:  0.24,  Train Acc: 100.00%,  Dev Loss:  0.59,  Dev Acc: 74.13%,  Time: 0:59:31  
Epoch: 0 Iter:   4000,  Train Loss:  0.57,  Train Acc: 75.00%,  Dev Loss:  0.56,  Dev Acc: 75.54%,  Time: 1:08:17 *
Epoch: 0 Iter:   4500,  Train Loss:  0.47,  Train Acc: 81.

KeyboardInterrupt: ignored

In [0]:
def load_test_data(file_path, tokenizer, max_seq_len, device):
  """Read an unlabelled CSV and tokenize its '微博中文内容' column.

  Args:
    file_path: CSV file containing the text column.
    tokenizer, max_seq_len, device: forwarded unchanged to tokenize_data().

  Returns:
    Whatever tokenize_data() returns for that column.
  """
  text_column = '微博中文内容'
  text_only_df = pd.read_csv(file_path)[[text_column]]
  return tokenize_data(text_only_df, tokenizer, text_column, max_seq_len, device)

In [19]:
# Tokenize the test set into a tensor on `device`.
test_input = load_test_data('data/test_weibo_clean.csv', tokenizer, max_seq_len, device)
# Read the CSV again to keep the '微博id' column that is handed to predict() as ids.
test_df = pd.read_csv('data/test_weibo_clean.csv')
# predict() unpacks (inputs, _) from every batch, so pair the inputs with
# throw-away random values that act as placeholder labels.
test_tensor = TensorDataset(test_input, torch.rand(test_input.size()[0]))

100%|██████████| 10000/10000 [00:05<00:00, 1938.24it/s]


In [20]:
# Predict the test set in batches of 16 and write 'bert_ans.csv'. predict() first
# reloads the weights saved at 'save_model/bert_model'.
predict(test_tensor, test_df['微博id'], model, 16, 'bert_ans.csv')

100%|██████████| 625/625 [02:30<00:00,  4.17it/s]

Time usage: 0:02:30
finish !





In [22]:
# List the running processes with their CPU and memory usage.
!ps -aux

USER         PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root           1  0.0  0.0  39192  6308 ?        Ss   03:32   0:00 /bin/bash -e 
root           8  0.2  0.3 681620 50280 ?        Sl   03:32   0:01 /tools/node/b
root          18  0.5  0.7 405212 101224 ?       Sl   03:32   0:03 /usr/bin/pyth
root         114  0.0  0.0  35888  4804 ?        Ss   03:33   0:00 tail -n +0 -F
root         122 13.6 44.2 31220748 5899048 ?    Ssl  03:33   1:10 /usr/bin/pyth
root         158  0.0  0.0  18376  1500 ?        S    03:33   0:00 /bin/bash --n
root         159  0.0  0.1 1124572 15288 ?       Sl   03:33   0:00 /opt/google/d
root         160  0.0  0.0  11464  1024 ?        S    03:33   0:00 grep --color=
root         162  1.8  0.5 2495036 70076 ?       Sl   03:33   0:09 /opt/google/d
root         176  0.0  0.0      0     0 ?        Z    03:34   0:00 [fusermount] 
root         212  0.0  0.0  18376  3088 ?        S    03:34   0:00 bash -c tail 
root         213  0.0  0.0   4568 

In [0]:
# Force-kill PID 122, the python process shown at 44.2 %MEM in the `ps` listing above.
# NOTE(review): the PID is specific to that session; re-running this cell would
# target whatever process holds PID 122 at that time.
!kill -9 122