In [169]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertAdam
import numpy as np
from tqdm import tqdm
import pandas as pd
file = pd.read_csv('quora.tsv', '\t')

In [208]:
def convert_lines(sentence, max_seq_length, tokenizer):
    max_seq_length -=2
    all_tokens = []
    for text in tqdm(sentence):
        tokens = tokenizer.tokenize(text)
        if len(tokens)>max_seq_length:
            tokens = tokens[:max_seq_length]
        padding_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens+["[SEP]"])+[0] * (max_seq_length - len(tokens))
        all_tokens.append(padding_token)
    return np.array(all_tokens)

In [209]:
max_sequence_length = 200
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir=None, do_lower_case=True)
#file['question1'] = file['question1'].astype(str) 
question1 = convert_lines(file["question1"], max_sequence_length, tokenizer)
#file['question2'] = file['question2'].astype(str)
question2 = convert_lines(file["question2"], max_sequence_length, tokenizer)
label = np.array(file['is_duplicate']) 

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/LYB/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
















  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















100%|██████████| 10/10 [00:00<00:00, 3218.71it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















100%|██████████| 10/10 [00:00<00:00, 3012.93it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

In [210]:
import torch.utils.data.dataset as Dataset
import torch.utils.data.dataloader as DataLoader
#创建子类
class modelDataset(Dataset.Dataset):
    #初始化，定义数据内容和标签
    def __init__(self, data1, data2, label):
        self.data1 = data1
        self.data2 = data2
        self.label = label
    #返回数据集大小
    def __len__(self):
        return len(self.label)
    #得到数据内容和标签
    def __getitem__(self, index):
        data1 = torch.LongTensor(self.data1[index])
        data2 = torch.LongTensor(self.data2[index])
        label = torch.Tensor([self.label[index]])
        if torch.cuda.is_available():
            data1 = data1.cuda()
            data2 = data2.cuda()
            label = label.cuda()
        return [data1, data2], label 

In [211]:
dataset = modelDataset(question1, question2, label)

In [218]:
class matchingModel(nn.Module):
    def __init__(self, pre_trained_model):
        super(matchingModel, self).__init__()
        
        self.model_query = BertModel.from_pretrained(pre_trained_model)
        self.model_bidword = BertModel.from_pretrained(pre_trained_model)
        self.dropout = nn.Dropout(p = 0.2)
        self.fc1 = nn.Linear(768*4, 256)
        self.fc2 = nn.Linear(256, 1) 
        
    def infer_metrics(self, x1, x2):
        return torch.cat((x1, x2, torch.abs(x1 - x2), x1 * x2), 1)
    
    def forward(self, input_data):
        
        query_vec = self.model_query(input_data[0])
        bidword_vec = self.model_bidword(input_data[1])
#         similarity_scores = self.infer_metrics(query_vec[1], bidword_vec[1])
#         similarity_scores = self.dropout(torch.tanh(self.fc1(similarity_scores)))
#         similarity_scores = self.fc2(similarity_scores)

        # global average pooling
#         avg_pool_query = torch.mean(query_vec[0][0], 1)
#         avg_pool_bidword = torch.mean(bidword_vec[0][0], 1)
#         # global max pooling
#         max_pool_query, _ = torch.max(query_vec[0][0], 1)
#         max_pool_bidword, _ = torch.max(bidword_vec[0][0], 1)
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        output = cos(query_vec[1], bidword_vec[1])
        
        return output 
 

In [223]:
lr=2e-5
batch_size = 2
accumulation_steps=2
save_steps = 1000
checkpoint = None
EPOCHS = 10
num_train_optimization_steps = int(EPOCHS*len(label) / batch_size / accumulation_steps)
optimizer = BertAdam(model.parameters(),
                     lr=lr,  
                     warmup=0.05,   
                     t_total=num_train_optimization_steps)

model = matchingModel('bert-base-uncased')
model.train() 
criterion = nn.BCELoss()
for epoch in tqdm(range(EPOCHS)): 
#     file_name = 'loss_log_' + 'epoch' + str(epoch) + '.txt'
#     file = open(file_name, 'w', encoding='utf-8')
    train_loader = DataLoader.DataLoader(dataset, batch_size= batch_size, shuffle = True) 
    avg_loss = 0
    avg_accuracy = 0
    optimizer.zero_grad()   
    for x_batch, y_batch in train_loader:
        y_pred = model(x_batch)
        loss = criterion(y_pred, y_batch)  
        loss.backward()  
        optimizer.step() 
        optimizer.zero_grad()
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean(((y_pred>0.5) == (y_batch>0.5)).to(torch.float)).item()/len(train_loader)
#         i += 1
#         file.write('batch' + str(i) + '\t' + 'avg_loss' + '=' + str(avg_loss) + '\t' + 'avg_accuracy' + '=' + str(avg_accuracy) + '\n')
#     file.close()
#     file_path = output_model_file + str(epoch) +'.bin'
#     torch.save(model.state_dict(), file_path)
    print(f'loss:{avg_loss} accuracy:{avg_accuracy}') 

INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/LYB/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
INFO:pytorch_pretrained_bert.modeling:extracting archive file /Users/LYB/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/r0/q62_cr2d4yb7h48zdx8tdrs00000gp/T/tmpsx01fuko
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:pytorch_pretr

loss:2.668728470802307 accuracy:0.2






















 20%|██        | 2/10 [00:44<02:58, 22.27s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

loss:2.700250768661499 accuracy:0.2






















 30%|███       | 3/10 [01:07<02:36, 22.42s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

loss:2.950053668022156 accuracy:0.2






















 40%|████      | 4/10 [01:29<02:14, 22.50s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

loss:2.637410807609558 accuracy:0.2






















 50%|█████     | 5/10 [01:52<01:52, 22.57s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

loss:2.76489737033844 accuracy:0.2






















 60%|██████    | 6/10 [02:13<01:28, 22.22s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

loss:2.625834274291992 accuracy:0.2


KeyboardInterrupt: 