In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 29.0 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 74.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


In [2]:
import argparse
import random
from transformers import BertTokenizer
import re
import torch
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForNextSentencePrediction, AdamW, BertConfig
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup
import torch.nn as nn


def MarginRankingLoss(p_scores, n_scores):
    margin = 1
    scores = margin - p_scores + n_scores
    scores = scores.clamp(min=0)

    return scores.mean()

device = 0
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

sample_num_memory = []
id_inputs = []

#for line in open('/Users/linzi/Desktop/dialogue_test/training_data/dailydial/dailydial_sample_num.txt'):
for line in open('/content/drive/MyDrive/PA NLP/input_txt/dailydial_sample_num.txt',encoding="utf-8"):
    line = line.strip()
    sample_num_memory.append(int(line))

#for line in open('/Users/linzi/Desktop/dialogue_test/training_data/dailydial/dailydial_pairs.txt'):
for line in open('/content/drive/MyDrive/PA NLP/input_txt/dailydial_pairs.txt',encoding="utf-8"):
    line = line.strip().split('\t\t')
    sent1 = line[0]
    sent2 = line[1]
    encoded_sent1 = tokenizer.encode(sent1, add_special_tokens = True, max_length = 128, return_tensors = 'pt')
    encoded_sent2 = tokenizer.encode(sent2, add_special_tokens = True, max_length = 128, return_tensors = 'pt')
    encoded_pair = encoded_sent1[0].tolist() + encoded_sent2[0].tolist()[1:]
    id_inputs.append(torch.Tensor(encoded_pair))

print('Max sentence length: ', max([len(sen) for sen in id_inputs]))

MAX_LEN = 256
print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
id_inputs = pad_sequences(id_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

attention_masks = []
for sent in id_inputs:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)

# group samples .....
grouped_inputs = []; grouped_masks = []
count = 0
for i in sample_num_memory:
    grouped_inputs.append(id_inputs[count: count+i])
    grouped_masks.append(attention_masks[count: count+i])
    count = count + i
print('The group number is: '+ str(len(grouped_inputs)))
# generate pos/neg pairs ....
print('start generating pos and neg pairs ... ')
pos_neg_pairs = []; pos_neg_masks = []
for i in range(len(grouped_inputs)):
    if len(grouped_inputs[i]) == 2:
        pos_neg_pairs.append(grouped_inputs[i])
        pos_neg_masks.append(grouped_masks[i])
    else:
        pos_neg_pairs.append([grouped_inputs[i][0], grouped_inputs[i][1]])
        pos_neg_pairs.append([grouped_inputs[i][0], grouped_inputs[i][2]])
        pos_neg_pairs.append([grouped_inputs[i][1], grouped_inputs[i][2]])
        pos_neg_masks.append([grouped_masks[i][0], grouped_masks[i][1]])
        pos_neg_masks.append([grouped_masks[i][0], grouped_masks[i][2]])
        pos_neg_masks.append([grouped_masks[i][1], grouped_masks[i][2]])

print('there are '+str(len(pos_neg_pairs))+' samples been generated...')
fake_labels = [0]*len(pos_neg_pairs)

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(pos_neg_pairs, fake_labels, random_state=2018, test_size=0.8)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(pos_neg_masks, fake_labels, random_state=2018, test_size=0.8)

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

batch_size = 12
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

coherence_prediction_decoder = []
coherence_prediction_decoder.append(nn.Linear(768, 768))
coherence_prediction_decoder.append(nn.ReLU())
coherence_prediction_decoder.append(nn.Dropout(p=0.1))
coherence_prediction_decoder.append(nn.Linear(768, 2))
coherence_prediction_decoder = nn.Sequential(*coherence_prediction_decoder)
coherence_prediction_decoder.to(device)

model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = True)
model.cuda(device)
optimizer = AdamW(list(model.parameters())+list(coherence_prediction_decoder.parameters()), lr = 2e-5, eps = 1e-8)

epochs = 10
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)


for epoch_i in range(0, epochs):

    total_loss = 0

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_loss = 0

    model.train()
    coherence_prediction_decoder.train()

    for step, batch in enumerate(train_dataloader):

        if step % 1000 == 0 and not step == 0:
            print(str(step)+' steps done....')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        model.zero_grad()
        coherence_prediction_decoder.zero_grad()

        pos_scores = model(b_input_ids[:,0,:], attention_mask=b_input_mask[:,0,:])
        pos_scores = pos_scores[1][-1][:,0,:]
        pos_scores = coherence_prediction_decoder(pos_scores)

        neg_scores = model(b_input_ids[:,1,:], attention_mask=b_input_mask[:,1,:])
        neg_scores = neg_scores[1][-1][:,0,:]
        neg_scores = coherence_prediction_decoder(neg_scores)

        #loss = MarginRankingLoss(pos_scores[0][:,0], neg_scores[0][:,0])
        loss = MarginRankingLoss(pos_scores[:,0], neg_scores[:,0])

        total_loss += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(list(model.parameters())+list(coherence_prediction_decoder.parameters()), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print('=========== the loss for epoch '+str(epoch_i)+' is: '+str(avg_train_loss))

    print("")
    print("Running Validation...")

    model.eval()
    coherence_prediction_decoder.eval()

    all_pos_scores = []
    all_neg_scores = []

    for step, batch in enumerate(validation_dataloader):

        if step % 1000 == 0 and not step == 0:
            print(str(step)+' steps done....')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        with torch.no_grad():
            pos_scores = model(b_input_ids[:,0,:], attention_mask=b_input_mask[:,0,:])
            pos_scores = pos_scores[1][-1][:,0,:]
            pos_scores = coherence_prediction_decoder(pos_scores)
            neg_scores = model(b_input_ids[:,1,:], attention_mask=b_input_mask[:,1,:])
            neg_scores = neg_scores[1][-1][:,0,:]
            neg_scores = coherence_prediction_decoder(neg_scores)

        #all_pos_scores += pos_scores[0][:,0].detach().cpu().numpy().tolist()
        #all_neg_scores += neg_scores[0][:,0].detach().cpu().numpy().tolist()
        all_pos_scores += pos_scores[:,0].detach().cpu().numpy().tolist()
        all_neg_scores += neg_scores[:,0].detach().cpu().numpy().tolist()

    labels = []

    for i in range(len(all_pos_scores)):
        if all_pos_scores[i] > all_neg_scores[i]:
            labels.append(1)
        else:
            labels.append(0)

    print(sum(labels)/float(len(all_pos_scores)))

    PATH = '/content/drive/MyDrive/PA NLP/all_model/torch_saved_bert_'+str(epoch_i)
    torch.save(model.state_dict(), PATH)

    model.save_pretrained('/content/drive/MyDrive/PA NLP/all_model/model_pretrained_bert_'+str(epoch_i)+'/')
    tokenizer.save_pretrained('/content/drive/MyDrive/PA NLP/all_model/tokenizer_pretrained_bert_'+str(epoch_i)+'/')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Loading BERT tokenizer...


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


KeyboardInterrupt: ignored

In [3]:
!pip install segeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting segeval
  Downloading segeval-2.0.11.tar.gz (39 kB)
Building wheels for collected packages: segeval
  Building wheel for segeval (setup.py) ... [?25l[?25hdone
  Created wheel for segeval: filename=segeval-2.0.11-py3-none-any.whl size=60284 sha256=9e81d21f81773caa7a5f4f96bcd46dc5c19af0d77d4920e12947dbdf7ff4bc1c
  Stored in directory: /root/.cache/pip/wheels/a3/d4/b7/15ed9a8c1c816a749e3b40b00a1c905d95ec3f362098192010
Successfully built segeval
Installing collected packages: segeval
Successfully installed segeval-2.0.11


In [10]:
torch.cuda.empty_cache()

In [11]:
import os
import numpy as np
from numpy import random as np_random
#import random
import copy
import itertools
from os import listdir
from os.path import isfile, join
import shutil
import segeval
import re
from transformers import BertTokenizer
import torch
from keras.preprocessing.sequence import pad_sequences
from transformers import BertForNextSentencePrediction
import statistics
from sklearn.metrics import mean_absolute_error, f1_score

def depth_score_cal(scores):
	output_scores = []
	for i in range(len(scores)):
		lflag = scores[i]; rflag = scores[i];
		if i == 0:
			hl = scores[i]
			for r in range(i+1,len(scores)):
				if rflag <= scores[r]:
					rflag = scores[r]
				else:
					break
		elif i == len(scores):
			hr = scores[i]
			for l in range(i-1, -1, -1):
				if lflag <= scores[l]:
					lflag = scores[l]
				else:
					break
		else:
			for r in range(i+1,len(scores)):
				if rflag <= scores[r]:
					rflag = scores[r]
				else:
					break
			for l in range(i-1, -1, -1):
				if lflag <= scores[l]:
					lflag = scores[l]
				else:
					break
		depth_score = 0.5*(lflag+rflag-2*scores[i])
		output_scores.append(depth_score)

	return output_scores



device = 0
MODEL_PATH = '/content/drive/MyDrive/PA NLP/all_model/model_pretrained_bert_0'
#MODEL_PATH = 'bert-base-uncased'
model = BertForNextSentencePrediction.from_pretrained(MODEL_PATH, num_labels = 2, output_attentions = False, output_hidden_states = False)
model.cuda(device)
'''
MODEL_PATH = '/scratch/linzi/bert_9'
model.load_state_dict(torch.load(MODEL_PATH ,map_location=device))
'''
model.eval()

# path_input_docs = '/ubc/cs/research/nlp/Linzi/dailydial/doc2dial_data/'
# input_files = [f for f in listdir(path_input_docs) if isfile(join(path_input_docs, f))]
input_files = ['/content/drive/MyDrive/PA NLP/input_txt/dialogues_text.txt']
print('Loading BERT tokenizer...')
# tokenizer = BertTokenizer.from_pretrained(MODEL_PATH, do_lower_case=True)
tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/PA NLP/all_model/tokenizer_pretrained_bert_0", do_lower_case=True)

c = 0
pick_num = 3
score_wd = 0; score_mae = 0; score_f1 = 0; score_pk = 0;
dp_var = []

for file in input_files:

	if file not in ['.DS_Store', '196']:
	#if file not in ['.DS_Store']:
		print('*********** The current file is : '+ file + '***********')
		text = []
		id_inputs = []
		depth_scores = []
		seg_r_labels = []; seg_r = [];
		tmp = 0
		for line in open(file):
			if '================' not in line.strip():
				text.append(line.strip())
				seg_r_labels.append(0)
				tmp += 1
			else:
				seg_r_labels[-1] = 1
				seg_r.append(tmp)
				tmp = 0
				
		seg_r.append(tmp)

		for i in range(len(text)-1):
			sent1 = text[i]
			sent2 = text[i+1]
			encoded_sent1 = tokenizer.encode(sent1, add_special_tokens = True, max_length = 128, return_tensors = 'pt')
			encoded_sent2 = tokenizer.encode(sent2, add_special_tokens = True, max_length = 128, return_tensors = 'pt')
			encoded_pair = encoded_sent1[0].tolist() + encoded_sent2[0].tolist()[1:]
			id_inputs.append(torch.Tensor(encoded_pair))

		MAX_LEN = 256
		id_inputs = pad_sequences(id_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
		attention_masks = []
		for sent in id_inputs:
			att_mask = [int(token_id > 0) for token_id in sent]
			attention_masks.append(att_mask)

		test_inputs = torch.tensor(id_inputs).to(device)
		test_masks = torch.tensor(attention_masks).to(device)

		scores = model(test_inputs, attention_mask=test_masks)
		scores = torch.sigmoid(scores[0][:,0]).detach().cpu().numpy().tolist()

		depth_scores = depth_score_cal(scores)
		#print(depth_scores)

		#boundary_indice = np.argsort(np.array(depth_scores))[-pick_num:]
	
		threshold = sum(depth_scores)/(len(depth_scores))-0.1*statistics.stdev(depth_scores)
		dp_var.append(statistics.stdev(depth_scores))
		boundary_indice = []
	
		seg_p_labels = [0]*(len(depth_scores)+1)
		
		for i in range(len(depth_scores)):
			if depth_scores[i] > threshold:
				boundary_indice.append(i)
		
		for i in boundary_indice:
			seg_p_labels[i] = 1

		tmp = 0; seg_p = []
		for fake in seg_p_labels:
			if fake == 1:
				tmp += 1
				seg_p.append(tmp)
				tmp = 0
			else:
				tmp += 1
		seg_p.append(tmp)

		#print(depth_scores)
		#print(threshold)
		#print(seg_p)
		#print(seg_r)

		score_wd += segeval.window_diff(seg_p, seg_r)
		score_pk += segeval.pk(seg_p, seg_r)
		score_mae += sum(list(map(abs, np.array(seg_r_labels)-np.array(seg_p_labels))))
		score_f1 += f1_score(seg_r_labels, seg_p_labels, labels = [0,1], average='macro')
		print(c)
		print(seg_r_labels)
		print(seg_p_labels)
		c += 1

print(c)
print('pk: ', score_pk/c)
print('wd: ', score_wd/c)
print('mae: ', score_mae/c)
print('f1: ', score_f1/c)
print('dp variance: ', sum(dp_var)/c)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Loading BERT tokenizer...
*********** The current file is : /content/drive/MyDrive/PA NLP/input_txt/dialogues_text.txt***********


RuntimeError: ignored