In [1]:
import os
import json
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from src.models.model import SummarizerModel
from src.utils.data_preprocess import convert_examples_to_features
from tqdm.notebook import tqdm


In [2]:
with open(os.path.join('..', 'notebooks', 'labels_corrected.json'), 'r') as json_file:
    data = json.load(json_file)
    labels = {int(k): data[k] for k in data}

In [16]:
p_1 = []
p_2 = []

for k in labels:
    p_1.append(' '.join(labels[k]['Person1']))
    p_2.append(' '.join(labels[k]['Person2']))

In [17]:
p_1[0]

"Mr Smith's getting a check-up."

In [15]:
from difflib import SequenceMatcher

def embed(sents, model):
    embeddings = []
    for s in sents:
        embeddings.append(model.encode(s, convert_to_tensor=True))
        
    return embeddings


def similar(p1_sents, p2_sents):
    model = SentenceTransformer('all-MiniLM-L6-v2').cuda()
    embeddings_p1 = model.encode(p1_sents, convert_to_tensor=True)
    embeddings_p2 = model.encode(p2_sents, convert_to_tensor=True)
    
    cosine_scores = util.cos_sim(embeddings_p1, embeddings_p2)
    
    return cosine_scores.cpu()

In [9]:
cosine_scores = similar(p_1, p_2)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/sentence-transformers/all-MiniLM-L6-v2 HTTP/1.1" 200 1609
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/3746fd5f4cfd46ae64fc781df53e7cbb7849eb62/.gitattributes HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/3746fd5f4cfd46ae64fc781df53e7cbb7849eb62/1_Pooling/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /se

Batches:   0%|          | 0/417 [00:00<?, ?it/s]

Batches:   0%|          | 0/417 [00:00<?, ?it/s]

In [18]:
sims = []
for i in tqdm(range(cosine_scores.shape[0])):
    sims.append(cosine_scores[i, i].item())

  0%|          | 0/13324 [00:00<?, ?it/s]

In [21]:
with open('predictions.json', 'r') as json_file:
    predictions = json.load(json_file)

In [28]:
pred_1 = []
pred_2 = []

for k in predictions:
    pred_1.append(predictions[k]['Person1'])
    pred_2.append(predictions[k]['Person2'])

In [31]:
cosine_scores_preds = similar(pred_1, pred_2)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/sentence-transformers/all-MiniLM-L6-v2 HTTP/1.1" 200 1609
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/3746fd5f4cfd46ae64fc781df53e7cbb7849eb62/.gitattributes HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/3746fd5f4cfd46ae64fc781df53e7cbb7849eb62/1_Pooling/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /se

Batches:   0%|          | 0/587 [00:00<?, ?it/s]

Batches:   0%|          | 0/587 [00:00<?, ?it/s]

In [35]:
pred_sims = []
for i in tqdm(range(cosine_scores_preds.shape[0])):
    pred_sims.append(cosine_scores_preds[i, i].item())

  0%|          | 0/18759 [00:00<?, ?it/s]

In [44]:
sim_df = pd.DataFrame({'sim_label': sims})
sim_pred_df = pd.DataFrame({'sim_pred': pred_sims})
sim_df.describe()

Unnamed: 0,sim_label
count,13324.0
mean,0.778744
std,0.244085
min,-0.115191
25%,0.56211
50%,0.890051
75%,1.0
max,1.000001


In [43]:
sim_pred_df.describe()

Unnamed: 0,sim_pred
count,18759.0
mean,1.0
std,2.565309e-07
min,0.999999
25%,0.9999999
50%,1.0
75%,1.0
max,1.000001


In [6]:
dists = []
for p1, p2 in zip(p_1, p_2):
    s1 = ''.join(p1)
    s2 = ''.join(p2)
    dists.append(similar(p1, p2))

In [7]:
mean_distance = sum(dists) / len(dists)

In [4]:
model_name = "Salesforce/bart-large-xsum-samsum"
#model_name = 'facebook/bart-large-xsum'
params = {
    'model_name': model_name,
    'load_path': None,
    'add_module_loss': None,
    'add_functurn_loss': None
}

model = SummarizerModel(params)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /Salesforce/bart-large-xsum-samsum/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /Salesforce/bart-large-xsum-samsum/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /Salesforce/bart-large-xsum-samsum/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /Salesforce/bart-large-xsum-samsum/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:

In [9]:
weights_path = os.path.join('models', 'penalty_salesforce_2', 'best_pytorch.bin')
model.load_state_dict(torch.load(weights_path))


<All keys matched successfully>

In [10]:
dialogsum_path = os.path.join('data', 'processed', 'new_dialogsum_shuffled')
with open(os.path.join(dialogsum_path, 'test.json'), 'r') as test, open(os.path.join(dialogsum_path, 'train.json'), 'r') as train, open(os.path.join(dialogsum_path, 'eval.json'), 'r') as val:
    test_data = json.load(test)
    train_data = json.load(train)
    val_data = json.load(val)
    

In [11]:
total_data = test_data + train_data + val_data
len(total_data)

13324

In [5]:
with open(os.path.join(dialogsum_path, 'total.json'), 'w') as file:
    json.dump(total_data, file, indent=4)

NameError: name 'dialogsum_path' is not defined

In [13]:
from src.utils.data_preprocess import load_examples, convert_examples_to_features

val_path = os.path.join(dialogsum_path, 'total.json')
batch_size = 1

class Args:
    do_segment = True
    do_train = True
    output_dir='models/save/penalty_mini_dialogsum_test_3'
    use_pred_segment = False
    #train_file_path = train_path
    dev_file_path = val_path
    oracle_functurn_context = False
    source_max_len = 512
    gen_keyphrase_summary = True
    target_max_len = 50
    add_module_loss = False
    add_functurn_loss = False
    train_batch_size = batch_size
    gradient_accumulation_steps = 1
    num_train_epochs = 30
    warmup_proportion = 0.1
    patience = 8
    model_name = 'mini_dialogsum_1'
    max_grad_norm = 1.0
    validation_timing = 1
    eval_batch_size = batch_size
    no_repeat_ngram_size = 0
    beam = 4
    test_target_max_len = 50
    wandb = False
    learning_rate = 5e-5
    adam_epsilon = 1e-8
    seed = 42
    penalty_term = 1
    k_fold_cross_validation = False
    
args = Args()
dev_examples = load_examples(args, args.dev_file_path)
dev_features = convert_examples_to_features(args, model.config, model.tokenizer, dev_examples)
dev_data = (dev_examples, dev_features)

Examples: 100%|██████████| 22302/22302 [00:32<00:00, 681.32it/s]

[INFO] max_target_len 0





In [14]:
from src.models.evaluate import predict
model.cuda()
preds = predict(args, model, dev_data)


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
Generating: 100%|██████████| 22302/22302 [5:54:44<00:00,  1.05it/s]  


In [15]:
predictions = preds[0]

In [18]:
len(predictions), len(dev_examples)

(18759, 22302)

In [26]:
for k in predictions:
    p_1 = predictions[k]['Person1']
    p_2 = predictions[k]['Person2']
    if p_1 != p_2:
        print(k)
        break

In [27]:
with open('predictions.json', 'w') as json_file:
    json.dump(predictions, json_file, indent=4)

In [7]:
with open('predictions.json', 'r') as json_file:
    data = json.load(json_file)

In [9]:
data[max(data)]

{'Person1': "#Person1# wants to know if they have the same style in white, but they don't have any on hand today. #Person2# invites her to come on Friday.",
 'Person2': "#Person1# wants to know if they have the same style in white, but they don't have any on hand today. #Person2# invites her to come on Friday."}

In [13]:
with open('./models/penalty_salesforce_2/test.pred.summary.json', 'r') as file:
    data = json.load(file)
    data = {int(k): v for k, v in data.items()}

In [18]:
preds_1 = []
preds_2 = []
for k, v in data.items():
    preds_1.append(v['Person1'])
    preds_2.append(v['Person2'])

In [19]:
cosine_scores_preds = similar(preds_1, preds_2)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/sentence-transformers/all-MiniLM-L6-v2 HTTP/1.1" 200 1609
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/3746fd5f4cfd46ae64fc781df53e7cbb7849eb62/.gitattributes HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/3746fd5f4cfd46ae64fc781df53e7cbb7849eb62/1_Pooling/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /se

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [21]:
test_pred_sims = []
for i in tqdm(range(cosine_scores_preds.shape[0])):
    test_pred_sims.append(cosine_scores_preds[i, i].item())

  0%|          | 0/496 [00:00<?, ?it/s]

In [22]:
sim_test_pred_df = pd.DataFrame({'test_sim_pred': test_pred_sims})
sim_test_pred_df.describe()

Unnamed: 0,test_sim_pred
count,496.0
mean,0.83045
std,0.254774
min,-0.006409
25%,0.787907
50%,0.946725
75%,1.0
max,1.000001
