# 1. Dataset Loading and Environment Setup

In [1]:
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


In [2]:
!gdown 18d8LMm61ddRcHLvbncBJjxkA4LijFT-k
!gdown 1dY2Rvck-rsmyuqrciHLsteRurfptK9De
!gdown 1DI9mDSCfOZUc6FIwSdIo7KQPXmz3tYAb

Downloading...
From: https://drive.google.com/uc?id=18d8LMm61ddRcHLvbncBJjxkA4LijFT-k
To: /content/train.csv
100% 6.25M/6.25M [00:00<00:00, 21.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1dY2Rvck-rsmyuqrciHLsteRurfptK9De
To: /content/dev.csv
100% 816k/816k [00:00<00:00, 131MB/s]
Downloading...
From: https://drive.google.com/uc?id=1DI9mDSCfOZUc6FIwSdIo7KQPXmz3tYAb
To: /content/test.csv
100% 777k/777k [00:00<00:00, 137MB/s]


In [3]:
%%capture
!pip install datasets
!pip install transformers
!pip install --upgrade accelerate
!pip install evaluate
!pip install datsets transformers[sentencepiece]
!pip install sentencepiece
!pip install rouge-score
!pip install bert_score
!python -m spacy download en_core_web_lg

In [30]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import csv
from collections import defaultdict
from rouge_score import rouge_scorer
from evaluate import load
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
from tqdm.notebook import tqdm

import tensorflow_hub as hub
from tensorflow import keras
import tensorflow as tf
from IPython.display import Image
import matplotlib.pyplot as plt


nlp = spacy.load('en_core_web_lg')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device_id = 0 if str(device) == 'cuda' else -1

In [6]:
train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')
test_df = pd.read_csv('test.csv')

# 2. Creating a Custom Dataset

In [7]:
def get_dicts(df, folder="test"):
  sents_dict = {}
  doc_dict = { i: {"article": df.reviewText[i], "highlight": df.summary[i]} for i in df.index }
  raw_docs = [ doc_dict[k]["article"] for k in doc_dict.keys()]

  doc_sents = {}
  sents_list = []
  raw_sents = []
  i = 0
  min_sent_length = 0
  for k in tqdm(doc_dict.keys()):
    article = doc_dict[k]["article"]
    highlight = doc_dict[k]["highlight"]
    sents = nlp(article).sents
    doc_sent_ids = []
    for sent in sents:
      if (len(sent)) > min_sent_length:
        sents_dict[i] = {"docid":k, "text": str(sent)}
        sents_list.append({"sentid":i, "docid":k, "text": str(sent) })
        raw_sents.append(str(sent))
        i += 1

  return doc_dict, sents_list

test_doc_dict, test_sents_list = get_dicts(test_df)
dev_doc_dict, dev_sents_list = get_dicts(dev_df)
train_doc_dict, train_sents_list = get_dicts(train_df)

  0%|          | 0/1388 [00:00<?, ?it/s]

  0%|          | 0/1387 [00:00<?, ?it/s]

  0%|          | 0/11095 [00:00<?, ?it/s]

In [8]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def get_rougue_score(text, highlights, metric="rougeL"):
  max_score = 0
  for h_text in highlights:
    score =  scorer.score(text, h_text)[metric].fmeasure
    # print(score, text, "\n \t" , h_text)
    if score > max_score:
      max_score = score
  return max_score


def get_label(sent, doc_dict,  score_threshold = 0.55):
  sent_id, doc_id, sentence = sent["sentid"], sent["docid"], sent["text"]
  highlights = doc_dict[doc_id]["highlight"].split("\n")
  doc = doc_dict[doc_id]["article"]

  label_score = get_rougue_score(sentence, highlights)
  # Normalize label to 0/1 based on rogue score threshold
  label_score = 0 if label_score < score_threshold else 1
  return (sentence, doc, label_score)

def sub_sample(sents_batch, doc_dict, neg_multiplier=2):
  # get labels
  vals = [get_label(x, doc_dict)  for x in sents_batch]

  # construct arrays of sentences, corresponding documents and labels
  sents, docs, y = [], [], []
  for row in vals:
    sents.append(row[0])
    docs.append(row[1])
    y.append(row[2])


  # get balanced number of positive and negative
  sub_df = pd.DataFrame.from_dict({"sents":sents, "docs":docs, "y":y})
  pos_df = sub_df[sub_df.y == 1]
  neg_df = sub_df[sub_df.y == 0]

  print("Negative sample size:", len(neg_df))
  print("Positive sample size:", len(pos_df))

  sub_neg_df = neg_df.sample(len(pos_df)*neg_multiplier, replace=True)
  balanced_df = pos_df.append(sub_neg_df)

  return balanced_df

In [9]:
train_bdf = sub_sample(train_sents_list, train_doc_dict)
test_bdf = sub_sample(test_sents_list, test_doc_dict)

Negative sample size: 58795
Positive sample size: 2154


  balanced_df = pos_df.append(sub_neg_df)


Negative sample size: 7311
Positive sample size: 269


  balanced_df = pos_df.append(sub_neg_df)


In [10]:
sentenc_model_name = "sentence-transformers/paraphrase-MiniLM-L3-v2"
tokenizer = AutoTokenizer.from_pretrained(sentenc_model_name)

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
EPOCHS = 3
LEARNING_RATE = 1e-05

In [12]:
# Create a Data Loader Class
class AmazonReviewData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = str(self.data.iloc[index].sents)
        sentence = " ".join(sentence.split())

        document = str(self.data.iloc[index].docs)
        document = " ".join(document.split())

        inputs = self.tokenizer.batch_encode_plus(
            [sentence, document],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'sent_ids': torch.tensor(ids[0], dtype=torch.long),
            'doc_ids': torch.tensor(ids[1], dtype=torch.long),
            'sent_mask': torch.tensor(mask[0], dtype=torch.long),
            'doc_mask': torch.tensor(mask[1], dtype=torch.long),
            'targets': torch.tensor([self.data.iloc[index].y], dtype=torch.long)
        }

    def __len__(self):
        return self.len



training_set = AmazonReviewData(train_bdf, tokenizer, MAX_LEN)
testing_set = AmazonReviewData(test_bdf, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# 3. Model Training

In [13]:
# get mean pooling for sentence bert models
# ref https://www.sbert.net/examples/applications/computing-embeddings/README.html#sentence-embeddings-with-transformers
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.
# Note that different sentence transformer models may have different in_feature sizes
class SentenceBertClass(torch.nn.Module):
    def __init__(self, model_name="sentence-transformers/paraphrase-MiniLM-L3-v2", in_features=384):
        super(SentenceBertClass, self).__init__()
        self.l1 = AutoModel.from_pretrained(model_name)
        self.pre_classifier = torch.nn.Linear(in_features*3, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)
        self.classifierSigmoid = torch.nn.Sigmoid()

    def forward(self, sent_ids, doc_ids, sent_mask, doc_mask):

        sent_output = self.l1(input_ids=sent_ids, attention_mask=sent_mask)
        sentence_embeddings = mean_pooling(sent_output, sent_mask)

        doc_output = self.l1(input_ids=doc_ids, attention_mask=doc_mask)
        doc_embeddings = mean_pooling(doc_output, doc_mask)

        # elementwise product of sentence embs and doc embs
        combined_features = sentence_embeddings * doc_embeddings

        # Concatenate input features and their elementwise product
        concat_features = torch.cat((sentence_embeddings, doc_embeddings, combined_features), dim=1)

        pooler = self.pre_classifier(concat_features)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.classifierSigmoid(output)

        return output


In [14]:
model = SentenceBertClass(model_name=sentenc_model_name)
model.to(device);

loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

pytorch_model.bin:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

In [19]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model
print_n_steps = 1000
EPOCHS = 20
acc_step_holder, loss_step_holder = [], []


def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        sent_ids = data['sent_ids'].to(device, dtype = torch.long)
        doc_ids = data['doc_ids'].to(device, dtype = torch.long)
        sent_mask = data['sent_mask'].to(device, dtype = torch.long)
        doc_mask = data['doc_mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(sent_ids, doc_ids, sent_mask, doc_mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        n_correct += torch.count_nonzero(targets == (outputs > 0.5)).item()

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%print_n_steps==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(str(_* train_params["batch_size"]) + "/" + str(len(train_df)) + " - Steps. Acc ->", accu_step, "Loss ->", loss_step)
            acc_step_holder.append(accu_step), loss_step_holder.append(loss_step)
        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [20]:
torch.cuda.empty_cache()
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 62.5 Loss -> 0.688361406326294
The Total Accuracy for Epoch 0: 66.92974311358712
Training Loss Epoch: 0.6246822677036323
Training Accuracy Epoch: 66.92974311358712


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 68.75 Loss -> 0.5534501075744629
The Total Accuracy for Epoch 1: 76.6016713091922
Training Loss Epoch: 0.4810905949314042
Training Accuracy Epoch: 76.6016713091922


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 82.8125 Loss -> 0.42569389939308167
The Total Accuracy for Epoch 2: 79.10863509749304
Training Loss Epoch: 0.4395910811306226
Training Accuracy Epoch: 79.10863509749304


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 76.5625 Loss -> 0.4383830428123474
The Total Accuracy for Epoch 3: 80.08356545961003
Training Loss Epoch: 0.42505560682551696
Training Accuracy Epoch: 80.08356545961003


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 76.5625 Loss -> 0.47584739327430725
The Total Accuracy for Epoch 4: 80.90374497059734
Training Loss Epoch: 0.4122382130953345
Training Accuracy Epoch: 80.90374497059734


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 89.0625 Loss -> 0.2853572964668274
The Total Accuracy for Epoch 5: 81.5072732900031
Training Loss Epoch: 0.39553009253917354
Training Accuracy Epoch: 81.5072732900031


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 79.6875 Loss -> 0.32671940326690674
The Total Accuracy for Epoch 6: 82.7143299288146
Training Loss Epoch: 0.37910335695389474
Training Accuracy Epoch: 82.7143299288146


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 71.875 Loss -> 0.48203572630882263
The Total Accuracy for Epoch 7: 83.54998452491489
Training Loss Epoch: 0.3642522180729573
Training Accuracy Epoch: 83.54998452491489


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 81.25 Loss -> 0.3331397771835327
The Total Accuracy for Epoch 8: 85.0974930362117
Training Loss Epoch: 0.3450246751308441
Training Accuracy Epoch: 85.0974930362117


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 79.6875 Loss -> 0.3691016733646393
The Total Accuracy for Epoch 9: 85.73197152584339
Training Loss Epoch: 0.3289942413863569
Training Accuracy Epoch: 85.73197152584339


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 76.5625 Loss -> 0.36919665336608887
The Total Accuracy for Epoch 10: 86.28907458991024
Training Loss Epoch: 0.3079351406581331
Training Accuracy Epoch: 86.28907458991024


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 93.75 Loss -> 0.22321084141731262
The Total Accuracy for Epoch 11: 88.14608480346642
Training Loss Epoch: 0.28226567745798886
Training Accuracy Epoch: 88.14608480346642


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 92.1875 Loss -> 0.23208042979240417
The Total Accuracy for Epoch 12: 89.322191272052
Training Loss Epoch: 0.25790415556714086
Training Accuracy Epoch: 89.322191272052


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 93.75 Loss -> 0.2099505364894867
The Total Accuracy for Epoch 13: 90.90064995357474
Training Loss Epoch: 0.23630074640311818
Training Accuracy Epoch: 90.90064995357474


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 93.75 Loss -> 0.192084401845932
The Total Accuracy for Epoch 14: 91.70535437944909
Training Loss Epoch: 0.2154780300979567
Training Accuracy Epoch: 91.70535437944909


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 95.3125 Loss -> 0.15593311190605164
The Total Accuracy for Epoch 15: 93.0052615289384
Training Loss Epoch: 0.19299974026951458
Training Accuracy Epoch: 93.0052615289384


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 89.0625 Loss -> 0.205739825963974
The Total Accuracy for Epoch 16: 93.51593933766635
Training Loss Epoch: 0.1763280515771101
Training Accuracy Epoch: 93.51593933766635


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 95.3125 Loss -> 0.1596578061580658
The Total Accuracy for Epoch 17: 94.53729495512225
Training Loss Epoch: 0.15621888272390508
Training Accuracy Epoch: 94.53729495512225


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 87.5 Loss -> 0.27288341522216797
The Total Accuracy for Epoch 18: 94.97059733828536
Training Loss Epoch: 0.14388973295393556
Training Accuracy Epoch: 94.97059733828536


0it [00:00, ?it/s]

0/11095 - Steps. Acc -> 98.4375 Loss -> 0.06788018345832825
The Total Accuracy for Epoch 19: 95.35747446610957
Training Loss Epoch: 0.13018875620742837
Training Accuracy Epoch: 95.35747446610957


# 4. Evaluation

## 4.1 Generate Results
We first generate the results of our model for each of our datasets

In [16]:
def get_tokens(text, tokenizer):
  inputs = tokenizer.batch_encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
  ids = inputs['input_ids']
  mask = inputs['attention_mask']
  return ids, mask

In [17]:
def predict(model,sents, doc):
  sent_id, sent_mask = get_tokens(sents,tokenizer)
  sent_id, sent_mask = torch.tensor(sent_id, dtype=torch.long).to(device) ,torch.tensor(sent_mask, dtype=torch.long).to(device)

  doc_id, doc_mask = get_tokens([doc],tokenizer)
  doc_id, doc_mask = doc_id * len(sents), doc_mask* len(sents)
  doc_id, doc_mask = torch.tensor(doc_id, dtype=torch.long).to(device) ,torch.tensor(doc_mask, dtype=torch.long).to(device)

  preds = model(sent_id, doc_id, sent_mask, doc_mask)
  return preds

In [21]:
def summarize(doc, model, min_sentence_length=0, top_k=3, batch_size=64):
  doc = doc.replace("\n","")
  doc_sentences = []
  for sent in nlp(doc).sents:
    if len(sent) > min_sentence_length:
      doc_sentences.append(str(sent))

  doc_id, doc_mask = get_tokens([doc],tokenizer)
  doc_id, doc_mask = doc_id * batch_size, doc_mask* batch_size
  doc_id, doc_mask = torch.tensor(doc_id, dtype=torch.long).to(device), torch.tensor(doc_mask, dtype=torch.long).to(device)

  scores = []
  # run predictions using some batch size
  for i in range(int(len(doc_sentences) / batch_size) + 1):
    batch_start = i*batch_size
    batch_end = (i+1) * batch_size if (i+1) * batch_size < len(doc) else len(doc)-1
    batch = doc_sentences[batch_start: batch_end]
    if batch:
      preds = predict(model, batch, doc)
      scores = scores + preds.tolist()

  sent_pred_list = [{"sentence": doc_sentences[i], "score": scores[i][0], "index":i} for i in range(len(doc_sentences))]
  sorted_sentences = sorted(sent_pred_list, key=lambda k: k['score'], reverse=True)

  sorted_result = sorted_sentences[:top_k]
  sorted_result = sorted(sorted_result, key=lambda k: k['index'])

  summary = [ x["sentence"] for x in sorted_result]
  summary = " ".join(summary)

  return summary, scores, doc_sentences

In [23]:
# Example text for summarization
example_text = train_df.iloc[10]
# Generate summary
summary, _, _ = summarize(example_text['reviewText'], model, min_sentence_length=14, top_k=3, batch_size=16)

# Print the results
print("Original Text:")
print(example_text['reviewText'])
print("\nGenerated Summary:")
print(summary)
print("\nActual Summary:")
print(example_text['summary'])

Original Text:
I've had these shoes for about a week now and have so far enjoyed using them. Considering the fact that I have wide feet, the shoes are slightly tight. However, it doesn't feel uncomfortable nor does it bothers me as I use them throughout my workouts. I know some people personally like when the shoes are a bit tighter or a bit looser so it's all in personal preference.

Generated Summary:
Considering the fact that I have wide feet, the shoes are slightly tight. However, it doesn't feel uncomfortable nor does it bothers me as I use them throughout my workouts. I know some people personally like when the shoes are a bit tighter or a bit looser so it's all in personal preference.

Actual Summary:
Wide Feet so Somewhat Tight


In [41]:
def create_summary(x):
  summary, _, _ = summarize(x, model, min_sentence_length=1, top_k=1, batch_size=16)
  return summary

In [42]:
train_predictions = train_df['reviewText'].apply(create_summary)
dev_predictions = dev_df['reviewText'].apply(create_summary)
test_predictions = test_df['reviewText'].apply(create_summary)

In [43]:
train_references = train_df['summary']
dev_references = dev_df['summary']
test_references = test_df['summary']

In [44]:
torch.cuda.empty_cache()

## 5.2 Evaluation Metrics
We will use both rouge_score and bert_score for our evaluation metrics. We will specifically be looking at the averaged f1 scores of both metrics.

In [45]:
def rouge_score(prediction, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(prediction, reference)
    return scores

In [46]:
def evaluate_rouge(predictions, references):
  rouge1_f = 0
  rouge2_f = 0
  rougeL_f = 0

  num_reviews = len(predictions)
  for pred, actual in zip(predictions, references):
    scores = rouge_score(pred, actual)
    rouge1_f += scores['rouge1'][2]
    rouge2_f += scores['rouge2'][2]
    rougeL_f += scores['rougeL'][2]

  rouge1_f = rouge1_f / num_reviews
  rouge2_f = rouge2_f / num_reviews
  rougeL_f = rougeL_f / num_reviews

  return (rouge1_f, rouge2_f, rougeL_f)

In [47]:
bertscore = load("bertscore")
def evaluate_bert(predictions, references):
  results = bertscore.compute(predictions=predictions,
                              references=references,
                              rescale_with_baseline=True,
                              lang='en')
  f1_scores = results['f1']
  if len(f1_scores) == 0:
    return None

  return sum(f1_scores) / len(f1_scores)


## 5.3 Evaluate Results

In [48]:
torch.cuda.empty_cache()

In [49]:
print("Training Eval")
rouge1_f, rouge2_f, rougeL_f = evaluate_rouge(train_predictions, train_references)
bert_f = evaluate_bert(train_predictions, train_references)
print("ROUGE-1 F-Score: ", rouge1_f)
print("ROUGE-2 F-Score: ", rouge2_f)
print("ROUGE-L F-Score: ", rougeL_f)
print("Bert F-Score: ", bert_f)
print("-------------")
print("Dev Eval")
rouge1_f, rouge2_f, rougeL_f = evaluate_rouge(dev_predictions, dev_references)
bert_f = evaluate_bert(dev_predictions, dev_references)
print("ROUGE-1 F-Score: ", rouge1_f)
print("ROUGE-2 F-Score: ", rouge2_f)
print("ROUGE-L F-Score: ", rougeL_f)
print("Bert F-Score: ", bert_f)
print("-------------")
print("Test Eval")
rouge1_f, rouge2_f, rougeL_f = evaluate_rouge(test_predictions, test_references)
bert_f = evaluate_bert(test_predictions, test_references)
print("ROUGE-1 F-Score: ", rouge1_f)
print("ROUGE-2 F-Score: ", rouge2_f)
print("ROUGE-L F-Score: ", rougeL_f)
print("Bert F-Score: ", bert_f)

Training Eval


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE-1 F-Score:  0.20594670308835092
ROUGE-2 F-Score:  0.12750313504513677
ROUGE-L F-Score:  0.19788807412619722
Bert F-Score:  0.15738695461672306
-------------
Dev Eval




ROUGE-1 F-Score:  0.2035273180846394
ROUGE-2 F-Score:  0.1202413029156827
ROUGE-L F-Score:  0.196933169098767
Bert F-Score:  0.16379498507151102
-------------
Test Eval




ROUGE-1 F-Score:  0.18988167126125058
ROUGE-2 F-Score:  0.10724853493350751
ROUGE-L F-Score:  0.18095755342090633
Bert F-Score:  0.13949474125972697


## Write our results to a file

In [50]:
def write_to_file(data, file_name):
    with open(file_name, 'w') as txtfile:
        for row in data:
            txtfile.write(str(row) + '\n')

In [51]:
write_to_file(train_predictions, 'extractive_train_pred.txt')
write_to_file(train_references, 'extractive_train_ref.txt')

In [52]:
write_to_file(dev_predictions, 'extractive_dev_pred.txt')
write_to_file(dev_references, 'extractive_dev_ref.txt')

In [53]:
write_to_file(test_predictions, 'extractive_test_pred.txt')
write_to_file(test_references, 'extractive_test_ref.txt')