# Abstractive summarization
### Method 1 - Model Evaluation (src/evaluation.ipynb)
Performance metrics – ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

Implemented work:
- Load fine-tuned transformer
    - From saved model 
 
- OOP implementation of Dataset 
    - Feature, Target
    - Tokenize
    - Padding, Truncate
    - Convert to Tensor
    - Pass to: DataLoader – with batch size

- Evaluate Model
    - Set model to evaluation mode
    - Load ROUGE metric
    - Loop through batches in dataloader
    - Move data to device
    - Generate summaries
    - Decode predictions and labels
    - Add to ROUGE metric
    - Compute ROUGE scores

- Evaluate model on validation dataset
    - Print results


## Observations:

The trained model from method 1 was not used for deployment:

(Trained model from method 2 was used for deployment)

Reason:
- Even though the model reached a very low training loss, it performed inconsistently in the validation and testing phases.
- A tensor error is suspected to have occurred while training with method 1, which could explain the inconsistency of the model's output.

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer

# OOP implementation of Dataset 
class SummarizationDataset(Dataset):
    """Map-style dataset that tokenizes (text, summary) pairs read from a CSV file.

    The first CSV column is used as the source text (feature) and the second
    column as the reference summary (target). Every item is tokenized on
    access, padded/truncated to a fixed length and returned as 1-D tensors.

    Args:
        file_path: Path of the CSV file; it is loaded eagerly with
            ``pandas.read_csv``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` and returns a mapping containing
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Fixed token length the source text is padded/truncated to.
        target_max_length: Fixed token length the summary is padded/truncated
            to. Defaults to ``max_length`` when omitted, which keeps the
            original behaviour.
    """

    def __init__(self, file_path, tokenizer, max_length=512, target_max_length=None):
        self.dataset = pd.read_csv(file_path)  # whole CSV kept in memory
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.target_max_length = (
            max_length if target_max_length is None else target_max_length
        )

    def __len__(self):
        """Return the number of rows in the CSV file."""
        return len(self.dataset)

    @staticmethod
    def _as_text(value):
        """Return a CSV cell as a string, mapping missing values to ``""``."""
        # Empty CSV cells come back from pandas as NaN (a float), which the
        # tokenizer cannot encode; non-string cells are stringified as well.
        return "" if pd.isna(value) else str(value)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',  # pad short sequences up to max_length
            truncation=True,  # cut long sequences down to max_length
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tokenized example at row ``idx``.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` for the source
            text and ``labels`` holding the token ids of the summary, each
            flattened to a one-dimensional tensor.
        """
        text = self._as_text(self.dataset.iloc[idx, 0])  # feature
        summary = self._as_text(self.dataset.iloc[idx, 1])  # target

        inputs = self._encode(text, self.max_length)
        targets = self._encode(summary, self.target_max_length)

        # The tokenizer returns tensors with a leading batch dimension of one;
        # flatten them so the DataLoader can stack items into a batch.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten(),
        }

# Tokenizer of the base BART-large checkpoint, used to encode the validation CSV
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Wrap the validation split in the Dataset class, then serve it in
# fixed-order batches of four examples
val_dataset = SummarizationDataset(
    file_path='/home/mohan/infy/data/merged/final/validation.csv',
    tokenizer=tokenizer,
)
val_loader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=False)

In [2]:
import torch
from transformers import BartForConditionalGeneration

# Choose the compute device: the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Restore the tokenizer and the fine-tuned BART model from the same directory
model_path = '/home/mohan/infy/models/fine_tuned_bart'
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# Kept as the last expression of the cell: moves the model to the chosen
# device and lets the notebook display the module structure
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_l

In [3]:
from datasets import load_metric

def evaluate_model(model, dataloader, tokenizer, *, max_length=150, min_length=40):
    """Generate a summary for every example and score it against its reference with ROUGE.

    Args:
        model: Sequence-to-sequence model providing ``eval()``, ``parameters()``
            and ``generate()`` (e.g. ``BartForConditionalGeneration``).
        dataloader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        tokenizer: Tokenizer used to decode generated ids and label ids back
            to text via ``batch_decode``.
        max_length: Maximum length passed to ``model.generate``.
        min_length: Minimum length passed to ``model.generate``.

    Returns:
        The value returned by the ROUGE metric's ``compute()`` after all
        batches have been added.
    """
    model.eval()  # set model to evaluation mode

    # ROUGE metric from Hugging Face `datasets`. `trust_remote_code=True` is the
    # opt-in that the library's own warning asks for when loading this metric.
    rouge = load_metric('rouge', trust_remote_code=True)

    # Run on whatever device the model already lives on instead of relying on
    # a module-level `device` variable being defined by an earlier cell.
    device = next(model.parameters()).device
    pad_token_id = tokenizer.pad_token_id

    # Loop through batches
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)  # features
        attention_mask = batch['attention_mask'].to(device)  # padding mask
        # Labels are only decoded to text, so they do not need to be moved
        # to the model's device.
        labels = batch['labels']

        with torch.no_grad():  # no backpropagation needed
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,  # maximum length of the summarized text
                min_length=min_length,  # minimum length of the summarized text
            )

        # Labels masked with -100 (the usual loss-ignore index) are not valid
        # token ids; map them to the pad token so decoding skips them.
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == -100, pad_token_id)

        # Decode the generated summaries and labels to human-readable text
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Add the decoded predictions and labels to the ROUGE metric
        rouge.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute the final ROUGE scores over everything that was added
    return rouge.compute()

# Score the model on the validation loader and show the ROUGE results
results = evaluate_model(
    model=model,
    dataloader=val_loader,
    tokenizer=tokenizer,
)
print(results)


  rouge = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'rouge1': AggregateScore(low=Score(precision=0.00012757136022962845, recall=4.933423696380165e-05, fmeasure=6.732234645068214e-05), mid=Score(precision=0.00030696858555254343, recall=0.00016308813027728812, fmeasure=0.00018227662111640694), high=Score(precision=0.0005062988359113381, recall=0.0003339877030881791, fmeasure=0.00031219502877184973)), 'rouge2': AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.0, recall=0.0, fmeasure=0.0), high=Score(precision=0.0, recall=0.0, fmeasure=0.0)), 'rougeL': AggregateScore(low=Score(precision=0.00012747169510444914, recall=5.11262189764358e-05, fmeasure=6.883905408089956e-05), mid=Score(precision=0.00029500877053101576, recall=0.00013774946674766604, fmeasure=0.00016051378457393505), high=Score(precision=0.0004903524158826344, recall=0.0002557525156737812, fmeasure=0.0002807499266670898)), 'rougeLsum': AggregateScore(low=Score(precision=0.00011561154520810078, recall=5.25348883052483e-05, fmeasure=6.754474

In [10]:
# Tabulate the raw ROUGE result mapping (one column per metric) and display it
df_1 = pd.DataFrame(data=results)

df_1

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
0,"(0.00012757136022962845, 4.933423696380165e-05...","(0.0, 0.0, 0.0)","(0.00012747169510444914, 5.11262189764358e-05,...","(0.00011561154520810078, 5.25348883052483e-05,..."
1,"(0.00030696858555254343, 0.0001630881302772881...","(0.0, 0.0, 0.0)","(0.00029500877053101576, 0.0001377494667476660...","(0.00029301546802742783, 0.000141219374677299,..."
2,"(0.0005062988359113381, 0.0003339877030881791,...","(0.0, 0.0, 0.0)","(0.0004903524158826344, 0.0002557525156737812,...","(0.0005063985010365173, 0.0002518490193267542,..."


In [11]:

# Column headers of the summary table, in display order
column_names = [
    "Score Type",
    "ROUGE-1 Precision",
    "ROUGE-1 Recall",
    "ROUGE-1 F-measure",
    "ROUGE-2 Precision",
    "ROUGE-2 Recall",
    "ROUGE-2 F-measure",
    "ROUGE-L Precision",
    "ROUGE-L Recall",
    "ROUGE-L F-measure",
    "ROUGE-Lsum Precision",
    "ROUGE-Lsum Recall",
    "ROUGE-Lsum F-measure",
]

# One row per aggregate bound (low / mid / high); values follow column_names
score_rows = [
    [
        "Low",
        0.00012757136022962845, 4.933423696380165e-05, 6.732234645068214e-05,
        0.0, 0.0, 0.0,
        0.00012747169510444914, 5.11262189764358e-05, 6.883905408089956e-05,
        0.00011561154520810078, 5.25348883052483e-05, 6.754474666699242e-05,
    ],
    [
        "Mid",
        0.00030696858555254343, 0.00016308813027728812, 0.00018227662111640694,
        0.0, 0.0, 0.0,
        0.00029500877053101576, 0.00013774946674766604, 0.00016051378457393505,
        0.00029301546802742783, 0.000141219374677299, 0.0001637905834947893,
    ],
    [
        "High",
        0.0005062988359113381, 0.0003339877030881791, 0.00031219502877184973,
        0.0, 0.0, 0.0,
        0.0004903524158826344, 0.0002557525156737812, 0.0002807499266670898,
        0.0005063985010365173, 0.0002518490193267542, 0.00028050684026071287,
    ],
]

# Transpose the rows into the column-oriented mapping: header -> [low, mid, high]
data = {
    header: list(column)
    for header, column in zip(column_names, zip(*score_rows))
}

df2 = pd.DataFrame(data)

df2

Unnamed: 0,Score Type,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-1 F-measure,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-2 F-measure,ROUGE-L Precision,ROUGE-L Recall,ROUGE-L F-measure,ROUGE-Lsum Precision,ROUGE-Lsum Recall,ROUGE-Lsum F-measure
0,Low,0.000128,4.9e-05,6.7e-05,0.0,0.0,0.0,0.000127,5.1e-05,6.9e-05,0.000116,5.3e-05,6.8e-05
1,Mid,0.000307,0.000163,0.000182,0.0,0.0,0.0,0.000295,0.000138,0.000161,0.000293,0.000141,0.000164
2,High,0.000506,0.000334,0.000312,0.0,0.0,0.0,0.00049,0.000256,0.000281,0.000506,0.000252,0.000281
