In [10]:
import glob
import os
import warnings

import kagglehub
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from nltk.tokenize import sent_tokenize
from nltk.translate import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

# NLTK data used later in the notebook: sentence splitting (sent_tokenize) and
# WordNet (METEOR). 'punkt_tab' is the resource name newer NLTK releases look up
# for sentence splitting, and some releases also want 'omw-1.4' next to
# 'wordnet'. Requesting a resource that is already present or not needed does
# not raise; nltk.download() simply reports the outcome via its return value.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

# Configure pandas display: no column-width or row-count limit; the line width
# is left for pandas to decide (None).
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

# Set device: use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Check BERTScore: it is an optional dependency, so its availability is recorded
# in a flag that the metric code consults instead of failing at import time.
try:
    from bert_score import score as bertscore_fn
    BERTSCORE_AVAILABLE = True
    print("‚úÖ BERTScore available")
except ImportError:
    BERTSCORE_AVAILABLE = False
    print("‚ö†Ô∏è  BERTScore not available")

Using device: cuda
‚ö†Ô∏è  BERTScore not available


In [11]:
# ============================================================================
# STEP 1: Copy Essential Classes from Training Script
# ============================================================================

class SentenceEncoder(nn.Module):
    """Frozen transformer encoder that maps sentences to fixed-size vectors.

    Sentences are tokenized, run through the pretrained model without gradient
    tracking, and the token embeddings are averaged over the non-padding tokens
    (mean pooling).

    Args:
        modelname: Model identifier passed to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.
        hiddendim: Embedding width exposed as ``self.hiddendim``. It is stored as
            given and is not checked against the loaded model.
    """

    def __init__(self, modelname="law-ai/InLegalBERT", hiddendim=768):
        super(SentenceEncoder, self).__init__()
        print(f"Loading {modelname}...")
        self.tokenizer = AutoTokenizer.from_pretrained(modelname)
        self.model = AutoModel.from_pretrained(modelname)
        self.hiddendim = hiddendim

        # The encoder is only a feature extractor here: freeze every weight.
        for param in self.model.parameters():
            param.requires_grad = False

        print(f"{modelname} loaded successfully")

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings over the positions where the mask is 1.

        Args:
            model_output: Indexable model result whose element ``0`` holds the
                token embeddings, shaped ``(batch, tokens, dim)``.
            attention_mask: ``(batch, tokens)`` tensor, 1 for real tokens and 0
                for padding.

        Returns:
            ``(batch, dim)`` tensor of masked means. The divisor is clamped to
            ``1e-9`` so a row with no real tokens yields zeros instead of NaN.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def forward(self, sentences):
        """Embed a list of sentences.

        Args:
            sentences: List of strings. Each one is truncated to 512 tokens and
                the batch is padded to its longest member.

        Returns:
            ``(len(sentences), dim)`` tensor on the same device as the model.
        """
        encoded = self.tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors='pt')

        # Send the inputs to wherever the model's weights actually are, rather
        # than to a notebook-level global that may not match after a .to() call.
        target_device = next(self.model.parameters()).device
        encoded = {k: v.to(target_device) for k, v in encoded.items()}

        with torch.no_grad():
            model_output = self.model(**encoded)

        embeddings = self.mean_pooling(model_output, encoded['attention_mask'])
        return embeddings


class MultiAspectPolicyNetwork(nn.Module):
    """Scores every sentence of a document for one summary aspect.

    A shared two-layer bidirectional LSTM contextualizes the sentence
    embeddings, an aspect-specific query attends over the LSTM states, and a
    per-aspect MLP head maps each sentence (LSTM state + attention result,
    position embedding, aspect embedding) to a single logit.

    Attribute names use underscores because they are part of the state-dict keys
    of the saved checkpoint; renaming any of them breaks ``load_state_dict``.

    Args:
        input_dim: Width of the incoming sentence embeddings.
        hidden_dim: LSTM hidden size per direction (outputs are ``2 * hidden_dim``).
        num_aspects: Number of rows in the aspect embedding table. The aspect
            names themselves are the fixed five-element list in ``self.aspects``.
        dropout: Dropout rate for the LSTM, the attention and the heads.
    """

    def __init__(self, input_dim=768, hidden_dim=256, num_aspects=5, dropout=0.5):
        super(MultiAspectPolicyNetwork, self).__init__()

        self.num_aspects = num_aspects
        self.aspects = ['facts', 'analysis', 'argument', 'judgement', 'statute']
        self.hidden_dim = hidden_dim

        # NOTE: keep these UNDERSCORE names (not camelCase) to match the saved model.
        self.shared_lstm = nn.LSTM(input_dim, hidden_dim, num_layers=2,
                                   bidirectional=True, batch_first=True, dropout=dropout)

        self.position_embedding = nn.Embedding(2000, 64)
        self.aspect_embedding = nn.Embedding(num_aspects, hidden_dim * 2)

        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_dim * 2,
            num_heads=4,
            dropout=dropout,
            batch_first=True
        )

        # One scoring head per aspect, keyed by aspect name. Input width is
        # LSTM/attention state (2H) + position embedding (64) + aspect embedding (2H).
        self.aspect_heads = nn.ModuleDict()
        for aspect in self.aspects:
            self.aspect_heads[aspect] = nn.Sequential(
                nn.Linear(hidden_dim * 2 + 64 + hidden_dim * 2, 512),
                nn.LayerNorm(512),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(512, 256),
                nn.LayerNorm(256),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(256, 1)
            )

    def forward(self, sentence_embeddings, positions, aspect_idx):
        """Return one logit per sentence for the requested aspect.

        Args:
            sentence_embeddings: ``(batch, num_sentences, input_dim)`` tensor.
            positions: Integer tensor of sentence positions, shaped
                ``(batch, num_sentences)`` or ``(1, num_sentences)``; values must
                be valid rows of the 2000-entry position table.
            aspect_idx: Integer index into ``self.aspects``.

        Returns:
            ``(batch, num_sentences)`` tensor of logits.
        """
        lstm_out, _ = self.shared_lstm(sentence_embeddings)
        batch_size, num_sentences = lstm_out.size(0), lstm_out.size(1)

        # Look the aspect vector up once, on the same device as the activations,
        # and broadcast it over every sentence of every document in the batch.
        aspect_id = torch.tensor([aspect_idx], device=lstm_out.device)
        aspect_emb = self.aspect_embedding(aspect_id)
        aspect_emb = aspect_emb.unsqueeze(1).expand(batch_size, num_sentences, -1)

        # The aspect vector is the attention query; LSTM states are keys/values.
        attended_out, _ = self.attention(aspect_emb, lstm_out, lstm_out)
        combined_lstm = lstm_out + attended_out

        pos_emb = self.position_embedding(positions)
        if pos_emb.size(0) != batch_size:
            # A single shared row of positions is broadcast over the batch.
            pos_emb = pos_emb.expand(batch_size, -1, -1)

        combined = torch.cat([combined_lstm, pos_emb, aspect_emb], dim=-1)

        aspect_name = self.aspects[aspect_idx]
        logits = self.aspect_heads[aspect_name](combined).squeeze(-1)

        return logits

print("‚úÖ Policy Network defined with CORRECT naming (underscores)")
class UnsupervisedRLAgent:
    """Inference wrapper that turns a judgment text into per-aspect extractive summaries.

    The document is split into sentences, each sentence is embedded with
    ``encoder``, ``policy`` scores every sentence for every aspect, and the
    top-scoring sentences are returned in document order.

    Args:
        encoder: Module that maps a list of sentences to a 2-D tensor of
            embeddings and exposes ``hiddendim``.
        policy: Module called as ``policy(embeddings, positions, aspect_idx)``
            that returns per-sentence logits.
    """

    def __init__(self, encoder, policy):
        # Remember the device the modules are moved to, so later calls do not
        # depend on the notebook-level global staying unchanged.
        self.device = device
        self.encoder = encoder.to(self.device)
        self.policy = policy.to(self.device)
        self.aspects = ['facts', 'analysis', 'argument', 'judgement', 'statute']
        # Fraction of the document's sentences selected for each aspect.
        self.aspectsummaryratios = {
            'facts': 0.12,
            'analysis': 0.12,
            'argument': 0.08,
            'judgement': 0.06,
            'statute': 0.08
        }
        self.minsummarysentences = 3
        self.maxdocumentsentences = 500

    def preprocess_document(self, judgmenttext):
        """Split the text into sentences, keeping those longer than five words.

        At most ``self.maxdocumentsentences`` sentences are kept; a message is
        printed when the document is cut.
        """
        sentences = sent_tokenize(judgmenttext)
        sentences = [s.strip() for s in sentences if len(s.strip().split()) > 5]

        if len(sentences) > self.maxdocumentsentences:
            print(f"Truncating {len(sentences)} to {self.maxdocumentsentences} sentences")
            sentences = sentences[:self.maxdocumentsentences]

        return sentences

    def encode_sentences(self, sentences):
        """Embed sentences in batches of 16.

        Returns:
            ``(len(sentences), dim)`` tensor. For an empty list a single
            all-zero row of width ``encoder.hiddendim`` is returned.
        """
        if len(sentences) == 0:
            return torch.zeros(1, self.encoder.hiddendim, device=self.device)

        batchsize = 16
        embeddings = [
            self.encoder(sentences[start:start + batchsize])
            for start in range(0, len(sentences), batchsize)
        ]
        return torch.cat(embeddings, dim=0)

    def generate_summaries(self, judgment):
        """Build one extractive summary per aspect.

        Args:
            judgment: Full text of the judgment.

        Returns:
            Dict mapping each aspect name to a string of selected sentences
            joined by spaces, in their original order. When the document has
            fewer than ``self.minsummarysentences`` usable sentences, every
            aspect gets all of them.
        """
        self.policy.eval()

        with torch.no_grad():
            sentences = self.preprocess_document(judgment)
            num_sentences = len(sentences)

            if num_sentences < self.minsummarysentences:
                return {aspect: " ".join(sentences) for aspect in self.aspects}

            sentenceembeddings = self.encode_sentences(sentences).unsqueeze(0)

            # One position id per sentence. Ids beyond the policy's position
            # table are clamped to its last row, so the position tensor always
            # has exactly one entry per sentence.
            pos_table = getattr(self.policy, 'position_embedding', None)
            table_size = pos_table.num_embeddings if pos_table is not None else 2000
            positions = torch.arange(num_sentences, device=sentenceembeddings.device)
            positions = positions.clamp(max=table_size - 1).unsqueeze(0)

            summaries = {}

            for aspectidx, aspect in enumerate(self.aspects):
                logits = self.policy(sentenceembeddings, positions, aspectidx).squeeze(0)
                wanted = int(num_sentences * self.aspectsummaryratios[aspect])
                # At least the configured minimum, never more than what exists.
                numselect = min(max(self.minsummarysentences, wanted), num_sentences)
                # Restore document order after picking the highest-scoring sentences.
                chosen = sorted(torch.topk(logits, k=numselect).indices.tolist())
                summaries[aspect] = " ".join(sentences[i] for i in chosen)

            return summaries


‚úÖ Policy Network defined with CORRECT naming (underscores)


In [12]:
# ============================================================================
# STEP 2: Metrics Evaluator Class (NEW - For Reference-Based Evaluation)
# ============================================================================

class MetricsEvaluator:
    """Compute ROUGE, BLEU, METEOR and (when installed) BERTScore for a summary.

    BLEU and METEOR operate on whitespace-split tokens. If one of the optional
    metric families fails, its values are recorded as 0.0 and a warning is
    emitted so the failure is visible instead of silently lowering averages.
    """

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.smoothing = SmoothingFunction().method1

    @staticmethod
    def _warn_metric_failure(metric_name, exc):
        """Report a metric that fell back to 0.0 (attributed to the evaluator's caller)."""
        warnings.warn(
            f"{metric_name} could not be computed ({exc!r}); recording 0.0",
            RuntimeWarning,
            stacklevel=3,
        )

    def compute_all_metrics(self, generated_summary, reference_summary, source_document):
        """Compute all metrics for a generated summary against its reference.

        Args:
            generated_summary: Summary text produced by the model.
            reference_summary: Reference summary text.
            source_document: Full source text; used only for length statistics.

        Returns:
            Dict of floats/ints with the keys ``rouge{1,2,L}_{precision,recall,
            fmeasure}``, ``bleu1``, ``bleu2``, ``bleu4``, ``meteor``,
            ``bertscore_{precision,recall,f1}`` (only when BERTScore is
            available), ``generated_length``, ``reference_length``,
            ``source_length``, ``length_ratio`` and ``compression_ratio``.
        """
        metrics = {}

        # ROUGE scores: the scorer takes (target, prediction).
        rouge_scores = self.rouge_scorer.score(reference_summary, generated_summary)
        for rouge_name in ('rouge1', 'rouge2', 'rougeL'):
            for stat in ('precision', 'recall', 'fmeasure'):
                metrics[f'{rouge_name}_{stat}'] = getattr(rouge_scores[rouge_name], stat)

        # BLEU scores on whitespace tokens, with cumulative n-gram weights.
        reference_tokens = reference_summary.split()
        generated_tokens = generated_summary.split()
        bleu_weights = {
            'bleu1': (1, 0, 0, 0),
            'bleu2': (0.5, 0.5, 0, 0),
            'bleu4': (0.25, 0.25, 0.25, 0.25),
        }

        try:
            for bleu_name, weights in bleu_weights.items():
                metrics[bleu_name] = sentence_bleu(
                    [reference_tokens], generated_tokens,
                    weights=weights, smoothing_function=self.smoothing
                )
        except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
            self._warn_metric_failure('BLEU', exc)
            # A failure in any variant zeroes the whole family.
            for bleu_name in bleu_weights:
                metrics[bleu_name] = 0.0

        # METEOR score
        try:
            metrics['meteor'] = meteor_score.meteor_score([reference_tokens], generated_tokens)
        except Exception as exc:
            self._warn_metric_failure('METEOR', exc)
            metrics['meteor'] = 0.0

        # BERTScore (optional dependency)
        if BERTSCORE_AVAILABLE:
            try:
                P, R, F1 = bertscore_fn([generated_summary], [reference_summary], lang='en', rescale_with_baseline=True)
                metrics['bertscore_precision'] = P.item()
                metrics['bertscore_recall'] = R.item()
                metrics['bertscore_f1'] = F1.item()
            except Exception as exc:
                self._warn_metric_failure('BERTScore', exc)
                metrics['bertscore_precision'] = 0.0
                metrics['bertscore_recall'] = 0.0
                metrics['bertscore_f1'] = 0.0

        # Length statistics (word counts); max(..., 1) guards against empty texts.
        metrics['generated_length'] = len(generated_tokens)
        metrics['reference_length'] = len(reference_tokens)
        metrics['source_length'] = len(source_document.split())
        metrics['length_ratio'] = metrics['generated_length'] / max(metrics['reference_length'], 1)
        metrics['compression_ratio'] = metrics['generated_length'] / max(metrics['source_length'], 1)

        return metrics


üìä LOADING VALIDATION DATASET

Judgment directory: /kaggle/input/summaries/IN-Ext/judgement
Summary base directory: /kaggle/input/summaries/IN-Ext/summary/segment-wise/A2



NameError: name 'MultiAspectLegalDataset' is not defined

In [15]:
# ============================================================================
# STEP 3: Dataset Class
# ============================================================================

class MultiAspectLegalDataset(Dataset):
    """Judgments paired with their per-aspect reference summaries.

    Judgments are the ``*.txt`` files in ``judgment_dir``; the summaries for an
    aspect are the ``*.txt`` files in ``summary_base_dir/<aspect>``. A summary is
    attached to the judgment that has the same file name, so an aspect with
    missing files cannot shift summaries onto the wrong judgment. Only when none
    of an aspect's file names match any judgment does the loader fall back to
    pairing by sorted position.

    Each item is a dict with the keys ``'judgment'`` (text), ``'summaries'``
    (aspect -> text, or ``None`` when missing or not longer than 10 words) and
    ``'judgment_file'`` (base name). Judgments that are empty or have no valid
    summary at all are skipped.

    Args:
        judgment_dir: Directory holding the judgment text files.
        summary_base_dir: Directory holding one sub-directory per aspect.
    """

    def __init__(self, judgment_dir, summary_base_dir):
        self.aspects = ['facts', 'analysis', 'argument', 'judgement', 'statute']
        self.judgment_files = sorted(glob.glob(os.path.join(judgment_dir, '*.txt')))

        print(f"üìÅ Found {len(self.judgment_files)} judgments")

        judgment_names = [os.path.basename(jf) for jf in self.judgment_files]

        self.aspect_summary_files = {}
        summary_lookup = {}  # aspect -> {judgment file name -> summary path}
        for aspect in self.aspects:
            aspect_path = os.path.join(summary_base_dir, aspect)
            files = sorted(glob.glob(os.path.join(aspect_path, '*.txt')))
            self.aspect_summary_files[aspect] = files
            print(f"   {aspect:12s}: {len(files)} summaries")
            summary_lookup[aspect] = self._index_summaries(judgment_names, files)

        self.data = []
        for jf, name in zip(self.judgment_files, judgment_names):
            judgment = self._read_text(jf)
            if not judgment:
                continue

            aspect_summaries = {}
            for aspect in self.aspects:
                summary_file = summary_lookup[aspect].get(name)
                summary = self._read_text(summary_file) if summary_file else ''
                # Very short summaries (10 words or fewer) count as missing.
                aspect_summaries[aspect] = summary if len(summary.split()) > 10 else None

            if any(text is not None for text in aspect_summaries.values()):
                self.data.append({
                    'judgment': judgment,
                    'summaries': aspect_summaries,
                    'judgment_file': name
                })

        print(f"\nüìä Total valid samples loaded: {len(self.data)}")
        for aspect in self.aspects:
            valid_count = sum(1 for item in self.data if item['summaries'][aspect] is not None)
            print(f"   {aspect:12s}: {valid_count} valid summaries")

    @staticmethod
    def _read_text(path):
        """Return the stripped contents of a UTF-8 text file, ignoring undecodable bytes."""
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read().strip()

    @staticmethod
    def _index_summaries(judgment_names, summary_files):
        """Map judgment file names to the summary path that belongs to them."""
        by_name = {os.path.basename(path): path for path in summary_files}
        if summary_files and not any(name in by_name for name in judgment_names):
            # No shared naming scheme at all: keep the legacy pairing by sorted
            # position, which is only reliable when no file is missing.
            print("      (file names do not match the judgments; pairing by sorted position)")
            return dict(zip(judgment_names, summary_files))
        return by_name

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


In [17]:
# ============================================================================
# STEP 4: Load Validation Dataset with Correct Paths
# ============================================================================
path = "/kaggle/input/summaries"

print("\n" + "="*70)
print("üìä LOADING VALIDATION DATASET")
print("="*70 + "\n")

# Judgments and the segment-wise summaries both live under <path>/IN-Ext.
in_ext_root = os.path.join(path, 'IN-Ext')
judgment_dir = os.path.join(in_ext_root, 'judgement')
summary_base_dir = os.path.join(in_ext_root, 'summary', 'segment-wise', 'A2')

print(f"Judgment directory: {judgment_dir}")
print(f"Summary base directory: {summary_base_dir}\n")

# Stop early with a clear message if either location is missing.
judgment_dir_present = os.path.exists(judgment_dir)
if not judgment_dir_present:
    raise FileNotFoundError(f"Judgment directory not found: {judgment_dir}")

summary_dir_present = os.path.exists(summary_base_dir)
if not summary_dir_present:
    raise FileNotFoundError(f"Summary directory not found: {summary_base_dir}")

validation_dataset = MultiAspectLegalDataset(judgment_dir, summary_base_dir)



üìä LOADING VALIDATION DATASET

Judgment directory: /kaggle/input/summaries/IN-Ext/judgement
Summary base directory: /kaggle/input/summaries/IN-Ext/summary/segment-wise/A2

üìÅ Found 50 judgments
   facts       : 50 summaries
   analysis    : 50 summaries
   argument    : 46 summaries
   judgement   : 50 summaries
   statute     : 41 summaries

üìä Total valid samples loaded: 50
   facts       : 50 valid summaries
   analysis    : 50 valid summaries
   argument    : 46 valid summaries
   judgement   : 47 valid summaries
   statute     : 37 valid summaries


In [19]:
# ============================================================================
# STEP 5: Load Trained Model
# ============================================================================

print("\n" + "="*70)
print("üîÑ LOADING TRAINED MODEL")
print("="*70 + "\n")

# Initialize encoder and policy with the architecture the checkpoint was saved from.
encoder = SentenceEncoder(modelname="law-ai/InLegalBERT", hiddendim=768)
policy = MultiAspectPolicyNetwork(input_dim=768, hidden_dim=256, num_aspects=5, dropout=0.5)

# Load checkpoint
# NOTE(review): weights_only=False unpickles arbitrary Python objects, which can
# execute code. Only load checkpoint files from a source you trust.
model_path = "/kaggle/input/inlegal-50-data-unsupervised/pytorch/default/1/inlegalbert-50-unsupervised_legal_summarization.pt"
checkpoint = torch.load(model_path, map_location=device, weights_only=False)

# Get state dicts
state_dict = checkpoint['policy_state_dict']
current_model_dict = policy.state_dict()

# Handle position embedding size mismatch between checkpoint and current model.
if 'position_embedding.weight' in state_dict:
    old_pos_emb = state_dict['position_embedding.weight']
    new_pos_emb = current_model_dict['position_embedding.weight']

    if old_pos_emb.shape[0] != new_pos_emb.shape[0]:
        print(f"‚ö†Ô∏è  Resizing position embeddings: {old_pos_emb.shape[0]} ‚Üí {new_pos_emb.shape[0]}")

        old_size, emb_dim = old_pos_emb.shape
        new_size = new_pos_emb.shape[0]
        # Rows present in both tables; also covers a checkpoint table that is
        # larger than the current model's.
        kept_rows = min(old_size, new_size)

        # Create resized embedding and copy the trained rows into it.
        resized_pos_emb = torch.zeros(new_size, emb_dim, dtype=old_pos_emb.dtype)
        resized_pos_emb[:kept_rows, :] = old_pos_emb[:kept_rows, :]

        # Positions the checkpoint never saw get small random values; they are
        # untrained, so scores for sentences at those positions are unreliable.
        if new_size > old_size:
            resized_pos_emb[old_size:, :] = torch.randn(new_size - old_size, emb_dim) * 0.02

        state_dict['position_embedding.weight'] = resized_pos_emb

# Load the corrected state dict
policy.load_state_dict(state_dict)

# Create agent; inference only, so make sure dropout is disabled in both networks.
agent = UnsupervisedRLAgent(encoder=encoder, policy=policy)
agent.encoder.eval()
agent.policy.eval()
agent.policy.to(device)

print(f"‚úÖ Model loaded successfully from {model_path}")
print(f"üìä Training completed at epoch {checkpoint.get('epoch', 'unknown')}")
print("="*70 + "\n")



üîÑ LOADING TRAINED MODEL

Loading law-ai/InLegalBERT...
law-ai/InLegalBERT loaded successfully
‚ö†Ô∏è  Resizing position embeddings: 500 ‚Üí 2000
‚úÖ Model loaded successfully from /kaggle/input/inlegal-50-data-unsupervised/pytorch/default/1/inlegalbert-50-unsupervised_legal_summarization.pt
üìä Training completed at epoch 1



In [20]:
# ============================================================================
# STEP 6: Evaluate with Reference Summaries
# ============================================================================

print("\n" + "="*70)
print("üìä COMPREHENSIVE VALIDATION SET EVALUATION WITH REFERENCE SUMMARIES")
print("="*70 + "\n")

# A single evaluator instance is reused for every (sample, aspect) pair.
metrics_evaluator = MetricsEvaluator()

# aspect name -> list of per-sample metric dicts
aspect_metrics = {}
for aspect in agent.aspects:
    aspect_metrics[aspect] = []

print("Computing all metrics on validation set...\n")

for idx, item in enumerate(validation_dataset):
    judgment = item['judgment']
    reference_summaries = item['summaries']

    # All aspect summaries for this judgment come from one call.
    generated_summaries = agent.generate_summaries(judgment)

    for aspect in agent.aspects:
        # Aspects without a reference summary cannot be scored.
        if not reference_summaries[aspect]:
            continue
        metrics = metrics_evaluator.compute_all_metrics(
            generated_summaries[aspect],
            reference_summaries[aspect],
            judgment
        )
        aspect_metrics[aspect].append(metrics)

    if (idx + 1) % 10 == 0:
        print(f"Processed {idx + 1}/{len(validation_dataset)} samples...")

print(f"\n{'='*70}")
print("VALIDATION RESULTS")
print(f"{'='*70}\n")

# Report the per-aspect mean of every metric.
for aspect in agent.aspects:
    if not aspect_metrics[aspect]:
        print(f"\n{aspect}: No validation samples available")
        continue

    print(f"\n{'='*70}")
    print(f"üìà {aspect.upper()} - DETAILED METRICS")
    print(f"{'='*70}")

    # Metric names are taken from the first sample; each is averaged over all samples.
    avg_metrics = {
        key: np.mean([m[key] for m in aspect_metrics[aspect]])
        for key in aspect_metrics[aspect][0].keys()
    }

    print(f"\nüî¥ ROUGE Scores:")
    print(f"   ROUGE-1:  P={avg_metrics['rouge1_precision']:.4f}  R={avg_metrics['rouge1_recall']:.4f}  F1={avg_metrics['rouge1_fmeasure']:.4f}")
    print(f"   ROUGE-2:  P={avg_metrics['rouge2_precision']:.4f}  R={avg_metrics['rouge2_recall']:.4f}  F1={avg_metrics['rouge2_fmeasure']:.4f}")
    print(f"   ROUGE-L:  P={avg_metrics['rougeL_precision']:.4f}  R={avg_metrics['rougeL_recall']:.4f}  F1={avg_metrics['rougeL_fmeasure']:.4f}")

    print(f"\nüîµ BLEU Scores:")
    print(f"   BLEU-1:   {avg_metrics['bleu1']:.4f}")
    print(f"   BLEU-2:   {avg_metrics['bleu2']:.4f}")
    print(f"   BLEU-4:   {avg_metrics['bleu4']:.4f}")

    print(f"\nüü¢ Other Metrics:")
    print(f"   METEOR:   {avg_metrics['meteor']:.4f}")

    if BERTSCORE_AVAILABLE:
        print(f"\nüü° BERTScore:")
        print(f"   Precision: {avg_metrics['bertscore_precision']:.4f}")
        print(f"   Recall:    {avg_metrics['bertscore_recall']:.4f}")
        print(f"   F1:        {avg_metrics['bertscore_f1']:.4f}")

    print(f"\nüìè Length Statistics:")
    print(f"   Generated Length:    {avg_metrics['generated_length']:.0f} words")
    print(f"   Reference Length:    {avg_metrics['reference_length']:.0f} words")
    print(f"   Source Length:       {avg_metrics['source_length']:.0f} words")
    print(f"   Length Ratio:        {avg_metrics['length_ratio']:.2f}")
    print(f"   Compression Ratio:   {avg_metrics['compression_ratio']:.2%}")

print(f"\n{'='*70}\n")


üìä COMPREHENSIVE VALIDATION SET EVALUATION WITH REFERENCE SUMMARIES

Computing all metrics on validation set...

Processed 10/50 samples...
Processed 20/50 samples...
Processed 30/50 samples...
Processed 40/50 samples...
Processed 50/50 samples...

VALIDATION RESULTS


üìà FACTS - DETAILED METRICS

üî¥ ROUGE Scores:
   ROUGE-1:  P=0.4119  R=0.5836  F1=0.4484
   ROUGE-2:  P=0.1889  R=0.2775  F1=0.2069
   ROUGE-L:  P=0.2063  R=0.3097  F1=0.2277

üîµ BLEU Scores:
   BLEU-1:   0.3550
   BLEU-2:   0.2377
   BLEU-4:   0.1433

üü¢ Other Metrics:
   METEOR:   0.3179

üìè Length Statistics:
   Generated Length:    599 words
   Reference Length:    423 words
   Source Length:       5387 words
   Length Ratio:        1.94
   Compression Ratio:   11.07%

üìà ANALYSIS - DETAILED METRICS

üî¥ ROUGE Scores:
   ROUGE-1:  P=0.6575  R=0.5081  F1=0.5554
   ROUGE-2:  P=0.3377  R=0.2589  F1=0.2841
   ROUGE-L:  P=0.3354  R=0.2573  F1=0.2819

üîµ BLEU Scores:
   BLEU-1:   0.4167
   BLEU-2:   0.296

In [None]:
# ============================================================================
# STEP 7: Display Sample Summaries
# ============================================================================

print("\n" + "="*70)
print("üìÑ SAMPLE SUMMARIES WITH METRICS")
print("="*70 + "\n")

# Show at most the first three validation samples.
num_samples = min(3, len(validation_dataset))

for idx in range(num_samples):
    item = validation_dataset[idx]
    judgment = item['judgment']
    reference_summaries = item['summaries']
    judgment_file = item['judgment_file']

    print("\n" + "="*70)
    print(f"SAMPLE {idx + 1} - File: {judgment_file}")
    print("="*70)

    # Sentence/word counts refer to the preprocessed document (short sentences
    # dropped, long documents cut), i.e. to what the model actually sees.
    sentences = agent.preprocess_document(judgment)
    source_words = sum(len(s.split()) for s in sentences)
    print(f"\nüìÑ SOURCE DOCUMENT:")
    print(f"   Total Sentences: {len(sentences)}")
    print(f"   Total Words:     {source_words}")
    print(f"\n   First 300 characters:")
    # Mark the preview with an ellipsis only when text was really cut off.
    preview_suffix = "..." if len(judgment) > 300 else ""
    print(f"   {judgment[:300]}{preview_suffix}\n")

    # Generate summaries
    generated_summaries = agent.generate_summaries(judgment)

    # Display each aspect
    for aspect in agent.aspects:
        print(f"\n{'‚îÄ'*70}")
        print(f"üéØ ASPECT: {aspect.upper()}")
        print(f"{'‚îÄ'*70}")

        if not reference_summaries[aspect]:
            print(f"\n   ‚ö†Ô∏è  No reference summary available")
            continue

        ref_summary = reference_summaries[aspect]
        gen_summary = generated_summaries[aspect]

        # Compute metrics
        metrics = metrics_evaluator.compute_all_metrics(gen_summary, ref_summary, judgment)

        print(f"\nüìä METRICS:")
        print(f"   ROUGE-1 F1: {metrics['rouge1_fmeasure']:.4f}")
        print(f"   ROUGE-2 F1: {metrics['rouge2_fmeasure']:.4f}")
        print(f"   ROUGE-L F1: {metrics['rougeL_fmeasure']:.4f}")
        print(f"   BLEU-4:     {metrics['bleu4']:.4f}")
        print(f"   METEOR:     {metrics['meteor']:.4f}")
        if BERTSCORE_AVAILABLE:
            print(f"   BERTScore:  {metrics['bertscore_f1']:.4f}")
        print(f"   Gen Length: {metrics['generated_length']} words")
        print(f"   Ref Length: {metrics['reference_length']} words")
        print(f"   Compression: {metrics['compression_ratio']:.2%}")

        # Both summaries are printed in full, so no trailing ellipsis is added:
        # it would wrongly suggest that the text had been truncated.
        print(f"\nüìù REFERENCE SUMMARY ({len(ref_summary.split())} words):")
        print(f"{'‚îÄ'*70}")
        print(ref_summary)

        print(f"\nü§ñ GENERATED SUMMARY ({len(gen_summary.split())} words):")
        print(f"{'‚îÄ'*70}")
        print(gen_summary)

    print("\n" + "="*70 + "\n")

print("\n‚úÖ Evaluation complete!")