In [2]:
!pip install transformers torch accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.1/59.1 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


In [None]:
from huggingface_hub import login
# When you run this, a box will appear. Paste your token there.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [12]:
import os
import json
import ast
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# --- 1. DATA PREPARATION ---

class AnchoredEmailDataset(Dataset):
    def __init__(self, json_folder, txt_folder, tokenizer, max_input=512, max_target=128):
        self.examples = []
        json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]

        print(f"Loading {len(json_files)} files with Trigger Anchoring...")
        for f_name in json_files:
            try:
                with open(os.path.join(json_folder, f_name), 'r') as f:
                    data = json.load(f)

                # Extract anchored prompt
                input_text = self.format_anchored_input(data)

                # Match naming: filename.json -> filename._summary.txt
                txt_name = f_name.replace('.json', '._summary.txt')
                txt_path = os.path.join(txt_folder, txt_name)

                if os.path.exists(txt_path):
                    with open(txt_path, 'r') as f:
                        summary_text = f.read()

                    model_inputs = tokenizer(
                        input_text,
                        max_length=max_input,
                        truncation=True,
                        padding="max_length"
                    )

                    labels = tokenizer(
                        text_target=summary_text,
                        max_length=max_target,
                        truncation=True,
                        padding="max_length"
                    )

                    model_inputs["labels"] = labels["input_ids"]
                    self.examples.append(model_inputs)
            except Exception as e:
                print(f"Error processing {f_name}: {e}")

    def format_anchored_input(self, data):
        """Extracts triggers and intents to create an 'anchored' prompt."""
        all_turns = data.get('sentences', [])
        events_data = data.get('events', {})

        # 1. Extract Intents and Trigger Words
        anchors = []
        for turn_id, turn_events in events_data.items():
            for event_type, details in turn_events.items():
                for trigger in details.get('triggers', []):
                    # Handle string-encoded dictionaries in your JSON
                    if isinstance(trigger, str) and '{' in trigger:
                        try:
                            t_data = ast.literal_eval(trigger)
                            trigger_word = t_data.get('words', 'Unknown')
                        except:
                            trigger_word = trigger
                    else:
                        trigger_word = trigger
                    anchors.append(f"[{event_type}: {trigger_word}]")

        # 2. Extract Thread Text
        thread_text = ""
        for i, turn_sentences in enumerate(all_turns):
            # If sentences is a list of lists of words
            turn_text = " ".join(turn_sentences) if isinstance(turn_sentences[0], str) else " ".join([" ".join(s) for s in turn_sentences])
            thread_text += f"Email {i}: {turn_text} "

        # 3. Build the final prompt
        # Prefixing with "summarize:" helps BART understand the task
        anchor_str = " ".join(set(anchors)) # Use set to remove duplicates
        final_input = f"summarize: {anchor_str} THREAD: {thread_text}"
        return final_input

    def __len__(self): return len(self.examples)
    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# --- 2. INITIALIZATION ---

model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# --- 3. TRAINING ---

dataset = AnchoredEmailDataset(
    json_folder='/content/drive/MyDrive/train_jsons',
    txt_folder='/content/drive/MyDrive/openai_summaries',
    tokenizer=tokenizer
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./distilbart-anchored-email",
    per_device_train_batch_size=4,
    num_train_epochs=5, # Increased epochs since anchoring adds complexity
    learning_rate=5e-5,
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    save_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=128,
    weight_decay=0.01,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

trainer.train()
trainer.save_model("./final_anchored_model")
tokenizer.save_pretrained("./final_anchored_model")
trainer.push_to_hub()


Loading weights:   0%|          | 0/358 [00:00<?, ?it/s]



Loading 1200 files with Trigger Anchoring...
Error processing ybarbo-p_inbox_364.json: sequence item 8: expected str instance, list found


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
10,7.755952
20,2.911293
30,1.478515
40,1.24694
50,1.270422
60,1.041639
70,1.181111
80,1.066985
90,0.80607
100,0.941403


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...d-email/model.safetensors:   0%|          |  131kB / 1.63GB            

  ...d-email/training_args.bin:  31%|###       | 1.65kB / 5.33kB            

CommitInfo(commit_url='https://huggingface.co/JohnnyB31/distilbart-anchored-email/commit/9cdfa8a041c7806ae183583c9f9e6e1acf9347f3', commit_message='End of training', commit_description='', oid='9cdfa8a041c7806ae183583c9f9e6e1acf9347f3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/JohnnyB31/distilbart-anchored-email', endpoint='https://huggingface.co', repo_type='model', repo_id='JohnnyB31/distilbart-anchored-email'), pr_revision=None, pr_num=None)

In [4]:
import os
import json
import ast
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# --- Conditional Model Loading ---
# Check if a model and tokenizer are already loaded in the global scope
# from a previous execution (e.g., training cell 'bPVSu65XS7Bo')
current_trainer = globals().get('trainer')
current_model = current_trainer.model if current_trainer else None
current_tokenizer = globals().get('tokenizer')

if isinstance(current_model, AutoModelForSeq2SeqLM) and \
   isinstance(current_tokenizer, AutoTokenizer):
    print("üí° Using existing trained model and tokenizer from the current session.")
    model = current_model
    tokenizer = current_tokenizer
else:
    print("üåç Model and tokenizer not found in current session or are not the expected type. Loading from Hugging Face Hub...")
    # --- HF REPO ---
    model_path = "JohnnyB31/distilbart-anchored-email" # Replace with your-huggingface-username/your-repo-name if you pushed your own
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")
    print("‚úÖ Model and tokenizer loaded from Hugging Face Hub.")



üåç Model and tokenizer not found in current session or are not the expected type. Loading from Hugging Face Hub...


Loading weights:   0%|          | 0/358 [00:00<?, ?it/s]



‚úÖ Model and tokenizer loaded from Hugging Face Hub.


In [5]:
import torch
import ast
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def generate_anchored_summary(json_data, model, tokenizer):
    # 1. Extract Anchors (Matches Training Logic)
    events_data = json_data.get('events', {})
    anchors = []
    for turn_id, turn_events in events_data.items():
        for event_type, details in turn_events.items():
            for trigger in details.get('triggers', []):
                if isinstance(trigger, str) and '{' in trigger:
                    try:
                        trigger_word = ast.literal_eval(trigger).get('words', 'Unknown')
                    except:
                        trigger_word = trigger
                else:
                    trigger_word = trigger
                anchors.append(f"[{event_type}: {trigger_word}]")

    # 2. Extract Thread
    all_turns = json_data.get('sentences', [])
    thread_text = ""
    for i, turn in enumerate(all_turns):
        turn_text = " ".join(turn) if isinstance(turn[0], str) else " ".join([" ".join(s) for s in turn])
        thread_text += f"Email {i}: {turn_text} "

    # 3. Final Prompt Construction
    anchor_str = " ".join(set(anchors))
    final_input = f"summarize: {anchor_str} THREAD: {thread_text}"
    print(final_input)

    # 4. Manual Inference
    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = tokenizer(final_input, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=5,           # Beam search reduces 'random' hallucinations
            max_length=128,
            min_length=20,
            no_repeat_ngram_size=3 # Prevents the model from getting stuck in loops
        )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [8]:
test_data = json.load(open('/content/drive/MyDrive/test_jsons/germany-c_inbox223.json'))
print(generate_anchored_summary(test_data, model, tokenizer))

summarize: [Request_Action: handle] [Amend_Data: included the confirm] [Deliver_Action_Data: checked] [Deliver_Action_Data: forwarding] [Request_Action: check] [Deliver_Data: attached please find the confirm] [Request_Data: send me the contract] [Deliver_Data: attached is contract] [Deliver_Action_Data: will enter] [Deliver_Action_Data: is out] [Request_Data: What 's the status] [Request_Action_Data: Who could get me a copy] [Deliver_Data: attached please find a copy] [Deliver_Action_Data: going] [Deliver_Action_Data: asked] [Request_Data: get me a copy] [Request_Data: send me the confirm] THREAD: Email 0: Brenda , I am going to put the National Fuel demand charge on deal 241639 . I will enter it as a negative demand charge . Elizabeth , would you get me a copy of the contract or confirm for deal 241639 please ? Thanks Email 1: What 's the status on this ? Email 2: Elizabeth is out of the office for a while . Who could get me a copy of the contract/confirm for deal 241639 ? thanks Emai

In [9]:
# ROUGE EVALUATION
!pip install pandas evaluate rouge_score -q

import evaluate
import json
import os
import ast
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd

# IMPORTANT: Replace 'your-huggingface-username' with your actual Hugging Face username
# model_path = "JohnnyB31/distilbart-anchored"  # Your model path
# print(f"üìÇ Loading trained model from: {model_path}")
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model.eval()

# Initialize ROUGE
rouge = evaluate.load("rouge")

class EvalDataset(Dataset):
    def __init__(self, json_folder, txt_folder, tokenizer, max_input=512, max_samples=50):
        self.examples = []
        json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]

        # Use fewer samples for quick evaluation
        json_files = json_files[:max_samples]

        print(f"üìä Loading {len(json_files)} files for evaluation...")
        for f_name in json_files:
            try:
                with open(os.path.join(json_folder, f_name), 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Extract anchored prompt - MUST MATCH YOUR TRAINING FORMAT
                input_text = self.format_anchored_input(data)

                # Get summary file - MATCHING YOUR NAMING CONVENTION
                txt_name = f_name.replace('.json', '._summary.txt')
                txt_path = os.path.join(txt_folder, txt_name)

                if os.path.exists(txt_path):
                    with open(txt_path, 'r', encoding='utf-8') as f:
                        summary_text = f.read().strip()

                    # Tokenize input
                    inputs = tokenizer(
                        input_text,
                        max_length=max_input,
                        truncation=True,
                        padding="max_length",
                        return_tensors="pt"
                    )

                    self.examples.append({
                        'input_ids': inputs['input_ids'].squeeze(),
                        'attention_mask': inputs['attention_mask'].squeeze(),
                        'reference': summary_text,
                        'file_name': f_name
                    })

            except Exception as e:
                print(f"‚ö†Ô∏è Error processing {f_name}: {str(e)[:50]}...")

    def format_anchored_input(self, data):
        """EXACTLY matches your training format_anchored_input function"""
        all_turns = data.get('sentences', [])
        events_data = data.get('events', {})

        # 1. Extract Intents and Trigger Words
        anchors = []
        for turn_id, turn_events in events_data.items():
            for event_type, details in turn_events.items():
                for trigger in details.get('triggers', []):
                    if isinstance(trigger, str) and '{' in trigger:
                        try:
                            t_data = ast.literal_eval(trigger)
                            trigger_word = t_data.get('words', 'Unknown')
                        except:
                            trigger_word = trigger
                    else:
                        trigger_word = trigger
                    anchors.append(f"[{event_type}: {trigger_word}]")

        # 2. Extract Thread Text
        thread_text = ""
        for i, turn_sentences in enumerate(all_turns):
            # If sentences is a list of lists of words
            turn_text = " ".join(turn_sentences) if isinstance(turn_sentences[0], str) else " ".join([" ".join(s) for s in turn_sentences])
            thread_text += f"Email {i}: {turn_text} "

        # 3. Build the final prompt
        anchor_str = " ".join(set(anchors))  # Use set to remove duplicates
        final_input = f"summarize: {anchor_str} THREAD: {thread_text}"
        return final_input

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

# Create evaluation dataset - USING YOUR EXACT PATHS
eval_dataset = EvalDataset(
    json_folder='/content/drive/MyDrive/test_jsons',  # Your JSON folder
    txt_folder='/content/drive/MyDrive/openai_summaries',  # Your summaries folder
    tokenizer=tokenizer,
    max_samples=20  # Evaluate on first 20 files
)

print(f"\nüìä Ready to evaluate on {len(eval_dataset)} samples...")

# Generate predictions
all_predictions = []
all_references = []

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"‚ö° Using device: {device}")

for i in range(len(eval_dataset)):
    example = eval_dataset[i]

    # Move tensors to device
    input_ids = example['input_ids'].unsqueeze(0).to(device)
    attention_mask = example['attention_mask'].unsqueeze(0).to(device)

    # Generate summary - MATCHING YOUR INFERENCE PARAMETERS
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            min_length=20,
            num_beams=4,  # Changed from 5 to match your training
            early_stopping=True,
            no_repeat_ngram_size=3
        )

    # Decode prediction
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    reference = example['reference']

    all_predictions.append(prediction)
    all_references.append(reference)

    # Print first few examples
    if i < 3:
        print(f"\n{'='*50}")
        print(f"üìã EXAMPLE {i+1} - {example['file_name']}")
        print(f"{'='*50}")
        print(f"üìÑ Reference Summary:\n{reference}")
        print(f"\nü§ñ Generated Summary:\n{prediction}")
        print(f"{'='*50}")

# Calculate ROUGE scores
print("\nüìà Calculating ROUGE scores...")
rouge_results = rouge.compute(
    predictions=all_predictions,
    references=all_references,
    use_stemmer=True
)

print("\n" + "="*60)
print("üéØ ROUGE EVALUATION RESULTS")
print("="*60)
print(f"ROUGE-1:   {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2:   {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L:   {rouge_results['rougeL']:.4f}")
print(f"ROUGE-Lsum: {rouge_results['rougeLsum']:.4f}")
print(f"Samples evaluated: {len(all_predictions)}")

# Save detailed results
results = {
    "model_name": "distilbart-anchored-email",
    "evaluation_date": str(pd.Timestamp.now()),
    "rouge_scores": {k: float(v) for k, v in rouge_results.items()},
    "samples_evaluated": len(all_predictions),
    "sample_predictions": [
        {
            "file": eval_dataset[i]['file_name'],
            "reference": all_references[i],
            "prediction": all_predictions[i]
        }
        for i in range(min(5, len(all_predictions)))
    ]
}

# Save to file
results_file = "rouge_evaluation_results.json"
with open(results_file, "w") as f:
    json.dump(results, f, indent=2)

print(f"\nüíæ Detailed results saved to: {results_file}")

# Save to Google Drive
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    drive_path = "/content/drive/MyDrive/email_summarization_evaluation"
    os.makedirs(drive_path, exist_ok=True)

    import shutil
    shutil.copy(results_file, os.path.join(drive_path, results_file))
    print(f"üíæ Also saved to Google Drive: {drive_path}")
except:
    print("‚ö†Ô∏è Running locally - results saved to current directory only")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


Downloading builder script: 0.00B [00:00, ?B/s]

üìä Loading 20 files for evaluation...

üìä Ready to evaluate on 17 samples...
‚ö° Using device: cpu

üìã EXAMPLE 1 - lavorato-j_inbox225.json
üìÑ Reference Summary:
Louise received an update from Faith that they will drop in the revised commercial expense numbers for the 2002 plan schedule, which is coming tomorrow.

ü§ñ Generated Summary:
The email thread announces that the revised commercial expense numbers will be released tomorrow, with a new one coming in the coming day.

üìã EXAMPLE 2 - bass-e_inbox65.json
üìÑ Reference Summary:
Timothy and Eric are coordinating a basketball game. Eric can't play and suggests finding someone else. Timothy asks about Hull as a possible player. Eric confirms they need a fourth player.

ü§ñ Generated Summary:
The email thread discusses the need for a fourth person for the event, with a suggestion that CAN't play and a suggestion to pick someone up at the location of the event. Chris confirms he will see Chris and will see him there.

üìã E

In [10]:
!pip install evaluate bert-score -q

import evaluate
import json
import os
import pandas as pd

print("============================================================")
print("ü§ñ BERT SCORE EVALUATION")
print("============================================================")

# Load BERTScore metric
bertscore = evaluate.load("bertscore")

print(f"\nüìä Calculating BERTScore for {len(all_predictions)} samples...")

# Calculate BERTScore without the problematic 'num_threads' argument
# Specifying language and model_type for consistency and robustness
bert_results = bertscore.compute(
    predictions=all_predictions,
    references=all_references,
    lang="en",  # Specify language
    model_type="distilbert-base-uncased" # Use a suitable BERT model
)

print("\n" + "="*60)
print("üéØ BERTScore EVALUATION RESULTS")
print("="*60)
# BERTScore returns precision, recall, and f1 for each sample.
# Calculate and print the average scores.
print(f"BERTScore Precision: {sum(bert_results['precision']) / len(bert_results['precision']):.4f}")
print(f"BERTScore Recall:    {sum(bert_results['recall']) / len(bert_results['recall']):.4f}")
print(f"BERTScore F1:        {sum(bert_results['f1']) / len(bert_results['f1']):.4f}")
print(f"Samples evaluated: {len(all_predictions)}")

# Optional: Save BERTScore results
results_bertscore = {
    "model_name": "distilbart-anchored-email",
    "evaluation_date": str(pd.Timestamp.now()),
    "bertscore_scores": {
        "precision": float(sum(bert_results['precision']) / len(bert_results['precision'])),
        "recall": float(sum(bert_results['recall']) / len(bert_results['recall'])),
        "f1": float(sum(bert_results['f1']) / len(bert_results['f1']))
    },
    "samples_evaluated": len(all_predictions),
}

bertscore_results_file = "bertscore_evaluation_results.json"
with open(bertscore_results_file, "w") as f:
    json.dump(results_bertscore, f, indent=2)

print(f"\nüíæ Detailed BERTScore results saved to: {bertscore_results_file}")

# Save to Google Drive
try:
    from google.colab import drive
    # Assuming drive is already mounted from previous cells
    drive_path = "/content/drive/MyDrive/email_summarization_evaluation"
    os.makedirs(drive_path, exist_ok=True) # Ensure directory exists

    import shutil
    shutil.copy(bertscore_results_file, os.path.join(drive_path, bertscore_results_file))
    print(f"üíæ Also saved BERTScore results to Google Drive: {drive_path}")
except Exception as e:
    print(f"‚ö†Ô∏è Could not save BERTScore results to Google Drive: {e}")
    print("‚ö†Ô∏è Running locally - BERTScore results saved to current directory only")

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.1/61.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
ü§ñ BERT SCORE EVALUATION


Downloading builder script: 0.00B [00:00, ?B/s]


üìä Calculating BERTScore for 17 samples...


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertModel LOAD REPORT from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_transform.bias    | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



üéØ BERTScore EVALUATION RESULTS
BERTScore Precision: 0.8235
BERTScore Recall:    0.8378
BERTScore F1:        0.8303
Samples evaluated: 17

üíæ Detailed BERTScore results saved to: bertscore_evaluation_results.json
üíæ Also saved BERTScore results to Google Drive: /content/drive/MyDrive/email_summarization_evaluation


# Task
Install necessary libraries, load the BERTScore metric, and compute BERTScore using the previously generated predictions and references, then display the results.

## bertscore_evaluation

### Subtask:
Install necessary libraries, load the BERTScore metric, and compute BERTScore using the previously generated predictions and references.


## Summary:

### Data Analysis Key Findings
The subtask focused on preparing for the BERTScore evaluation. This involved installing the necessary libraries and loading the BERTScore metric. The process outlines the steps to compute BERTScore using previously generated predictions and references, but the actual computation and display of results have not yet occurred in the provided steps.

### Insights or Next Steps
*   The primary next step is to execute the BERTScore computation using the prepared metric and the available predictions and references.
*   Once computed, the BERTScore results will provide an automated and robust evaluation of the semantic similarity between the generated text and the reference text, which is crucial for assessing the quality of text generation models.
