In [None]:
!pip install transformers torch accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.1/59.1 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


In [None]:
from huggingface_hub import login

# When you run this, a box will appear. Paste your token there.
login()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


TRAINING THE MODEL

In [None]:
import os
import json
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# --- 1. DATA PREPARATION ---

class EmailThreadDataset(Dataset):
    def __init__(self, json_folder, txt_folder, tokenizer, max_input=512, max_target=128):
        self.examples = []

        # Verify directory existence
        if not os.path.exists(json_folder) or not os.path.exists(txt_folder):
            print(f"Warning: Check your paths. JSON: {json_folder}, TXT: {txt_folder}")
            return

        json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]

        print(f"Loading {len(json_files)} files for Standard Summarization...")
        for f_name in json_files:
            try:
                # Load JSON conversation data
                with open(os.path.join(json_folder, f_name), 'r') as f:
                    data = json.load(f)

                # Format thread text
                input_text = self.format_thread_input(data)

                # Match naming: filename.json -> filename._summary.txt
                txt_name = f_name.replace('.json', '._summary.txt')
                txt_path = os.path.join(txt_folder, txt_name)

                if os.path.exists(txt_path):
                    with open(txt_path, 'r') as f:
                        summary_text = f.read().strip()

                    # Tokenization
                    model_inputs = tokenizer(
                        input_text,
                        max_length=max_input,
                        truncation=True,
                        padding="max_length"
                    )

                    labels = tokenizer(
                        text_target=summary_text,
                        max_length=max_target,
                        truncation=True,
                        padding="max_length"
                    )

                    model_inputs["labels"] = labels["input_ids"]
                    self.examples.append(model_inputs)

            except Exception as e:
                print(f"Error processing {f_name}: {e}")

    def format_thread_input(self, data):
        """Extracts and cleans thread text from 'sentences' field."""
        all_turns = data.get('sentences', [])
        thread_text = ""

        for i, turn_sentences in enumerate(all_turns):
            if not turn_sentences:
                continue

            # Handle list of words vs list of list of words
            if isinstance(turn_sentences[0], list):
                turn_text = " ".join([" ".join(s) for s in turn_sentences])
            else:
                turn_text = " ".join(turn_sentences)

            thread_text += f"Email {i}: {turn_text} "

        # BART performs best with the "summarize: " prefix
        return f"summarize: {thread_text.strip()}"

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# --- 2. INITIALIZATION ---

model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# --- 3. TRAINING ---

dataset = EmailThreadDataset(
    json_folder='/content/drive/MyDrive/NLP/json',
    txt_folder='/content/drive/MyDrive/NLP/openai_summaries',
    tokenizer=tokenizer
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./distilbart-standard-email-no-triggers",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="no",
    predict_with_generate=True,
    generation_max_length=128,
    weight_decay=0.01,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

if len(dataset) > 0:
    print("Starting training...")
    trainer.train()
    trainer.save_model("./final_standard_model")
    tokenizer.save_pretrained("./final_standard_model")
    trainer.push_to_hub()
    print("Training complete and model saved.")
else:
    print("Training aborted: No valid examples found.")

Loading weights:   0%|          | 0/358 [00:00<?, ?it/s]



Loading 1200 files for Standard Summarization...
Error processing ybarbo-p_inbox_364.json: sequence item 8: expected str instance, list found
Starting training...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
10,8.951207
20,3.218294
30,1.478009
40,1.163879
50,1.06707
60,1.099074
70,1.125476
80,1.024181
90,0.9048
100,0.912608


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...riggers/model.safetensors:   0%|          |  131kB / 1.63GB            

  ...riggers/training_args.bin:  35%|###5      | 1.89kB / 5.33kB            

Training complete and model saved.


LOADING TRAINED MODEL

In [10]:
import torch
import os
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig

# --- 1. CONFIGURATION ---
model_path_hub = "youyou354/distilbart-standard-email-no-triggers"
local_save_path = "./final_standard_model"
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- 2. CONDITIONAL MODEL LOADING ---

# Check RAM for active 'trainer' variable
current_trainer = globals().get('trainer')
current_tokenizer = globals().get('tokenizer')

if current_trainer is not None and hasattr(current_trainer, 'model'):
    print("üí° Using existing trained model and tokenizer from the current session.")
    model = current_trainer.model
    tokenizer = current_tokenizer

else:
    # Determine if we load from local disk or the Hub
    if os.path.exists(local_save_path):
        load_path = local_save_path
        print(f"üìÅ Model found locally. Loading from: {load_path}")
    else:
        load_path = model_path_hub
        print(f"üåç Model not found locally. Loading from Hub: {load_path}")

    # --- THE WEIGHT FIX ---
    # Load config and force tie_word_embeddings=True to fix the gibberish issue
    config = AutoConfig.from_pretrained(load_path)
    config.tie_word_embeddings = True

    tokenizer = AutoTokenizer.from_pretrained(load_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(load_path, config=config)

    # Manually ensure the language head uses the trained vocabulary weights
    model.lm_head.weight = model.model.shared.weight

# Final setup
model.to(device)
model.eval()
print(f"‚úÖ Model ready on {device}.")

üåç Model not found locally. Loading from Hub: youyou354/distilbart-standard-email-no-triggers


Loading weights:   0%|          | 0/358 [00:00<?, ?it/s]



‚úÖ Model ready on cuda.


DEMO

In [11]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def generate_standard_summary(json_data, model, tokenizer):
    """
    Generates a summary for the NO-TRIGGERS model version.
    Input format: 'summarize: Email 0: ... Email 1: ...'
    """

    # 1. Extract Thread (Matches the 'No Triggers' Training Logic)
    all_turns = json_data.get('sentences', [])
    thread_text = ""

    for i, turn in enumerate(all_turns):
        if not turn:
            continue

        # Handle list of words vs list of list of words
        if isinstance(turn[0], list):
            turn_text = " ".join([" ".join(s) for s in turn])
        else:
            turn_text = " ".join(turn)

        thread_text += f"Email {i}: {turn_text} "

    # 2. Final Prompt Construction (No anchor_str or THREAD tag)
    # This matches the 'format_thread_input' method in your training script
    final_input = f"summarize: {thread_text.strip()}"

    # Optional: print(f"Input to model: {final_input}")

    # 3. Manual Inference
    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = tokenizer(
        final_input,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=5,           # Consistent with your anchored version
            max_length=128,
            min_length=20,
            no_repeat_ngram_size=3 # Prevents repetitive phrasing
        )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [12]:
test_data = json.load(open('/content/drive/MyDrive/NLP/test/germany-c_inbox223.json'))
print(generate_standard_summary(test_data, model, tokenizer))

Linda sent the confirm for deal 415547 and contract no. 96017703 for deal 241639, effective 11/1/2001-10/2002, and asked Brenda to send a copy of the contract or confirm for the deal 24/1639. Elizabeth is out of the office and Chris Germany forwarded the contract details to Stacey, who checked the deal in Sitara, but she is having problems with Live Link and may need to check with Ellen for confirmation. Bryan will enter the demand charge as a negative demand charge and asks Elizabeth to get a copy.


EVALUATION

In [13]:
# 1. INSTALL DEPENDENCIES
!pip install pandas evaluate rouge_score nltk -q

import evaluate
import json
import os
import torch
import nltk
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig
import pandas as pd

# 2. NLTK SETUP (Ensures ROUGE-L sentence splitting works)
try:
    nltk.download('punkt')
    nltk.download('punkt_tab')
except Exception as e:
    print(f"NLTK Download warning: {e}")

# 3. INITIALIZE ROUGE
rouge = evaluate.load("rouge")

# 4. DATASET CLASS (Thread-Only Logic)
class EvalDataset(Dataset):
    def __init__(self, json_folder, txt_folder, tokenizer, max_input=512, max_samples=50):
        self.examples = []
        if not os.path.exists(json_folder):
            print(f"‚ùå Error: Folder {json_folder} not found.")
            return

        json_files = sorted([f for f in os.listdir(json_folder) if f.endswith('.json')])
        json_files = json_files[:max_samples]

        print(f"üìä Loading {len(json_files)} files for Thread-Only evaluation...")
        for f_name in json_files:
            try:
                with open(os.path.join(json_folder, f_name), 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Format input using ONLY thread text (Matches your No-Triggers training)
                input_text = self.format_thread_input(data)

                # Match naming: filename.json -> filename._summary.txt
                txt_name = f_name.replace('.json', '._summary.txt')
                txt_path = os.path.join(txt_folder, txt_name)

                if os.path.exists(txt_path):
                    with open(txt_path, 'r', encoding='utf-8') as f:
                        summary_text = f.read().strip()

                    # Dynamic tokenization (no fixed padding here matches manual test logic)
                    inputs = tokenizer(
                        input_text,
                        max_length=max_input,
                        truncation=True,
                        return_tensors="pt"
                    )

                    self.examples.append({
                        'input_ids': inputs['input_ids'].squeeze(),
                        'attention_mask': inputs['attention_mask'].squeeze(),
                        'reference': summary_text,
                        'file_name': f_name
                    })

            except Exception as e:
                print(f"‚ö†Ô∏è Error processing {f_name}: {str(e)[:50]}...")

    def format_thread_input(self, data):
        """Extracts only the thread text (Strictly No Anchors/Triggers)."""
        all_turns = data.get('sentences', [])
        thread_text = ""
        for i, turn_sentences in enumerate(all_turns):
            if not turn_sentences: continue

            # Handle list of words vs list of list of words
            if isinstance(turn_sentences[0], list):
                turn_text = " ".join([" ".join(s) for s in turn_sentences])
            else:
                turn_text = " ".join(turn_sentences)

            thread_text += f"Email {i}: {turn_text} "

        return f"summarize: {thread_text.strip()}"

    def __len__(self): return len(self.examples)
    def __getitem__(self, idx): return self.examples[idx]

# 5. EXECUTION LOGIC
# Ensure model is in eval mode and on device
model.eval()
model.to(device)

# Create evaluation dataset
eval_dataset = EvalDataset(
    json_folder='/content/drive/MyDrive/NLP/test',      # Your JSON test folder
    txt_folder='/content/drive/MyDrive/NLP/openai_summaries', # Your references
    tokenizer=tokenizer,
    max_samples=20
)

all_predictions = []
all_references = []
all_files = []

print(f"‚ö° Using device: {device}\nüöÄ Starting Evaluation...")

for i in range(len(eval_dataset)):
    example = eval_dataset[i]
    input_ids = example['input_ids'].unsqueeze(0).to(device)
    attention_mask = example['attention_mask'].unsqueeze(0).to(device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            min_length=20,
            num_beams=5,           # Match your successful manual test
            early_stopping=True,
            no_repeat_ngram_size=3
        )

    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    all_predictions.append(prediction)
    all_references.append(example['reference'])
    all_files.append(example['file_name'])

    if i < 3: # Visual check for the first few examples
        print(f"\n{'='*50}\nüìã EXAMPLE {i+1} - {example['file_name']}\n{'='*50}")
        print(f"üìÑ Reference:\n{example['reference'][:200]}...")
        print(f"\nü§ñ Generated Summary:\n{prediction}\n{'='*50}")

# 6. CALCULATE ROUGE
print("\nüìà Calculating ROUGE scores...")
rouge_results = rouge.compute(
    predictions=all_predictions,
    references=all_references,
    use_stemmer=True
)

print("\n" + "="*60)
print("üéØ ROUGE EVALUATION RESULTS (NO-TRIGGERS)")
print("="*60)
for k, v in rouge_results.items():
    print(f"{k.upper():<10}: {v:.4f}")

# 7. SAVE DETAILED RESULTS
results_summary = {
    "model_name": "distilbart-no-triggers",
    "evaluation_date": str(pd.Timestamp.now()),
    "rouge_scores": {k: float(v) for k, v in rouge_results.items()},
    "samples_evaluated": len(all_predictions)
}

results_file = "rouge_no_triggers_results.json"
with open(results_file, "w") as f:
    json.dump(results_summary, f, indent=2)

# Save side-by-side CSV and move to Drive
try:
    drive_path = "/content/drive/MyDrive/email_summarization_evaluation"
    os.makedirs(drive_path, exist_ok=True)

    df = pd.DataFrame({"file": all_files, "reference": all_references, "prediction": all_predictions})
    df.to_csv(os.path.join(drive_path, "comparison_no_triggers.csv"), index=False)

    import shutil
    shutil.copy(results_file, os.path.join(drive_path, results_file))
    print(f"\nüíæ Results fully synced to Drive: {drive_path}")
except Exception as e:
    print(f"\n‚ö†Ô∏è Saved results to local directory only. (Drive error: {e})")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


üìä Loading 20 files for Thread-Only evaluation...
‚ö° Using device: cuda
üöÄ Starting Evaluation...

üìã EXAMPLE 1 - bass-e_inbox100.json
üìÑ Reference:
Brenda Herod is organizing a lunch with Eric, Timothy, and Dave. Dave is available tomorrow, and Brenda and Mr. Hull are also available then. She asks if tomorrow is too early or if they should plan f...

ü§ñ Generated Summary:
David is in for tomorrow and asks if tomorrow is too early for him or if next week. Mr. Hull and David are available tomorrow, and they will have to go dutch or someone with their bonuses will pick up the lunch.

üìã EXAMPLE 2 - bass-e_inbox124.json
üìÑ Reference:
Karen Buckley requests Eric Bass to come to 5C2 tomorrow to complete missing information on his application form for UBS. Eric asks whether the location is the old or new Enron building....

ü§ñ Generated Summary:
Eric Bass asked if the old Enron building is the new one. Karen Buckley asked Eric to come by 5C2 at his convenience tomorrow to co

In [14]:
# --- BERT SCORE EVALUATION (Thread-Only) ---
!pip install bert-score -q

import evaluate
import json
import os
import pandas as pd
import torch

print("============================================================")
print("ü§ñ BERT SCORE EVALUATION (THREAD-ONLY)")
print("============================================================")

# 1. Load BERTScore metric
bertscore = evaluate.load("bertscore")

# Ensure we have predictions to evaluate
if 'all_predictions' in locals() and len(all_predictions) > 0:
    print(f"üìä Calculating BERTScore for {len(all_predictions)} samples...")

    # 2. Calculate BERTScore
    # Using 'distilbert-base-uncased' for speed, or 'microsoft/deberta-xlarge-mnli' for max accuracy
    bert_results = bertscore.compute(
        predictions=all_predictions,
        references=all_references,
        lang="en",
        model_type="distilbert-base-uncased",
        device="cuda" if torch.cuda.is_available() else "cpu"
    )

    # 3. Calculate Averages
    avg_precision = sum(bert_results['precision']) / len(bert_results['precision'])
    avg_recall = sum(bert_results['recall']) / len(bert_results['recall'])
    avg_f1 = sum(bert_results['f1']) / len(bert_results['f1'])

    print("\n" + "="*60)
    print("üéØ BERTScore EVALUATION RESULTS")
    print("="*60)
    print(f"Precision: {avg_precision:.4f}")
    print(f"Recall:    {avg_recall:.4f}")
    print(f"F1 Score:  {avg_f1:.4f}")
    print(f"Samples evaluated: {len(all_predictions)}")

    # 4. Save Results
    results_bertscore = {
        "model_name": "distilbart-thread-only",
        "evaluation_date": str(pd.Timestamp.now()),
        "bertscore_averages": {
            "precision": float(avg_precision),
            "recall": float(avg_recall),
            "f1": float(avg_f1)
        },
        "samples_evaluated": len(all_predictions)
    }

    results_file = "bertscore_results.json"
    with open(results_file, "w") as f:
        json.dump(results_bertscore, f, indent=2)

    # 5. Move to Google Drive
    try:
        drive_path = "/content/drive/MyDrive/email_summarization_evaluation"
        os.makedirs(drive_path, exist_ok=True)
        import shutil
        shutil.copy(results_file, os.path.join(drive_path, results_file))
        print(f"\nüíæ Saved to Drive: {drive_path}/{results_file}")
    except Exception as e:
        print(f"\n‚ö†Ô∏è Saved locally only: {e}")

else:
    print("‚ùå Error: 'all_predictions' not found. Please run the evaluation cell first.")

ü§ñ BERT SCORE EVALUATION (THREAD-ONLY)
üìä Calculating BERTScore for 18 samples...


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertModel LOAD REPORT from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



üéØ BERTScore EVALUATION RESULTS
Precision: 0.8173
Recall:    0.8311
F1 Score:  0.8228
Samples evaluated: 18

üíæ Saved to Drive: /content/drive/MyDrive/email_summarization_evaluation/bertscore_results.json
