In [None]:
# Install necessary libraries
# These are IPython `!` shell escapes, so they only run inside a notebook kernel.
# `-q` keeps pip's own logging quiet.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve newer
# releases than the ones this notebook was originally run with.
# NOTE(review): sacrebleu and py7zr are not referenced by the cells visible in
# this notebook — confirm they are still needed.
!pip install -q transformers[sentencepiece] datasets sacrebleu rouge_score py7zr evaluate
# accelerate is upgraded in a separate step after the main install.
!pip install -q --upgrade accelerate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:0

In [None]:
# Import libraries
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch
from tqdm import tqdm
import nltk
import evaluate  # New import for evaluation metrics

# Fetch the NLTK "punkt" data up front (downloads on first run).
nltk.download("punkt")

# Set device: use the GPU when PyTorch can see one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load model and tokenizer from the pretrained summarization checkpoint.
model_ckpt = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

# Load dataset, select 200 emails, and convert to DataFrame
ds = load_dataset("argilla/FinePersonas-Conversations-Email-Summaries")
# Dataset.to_pandas() converts the underlying table in one shot. Wrapping the
# split in pd.DataFrame(...) instead walks every row of the train split in
# Python before a single sample is drawn. Both give a frame of the same length
# with the default integer index, so random_state=42 selects the same 200 rows.
df = ds["train"].to_pandas()[["email", "summary"]].sample(n=200, random_state=42)  # Limit to 200 samples
print("Sample data (200 entries):")
print(df.head())

# Convert to Hugging Face dataset format
ds_subset = Dataset.from_pandas(df)
# Define tokenization function
def convert_examples_to_features(example_batch):
    """Tokenize a batch of emails and reference summaries for seq2seq training.

    Args:
        example_batch: Mapping with an ``'email'`` list (source texts) and a
            ``'summary'`` list (target texts), as supplied by ``Dataset.map``
            when called with ``batched=True``.

    Returns:
        A dict with ``'input_ids'`` and ``'attention_mask'`` for the emails
        (truncated to 1024 tokens) and ``'labels'`` holding the summary token
        ids (truncated to 128 tokens). Sequences are returned unpadded.
    """
    # No static padding here. Padding the labels to max_length with the real
    # pad token id makes every pad position count towards the loss. Returning
    # unpadded sequences lets DataCollatorForSeq2Seq pad each batch on the fly
    # and fill label padding with -100, which the loss ignores. It also avoids
    # always feeding 1024-token inputs when the emails are shorter.
    input_encodings = tokenizer(example_batch['email'], max_length=1024, truncation=True)
    # `text_target=` supersedes the deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=example_batch['summary'], max_length=128, truncation=True)
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

# Tokenize dataset (adds input_ids / attention_mask / labels to every row).
ds_tokenized = ds_subset.map(convert_examples_to_features, batched=True)

# Training setup
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer_args = TrainingArguments(
    output_dir='distilbart-finepersonas', report_to="none", num_train_epochs=1,
    # The 200 sampled examples with batch size 1 and 16 accumulation steps give
    # only about a dozen optimizer steps in the single epoch. A fixed
    # warmup_steps=500 would therefore end training while the learning rate is
    # still a small fraction of its target; a ratio scales with the run length.
    warmup_ratio=0.1,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    # NOTE(review): eval_steps=50 is also larger than the number of optimizer
    # steps in this run, so step-based evaluation likely never fires — confirm
    # whether a smaller value is wanted.
    weight_decay=0.01, logging_steps=10, evaluation_strategy='steps', eval_steps=50,
    save_steps=1e6, gradient_accumulation_steps=16
)
# NOTE(review): eval_dataset is the same data as train_dataset, so any reported
# validation loss is not a held-out measurement.
trainer = Trainer(
    model=model, args=trainer_args, tokenizer=tokenizer, data_collator=seq2seq_data_collator,
    train_dataset=ds_tokenized, eval_dataset=ds_tokenized
)

# Train the model
trainer.train()

# Define a function for evaluation using ROUGE
rouge_metric = evaluate.load("rouge")  # Updated to use evaluate library


def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, batch_size=2, device=device, column_text="email", column_summary="summary"):
    """Generate summaries for ``dataset`` in batches and score them with ``metric``.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping from column name to a list of strings.
        metric: Metric object exposing ``add_batch(predictions=, references=)``
            and ``compute()``.
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the source texts and decode the
            generated ids.
        batch_size: Number of rows generated per step.
        device: Device the encoded inputs are moved to before generation.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    # generate() does not change the model's train/eval mode. Switch to eval
    # mode explicitly so dropout is off even if the model was just trained.
    model.eval()

    for start in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[start:start + batch_size]
        # Honour the caller-supplied column names instead of hard-coded ones.
        article_batch = batch[column_text]
        target_batch = batch[column_summary]

        # Pad only to the longest text in this batch; the attention mask tells
        # the model which positions are padding.
        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding=True, return_tensors="pt")
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
    return metric.compute()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.03k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/145M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/146M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/146M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/363584 [00:00<?, ? examples/s]

Sample data (200 entries):
                                                    email  \
342340  Subject: Upcoming panel discussion on race and...   
259310  Subject: RE: Seeking advice on educational app...   
198413  Subject: RE: Exciting new discovery related to...   
115128  Subject: RE: New research on radiation shieldi...   
184840  Subject: RE: Cyanosis seminar content\n\nHi Em...   

                                                  summary  
342340  Dr. Imani Johnson is reaching out to collabora...  
259310  John is excited to offer feedback and insights...  
198413  Robert is enthusiastic about a new diary disco...  
115128  David thanks Emily for sharing an article on c...  
184840  Michael agrees to include information on cyano...  


Map:   0%|          | 0/200 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Score the trained model with ROUGE over every row of ds_tokenized.
score = calculate_metric_on_test_ds(
    ds_tokenized,
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text="email",
    column_summary="summary",
)

# Pull out the four ROUGE variants of interest; each value is used as-is.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((name, score[name]) for name in rouge_names)

# Show the scores as a one-row table labelled with the model name.
rouge_table = pd.DataFrame(rouge_dict, index=['distilbart'])
print("ROUGE scores:", rouge_table)


100%|██████████| 100/100 [03:13<00:00,  1.94s/it]


ROUGE scores:               rouge1    rouge2    rougeL  rougeLsum
distilbart  0.404237  0.194335  0.307697   0.307189


In [None]:
# Mount Google Drive at /content/drive (Colab-only module) so that paths under
# that directory can be used by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Root folder on the mounted Drive; model and tokenizer each get a subfolder.
save_directory = '/content/drive/MyDrive/distilbart text summarization'
model_save_path, tokenizer_save_path = (
    os.path.join(save_directory, subfolder)
    for subfolder in ("distilbart-finetuned-model", "distilbart-finetuned-tokenizer")
)

# Write the model and the tokenizer to their respective folders.
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model and tokenizer saved to {save_directory}")

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Model and tokenizer saved to /content/drive/MyDrive/distilbart text summarization


In [None]:


# Import libraries
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Folders holding the previously saved fine-tuned tokenizer and model.
tokenizer_path = "/content/drive/MyDrive/distilbart text summarization/distilbart-finetuned-tokenizer"
model_path = "/content/drive/MyDrive/distilbart text summarization/distilbart-finetuned-model"

# Restore the tokenizer, then the model, and move the model onto the device.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model = model.to(device)

# Define a function to generate summaries
def generate_summary(email_text, min_length=30, max_length=100):
    """Summarize ``email_text`` using the module-level ``model`` and ``tokenizer``.

    Args:
        email_text: Text to summarize; it is truncated to 1024 tokens.
        min_length: Forwarded to ``model.generate`` as ``min_length``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        stripped.
    """
    # Encode the text as PyTorch tensors and place them on the active device.
    encoded = tokenizer(email_text, max_length=1024, truncation=True, return_tensors="pt")
    encoded = encoded.to(device)

    # Beam-search settings; the two length bounds come from the caller.
    generation_options = dict(
        max_length=max_length,
        min_length=min_length,
        num_beams=5,
        length_penalty=1.0,
        early_stopping=True,
    )

    # Inference only: evaluation mode, and no gradient tracking.
    model.eval()
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            **generation_options,
        )

    # Only the first returned sequence is decoded.
    return tokenizer.decode(
        output_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )



Using device: cuda


In [None]:
# Short sample email used to try out the reloaded model.
example_email = """
Hi Team,

I hope this email finds you well. I wanted to update you on the progress of the current project. We have completed the initial design phase and are moving into development next week. Please ensure that all necessary resources are allocated accordingly.

Best regards,
Alice
"""

# Summarize with explicit lower and upper bounds on the generated length.
generated_summary = generate_summary(example_email, max_length=70, min_length=50)

# Blank line, heading, then the summary itself on its own line.
print("\nGenerated Summary:", generated_summary, sep="\n")


Generated Summary:
 We have completed the initial design phase and are moving into development next week. Please ensure that all necessary resources are allocated accordingly. Best regards, however, is that the project will be completed in the next two weeks. Back to Mail Online home.


In [None]:
# Example long-form input: a blog-style article about AI rather than an email
example_email = """
Artificial Intelligence (AI) is no longer a futuristic concept; it has become an integral part of our daily lives, shaping industries and revolutionizing the way we work, live, and interact. From smart assistants like Siri and Alexa to complex systems that power autonomous vehicles and medical diagnostics, AI is rapidly transforming the world as we know it. In this blog, we'll explore the power of AI, its potential, and the ethical considerations that come with its widespread adoption.

What is AI?
At its core, Artificial Intelligence refers to the ability of machines to perform tasks that typically require human intelligence. These tasks include problem-solving, learning, pattern recognition, decision-making, and natural language processing. AI systems are designed to mimic human cognition, but they can process information much faster and often more accurately, depending on the task.

AI can be classified into two types:

Narrow AI (Weak AI): This is the most common form of AI we encounter today. It is designed to handle a specific task, such as voice recognition, recommendation algorithms, or image processing. For instance, Netflix's recommendation engine or Google's search algorithm are examples of narrow AI.

General AI (Strong AI): Unlike narrow AI, general AI aims to perform any intellectual task that a human can do, demonstrating a broad level of cognitive abilities. While we have yet to achieve general AI, it remains a long-term goal for many researchers in the field.

How AI is Changing Industries
Healthcare: One of the most exciting applications of AI is in healthcare, where it has the potential to save lives and improve outcomes. AI-powered systems can analyze medical images, predict disease progression, and even suggest personalized treatment plans. For example, AI algorithms can detect early signs of diseases like cancer by analyzing X-rays or MRIs with incredible accuracy. Furthermore, AI is helping in drug discovery, speeding up the process of developing life-saving medications.

Finance: In the financial sector, AI is used to analyze market trends, predict stock movements, and identify fraudulent activities. Algorithmic trading has become a dominant force in the financial markets, with AI systems executing high-frequency trades faster than human traders. AI is also used in customer service, with chatbots handling inquiries and providing 24/7 support.

Manufacturing and Automation: AI has been transforming manufacturing processes by enabling automation in production lines. Robots powered by AI can work alongside humans to perform repetitive tasks, reducing human error and increasing efficiency. AI is also used for predictive maintenance, where machine learning algorithms predict when equipment is likely to fail, reducing downtime and repair costs.

Transportation: Autonomous vehicles are one of the most well-known applications of AI. Companies like Tesla, Waymo, and Uber are using AI to develop self-driving cars that can navigate roads without human intervention. AI systems process data from sensors and cameras to make decisions in real-time, ensuring the safety of passengers while reducing traffic congestion and accidents.

Customer Service and Retail: AI is transforming the way businesses interact with their customers. Chatbots, virtual assistants, and AI-driven recommendation systems are enhancing customer service experiences. For example, e-commerce platforms like Amazon use AI to recommend products based on previous purchases, browsing behavior, and user preferences, increasing sales and customer satisfaction.

The Ethical Considerations of AI
While AI offers immense benefits, it also raises several ethical concerns. Some of the most pressing issues include:

Job Displacement: One of the major concerns surrounding AI is the potential for job loss, particularly in industries like manufacturing, transportation, and customer service. As AI systems take over more tasks, workers may find themselves displaced, and there will be a need for reskilling and upskilling to ensure that the workforce can adapt to new roles.

Bias and Fairness: AI systems learn from data, and if the data used to train these systems is biased, the outcomes can be discriminatory. For example, facial recognition technology has been shown to be less accurate at identifying people of color, leading to concerns about racial bias in AI systems. Ensuring that AI algorithms are fair and unbiased is crucial for their widespread adoption.

Privacy Concerns: AI systems rely on vast amounts of data, and much of this data is personal. The use of AI in surveillance, social media, and other applications raises concerns about how personal data is collected, stored, and used. Striking a balance between innovation and privacy will be key to the ethical use of AI.

Autonomy and Accountability: As AI systems become more autonomous, questions arise about who is responsible when things go wrong. If an autonomous vehicle causes an accident, for example, who is to blame? Is it the manufacturer, the developer, or the AI itself? As AI takes on more decision-making responsibilities, legal and regulatory frameworks will need to adapt.

The Future of AI
The future of AI is incredibly exciting, with endless possibilities for innovation. We are likely to see further advancements in areas like natural language processing, robotics, and machine learning. However, with these advancements come challenges, particularly in terms of regulation, ethical use, and public trust.

Governments, companies, and researchers must work together to ensure that AI is developed and deployed responsibly. AI has the potential to improve lives, create new opportunities, and address some of the world’s most pressing challenges. But as with any powerful technology, it must be harnessed with caution and foresight.

Conclusion
Artificial Intelligence is already changing the world, and its potential for future impact is limitless. From healthcare to transportation, finance to retail, AI is driving innovation and transforming industries. As we continue to explore AI's capabilities, it's crucial that we also consider the ethical implications to ensure it benefits society as a whole. The future of AI is not just about technology—it’s about creating a world where machines and humans work together to achieve greater outcomes.

In the end, AI isn’t just a tool; it’s a partner in shaping the future. By embracing its possibilities responsibly, we can unlock a future where AI helps humanity thrive.
"""

# Summarize the text above with tighter length bounds (30 to 50).
generated_summary = generate_summary(example_email, max_length=50, min_length=30)

# Blank line, heading, then the summary itself on its own line.
print("\nGenerated Summary:", generated_summary, sep="\n")


Generated Summary:
 Artificial Intelligence refers to the ability of machines to perform tasks that typically require human intelligence. In the financial sector, AI is used to analyze market trends, predict stock movements, and identify fraudulent activities. One of the most exciting applications of AI
