In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
import torch
from transformers import pipeline, PegasusForConditionalGeneration, PegasusTokenizer
import pandas as pd

# Load the model and tokenizer
model_name = "google/pegasus-cnn_dailymail"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Initialize the summarization pipeline with GPU support.
# Without an explicit `device` the pipeline leaves the model on the CPU even when
# a GPU is present, so select the first CUDA device when one is available and
# fall back to the CPU (-1) otherwise.
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Presumably refers to the pipeline's `device` argument: device=0 selects the first GPU.

def summarize_text(text, prompt="Summarize the following email by focusing on the main actions, suggestions, and key points for brevity and clarity:"):
    """
    Summarize the given text using the PEGASUS summarization pipeline.

    Args:
        text (str): The input text to summarize.
        prompt (str): Instruction text prepended to the input before it is
            handed to the pipeline.

    Returns:
        str: The summary with '<n>' markers replaced by spaces, or an empty
            string when `text` is not a non-blank string.
    """
    # Missing values (e.g. NaN coming from a DataFrame column) and blank strings
    # have nothing to summarize; skip the model call instead of summarizing junk.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Combine the prompt with the input text.
    # NOTE(review): the notebook's own output shows many summaries that begin
    # with the prompt text, so the model appears to treat the prompt as part of
    # the document rather than as an instruction — confirm the prompt is wanted.
    input_text = f"{prompt}\n\n{text}"

    # The pipeline tokenizes the raw text itself, so no separate tokenizer call
    # is needed. `truncation=True` cuts over-long inputs down to the model's
    # maximum input length instead of failing on them.
    summary = summarizer(
        input_text,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,
    )

    # The generated text can contain '<n>' markers; replace them with spaces.
    cleaned_summary = summary[0]['summary_text'].replace('<n>', ' ').strip()

    return cleaned_summary

# Example usage
# The backslashes in the X-Folder header are doubled so the literal contains no
# invalid escape sequences; the resulting string is the same as before.
input_text = """
Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>
Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)
From: phillip.allen@enron.com
To: john.lavorato@enron.com
Subject: Re:
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>
X-cc:
X-bcc:
X-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.

My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.
"""

# Print the summary with the prompt
print(summarize_text(input_text))


John Lavorato replies to Phillip Allen's email about a business trip . Allen suggests holding the business plan meetings here then take a trip without any formal business meetings .  Lavorato suggests playing golf and renting a ski boat and jet ski's .


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import warnings

# Set random seed for reproducibility: generation with this pipeline samples
# (do_sample=True), so seed torch's random number generator.
torch.manual_seed(42)

# Load the model and tokenizer
model_name_phi3 = "microsoft/Phi-3-mini-128k-instruct"
model_phi3 = AutoModelForCausalLM.from_pretrained(model_name_phi3, torch_dtype="auto", trust_remote_code=True)
tokenizer_phi3 = AutoTokenizer.from_pretrained(model_name_phi3)

# Silence every UserWarning raised from this point on (this filter is not
# limited to tokenizer warnings).
warnings.filterwarnings("ignore", category=UserWarning)

# Define the pipeline
pipe_phi3 = pipeline(
    "text-generation",
    model=model_phi3,
    tokenizer=tokenizer_phi3,
)
# Function to generate a paragraph-style summary
def generate_summary_phi3(input_text, prompt="Summarize the email in a concise paragraph:"):
    """
    Produce a paragraph summary of `input_text` with the Phi-3 text-generation pipeline.

    Args:
        input_text (str): Text to summarize (e.g., an email).
        prompt (str): Instruction placed ahead of the text.

    Returns:
        str: The text following the summary delimiter in the model output, or
            an "Error during generation: ..." message if anything raised.
    """
    marker = "### Paragraph Summary:"

    # The trailing delimiter shows the model where its answer should begin
    full_prompt = f"{prompt}\n\n{input_text}\n\n### Paragraph Summary:"

    try:
        outputs = pipe_phi3(
            full_prompt,
            max_new_tokens=200,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
        )
        generated_text = outputs[0]["generated_text"]

        if marker not in generated_text:
            # No delimiter in the output: fall back to the whole text
            summary = generated_text.strip()
        else:
            # Keep the segment right after the first delimiter
            summary = generated_text.split(marker, 2)[1].strip()

        # Discard everything from "Email:" onward (a no-op when it is absent)
        return summary.partition("Email:")[0].strip()
    except Exception as exc:
        return f"Error during generation: {exc}"

# Example usage
# The backslashes in the X-Folder header are doubled so the literal contains no
# invalid escape sequences; the resulting string is the same as before.
input_text = """
Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>
Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)
From: phillip.allen@enron.com
To: john.lavorato@enron.com
Subject: Re:
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>
X-cc:
X-bcc:
X-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Traveling to have a business meeting takes the fun out of the trip. Especially if you have to prepare a presentation. I would suggest holding the business plan meetings here then take a trip without any formal business meetings. I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not. Too often the presenter speaks and the others are quiet just waiting for their turn. The meetings might be better if held in a round table discussion format.

My suggestion for where to go is Austin. Play golf and rent a ski boat and jet ski's. Flying somewhere takes too much time.
"""

# Generate the paragraph-style summary
description = generate_summary_phi3(input_text)

# Print only the summary
print(description)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Checkpoint identifier for the h2o-danube3-4b-chat model
model_name_h2o = "h2oai/h2o-danube3-4b-chat"

# Fetch the tokenizer and the causal language model weights for that checkpoint
tokenizer_h2o = AutoTokenizer.from_pretrained(model_name_h2o)
model_h2o = AutoModelForCausalLM.from_pretrained(model_name_h2o)

# Wrap both in a text-generation pipeline
pipe_h2o = pipeline(
    "text-generation",
    model=model_h2o,
    tokenizer=tokenizer_h2o,
)

# Function to generate a single-paragraph summary
def generate_summary_h2o(input_text, prompt="Summarize the email in a concise paragraph:"):
    """
    Produce a single-paragraph summary of `input_text` with the h2o text-generation pipeline.

    Args:
        input_text (str): Text to summarize (e.g., an email).
        prompt (str): Instruction placed ahead of the text.

    Returns:
        str: The text following the summary delimiter in the model output, or
            an "Error during generation: ..." message if anything raised.
    """
    marker = "### Paragraph Summary:"

    # Prompt, then the text, then the delimiter the answer is expected to follow
    full_prompt = f"{prompt}\n\n{input_text}\n\n### Paragraph Summary:"

    try:
        outputs = pipe_h2o(
            full_prompt,
            max_new_tokens=150,  # upper bound on the number of generated tokens
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
        )
        generated_text = outputs[0]["generated_text"]

        if marker not in generated_text:
            # No delimiter in the output: fall back to the whole text
            summary = generated_text.strip()
        else:
            # Keep the segment right after the first delimiter
            summary = generated_text.split(marker, 2)[1].strip()

        # Discard everything from "Message-ID" onward (a no-op when it is absent)
        return summary.partition("Message-ID")[0].strip()
    except Exception as exc:
        return f"Error during generation: {exc}"

# Example usage
# The backslashes in the X-Folder header are doubled so the literal contains no
# invalid escape sequences; the resulting string is the same as before.
input_text = """
Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>
Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)
From: phillip.allen@enron.com
To: john.lavorato@enron.com
Subject: Re:
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>
X-cc:
X-bcc:
X-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Traveling to have a business meeting takes the fun out of the trip. Especially if you have to prepare a presentation. I would suggest holding the business plan meetings here then take a trip without any formal business meetings. I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not. Too often the presenter speaks and the others are quiet just waiting for their turn. The meetings might be better if held in a round table discussion format.

My suggestion for where to go is Austin. Play golf and rent a ski boat and jet ski's. Flying somewhere takes too much time.
"""

# Generate the paragraph summary
paragraph_summary = generate_summary_h2o(input_text)

# Print only the summary
print(paragraph_summary)


In [None]:
# Run summarize_text on every value of the 'message' column and store the
# results in a new 'resume' column.
# NOTE(review): `data` is not defined in the cells shown here — presumably a
# DataFrame of emails loaded elsewhere; confirm it exists before this cell runs.
data['resume'] = data['message'].apply(summarize_text)


In [None]:
# Run generate_summary_h2o on every value of the 'message' column and store the
# results in a new 'resume_h2o' column.
# NOTE(review): `data` is not defined in the cells shown here — presumably a
# DataFrame of emails loaded elsewhere; confirm it exists before this cell runs.
data['resume_h2o'] = data['message'].apply(generate_summary_h2o)


In [None]:
# Run generate_summary_phi3 on every value of the 'message' column and store the
# results in a new 'resume_phi3' column.
# NOTE(review): `data` is not defined in the cells shown here — presumably a
# DataFrame of emails loaded elsewhere; confirm it exists before this cell runs.
data['resume_phi3'] = data['message'].apply(generate_summary_phi3)


In [None]:
# Display the frame as the cell's output (the notebook renders a truncated view).
data

Unnamed: 0,file,message,resume
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Summarize the following email by focusing on t...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,I would suggest holding the business plan meet...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,Summarize the following email by focusing on t...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,Summarize the following email by focusing on t...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Summarize the following email by focusing on t...
...,...,...,...
95,allen-p/_sent_mail/180.,Message-ID: <29919154.1075855689201.JavaMail.e...,I think crude price are undervalued by the tun...
96,allen-p/_sent_mail/181.,Message-ID: <4511963.1075855689223.JavaMail.ev...,Summarize the following email by focusing on t...
97,allen-p/_sent_mail/182.,Message-ID: <33111317.1075855689245.JavaMail.e...,"""Lucy Gonzalez"": ""The a/c I bought today for #..."
98,allen-p/_sent_mail/183.,Message-ID: <1665326.1075855689266.JavaMail.ev...,Summarize the following email by focusing on t...
