In [1]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel rather than whatever `pip` the shell finds.
%pip install transformers torch datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [4]:
import kagglehub

# Fetch the latest version of the Enron email dataset from Kaggle.
dataset_handle = "wcukierski/enron-email-dataset"
path = kagglehub.dataset_download(dataset_handle)

# Show the directory that the download call returned.
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/wcukierski/enron-email-dataset?dataset_version_number=2...


100%|██████████| 358M/358M [00:02<00:00, 150MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/wcukierski/enron-email-dataset/versions/2


In [6]:
import os

import pandas as pd

# Load emails from the CSV file in the dataset directory.
# The location is derived from `path` (the directory returned by
# kagglehub.dataset_download) instead of a hard-coded absolute cache path, so
# the cell keeps working when the dataset version or home directory changes.
data_path = os.path.join(path, "emails.csv")
df = pd.read_csv(data_path)

# Inspect the first few rows to understand the structure
print(df.head())
print(df.shape)



                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...
(517401, 2)


In [3]:
def prepare_summarization_data(text: str) -> str:
    """Build the model input for one email by prepending the task prefix.

    Args:
        text: Raw text to be summarized.

    Returns:
        ``text`` formatted into a string that starts with ``"summarize: "``.
    """
    prefixed = f"summarize: {text}"
    return prefixed

def _first_half(message):
    """Return the leading half of ``message`` (length rounded down)."""
    return message[: len(message) // 2]


# Model input: the raw message with the "summarize: " prefix added.
df['input_text'] = df['message'].apply(prepare_summarization_data)

# Training target: the first half of the raw message.
# NOTE(review): this is a truncation of the raw message (which starts with the
# mail headers), not a written summary — confirm this is the intended target.
df['target_text'] = df['message'].apply(_first_half)

# Keep only the two columns used downstream.
dataset = df[['input_text', 'target_text']]


In [10]:
from transformers import T5Tokenizer

# Load the tokenizer published with the "t5-small" checkpoint.
checkpoint_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)

# Function to tokenize data
def tokenize_data(example, max_input_length=512, max_target_length=50):
    """Tokenize one input/target pair into fixed-length training features.

    Args:
        example: Mapping (for instance a DataFrame row) providing the
            'input_text' and 'target_text' entries.
        max_input_length: Length the input ids are truncated/padded to.
        max_target_length: Length the label ids are truncated/padded to.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels'. Padding
        positions in 'labels' are replaced by -100.
    """
    input_encodings = tokenizer(
        example['input_text'],
        truncation=True,
        padding="max_length",
        max_length=max_input_length,
    )
    target_encodings = tokenizer(
        example['target_text'],
        truncation=True,
        padding="max_length",
        max_length=max_target_length,
    )

    # Label positions that are only padding must not contribute to the loss.
    # Hugging Face seq2seq models ignore label positions set to -100, so the
    # pad token id is swapped for that value.
    pad_id = tokenizer.pad_token_id
    labels = [
        -100 if token_id == pad_id else token_id
        for token_id in target_encodings['input_ids']
    ]

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels
    }

# Select the first 10,000 data points.
# (The previous comment said 1000, but the slice has always taken 10000 rows;
# the row count is kept and given a name so comment and code cannot drift.)
NUM_SAMPLES = 10_000
dataset_subset = dataset[:NUM_SAMPLES]

# Apply tokenization: each row is passed to tokenize_data (axis=1).
tokenized_dataset = dataset_subset.apply(tokenize_data, axis=1)


In [12]:
from transformers import Seq2SeqTrainingArguments

import torch

# Hyperparameters for seq2seq fine-tuning; checkpoints are written to ./results.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # Mixed precision training is a GPU feature: requesting it unconditionally
    # makes argument construction fail on a CPU-only runtime, so it is enabled
    # only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)




In [13]:
# Smoke-test summarization on a short placeholder email.
test_email = "Long email text here for testing..."
prompt = f"summarize: {test_email}"
inputs = tokenizer(prompt, return_tensors="pt")

# NOTE(review): `model` is not defined in any cell visible here — confirm it is
# created before this cell runs.
summary_ids = model.generate(
    inputs['input_ids'],
    max_length=50,
    length_penalty=0.8,
    early_stopping=True,
)

# Turn the first generated sequence back into text, dropping special tokens.
summarized_email = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summarized Email:", summarized_email)




Summarized Email: email text for testing...


In [14]:
def footprint_score(text: str) -> int:
    """Return the size of ``text`` in bytes when encoded as UTF-8."""
    utf8_bytes = text.encode('utf-8')
    return len(utf8_bytes)

# UTF-8 byte size of the current `summarized_email`.
score = footprint_score(summarized_email)
print("Footprint score:", score)

Footprint score: 25


In [15]:
# Multi-paragraph sample email used as the input of the summarization call below.
test_email = """Subject: Project Update and Next Steps Body:

Dear Team,

I hope this message finds you well. I wanted to provide an update on the recent developments in our project and outline the next steps for everyone involved. We’ve made considerable progress over the last few weeks, and I’d like to thank each of you for your dedication and hard work.

First, we have successfully completed Phase 1, which included market research and initial data collection. Our findings have shown promising insights that will inform the strategies we adopt in the subsequent phases. The data indicates a clear demand for our proposed solution, and it has helped us refine our target audience.

Moving into Phase 2, our immediate objectives are as follows:

Refine the product prototype based on the feedback gathered during Phase 1.
Conduct a series of usability tests to identify potential improvements.
Collaborate with the marketing team to begin outlining our initial outreach campaign.
For the usability tests, I’d like to remind everyone to document their findings in the shared project folder. Please include any relevant screenshots, participant feedback, and usability scores, as this will be invaluable for our development team. The testing phase is scheduled to last for two weeks, starting next Monday.

Additionally, I want to ensure that everyone is on the same page regarding our upcoming deadlines. Here’s a quick rundown of our timeline:

Prototype Refinement: Complete by the end of next week.
Usability Testing: Conducted over the following two weeks.
Initial Outreach Campaign Plan: Draft ready for review by the end of the month.
Please let me know if you have any questions or need further clarification on any of the points mentioned. I appreciate all your efforts and am confident that we’re on the right path to achieve our goals. Let’s continue to work together and keep up the momentum!

Thank you, and looking forward to our next meeting on Friday.

Best regards,
[Your Name]
Project Manager """
# Summarize the multi-paragraph sample email held in `test_email`.
inputs = tokenizer(f"summarize: {test_email}", return_tensors="pt")

# Generation settings are collected in one place and then unpacked into generate().
generation_kwargs = dict(max_length=50, length_penalty=0.8, early_stopping=True)
summary_ids = model.generate(inputs['input_ids'], **generation_kwargs)

# Decode the first generated sequence, leaving out special tokens.
summarized_email = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summarized Email:", summarized_email)


Summarized Email: we have completed Phase 1, which included market research and initial data collection. the data indicates a clear demand for our proposed solution.


In [16]:
# UTF-8 byte size of the current `summarized_email`.
score = footprint_score(summarized_email)
print("Footprint score:", score)

Footprint score: 147


In [17]:
# Write the model and the tokenizer files into the same directory.
export_dir = "/content/t5_model"
model.save_pretrained(export_dir)
# Kept as the final expression of the cell so its return value is displayed.
tokenizer.save_pretrained(export_dir)

('/content/t5_model/tokenizer_config.json',
 '/content/t5_model/special_tokens_map.json',
 '/content/t5_model/spiece.model',
 '/content/t5_model/added_tokens.json')

In [18]:
# Recursively archive the saved model directory into t5_model.zip
# (relative path, so the archive lands in the current working directory).
!zip -r t5_model.zip /content/t5_model

  adding: content/t5_model/ (stored 0%)
  adding: content/t5_model/added_tokens.json (deflated 83%)
  adding: content/t5_model/config.json (deflated 62%)
  adding: content/t5_model/special_tokens_map.json (deflated 85%)
  adding: content/t5_model/spiece.model (deflated 48%)
  adding: content/t5_model/generation_config.json (deflated 30%)
  adding: content/t5_model/tokenizer_config.json (deflated 94%)
  adding: content/t5_model/model.safetensors (deflated 53%)


In [19]:
from google.colab import files
# Hand the archive to the Colab file helper for download.
archive_name = "t5_model.zip"
files.download(archive_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>