In [1]:
import os
import json

# Mailbox root and the sub-folders whose messages are read.
base_path = r"C:\Users\DELL\Downloads\maildir\allen-p"
folders_to_load = ["inbox", "notes_inbox", "sent", "sent_items"]

# One record per message file: source folder, file name, stripped text.
all_emails = []

for folder in folders_to_load:
    folder_path = os.path.join(base_path, folder)

    # Report a missing folder and move on to the next one.
    if not os.path.exists(folder_path):
        print(f"Folder {folder_path} not found.")
        continue

    for entry_name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry_name)

        # Only regular files are read; sub-directories are skipped.
        if not os.path.isfile(entry_path):
            continue

        # Bytes that are not valid UTF-8 are dropped rather than raising.
        with open(entry_path, 'r', encoding='utf-8', errors='ignore') as handle:
            raw_text = handle.read()

        all_emails.append({
            "folder": folder,
            "filename": entry_name,
            "content": raw_text.strip(),
        })

print(f"Total emails loaded: {len(all_emails)}")


Total emails loaded: 1021


In [2]:
# Build one prompt/completion record per loaded email. The prompt names the
# source folder and file; the completion is the email text followed by " END".
alpaca_data = [
    {
        "prompt": f"Email from {email['folder']} folder, filename: {email['filename']}. Read the content below:\n",
        "completion": email['content'] + " END",
    }
    for email in all_emails
]

# Save the dataset as one JSON object per line.
output_file = r"C:\Users\DELL\Downloads\maildir\emails_alpaca_format.json"
with open(output_file, 'w', encoding='utf-8') as f_out:
    f_out.writelines(json.dumps(record) + '\n' for record in alpaca_data)

print(f"Dataset saved to {output_file}")


Dataset saved to C:\Users\DELL\Downloads\maildir\emails_alpaca_format.json


In [18]:
!pip install torch transformers datasets





[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
python.exe -m pip install --upgrade pip

In [5]:
import sys
!{sys.executable} -m pip install torch transformers datasets


Collecting datasets
  Using cached datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Using cached datasets-4.0.0-py3-none-any.whl (494 kB)
Installing collected packages: datasets
Successfully installed datasets-4.0.0


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Read the JSON Lines file written earlier as a single 'train' split.
dataset = load_dataset('json', data_files=output_file, split='train')

# Tokenizer matching the base checkpoint that is fine-tuned below.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token so that
# padding="max_length" can be used when tokenizing.
tokenizer.pad_token = tokenizer.eos_token

# Sanity check: show the first record of the loaded split.
print("First example:", dataset[0], sep="\n")


Generating train split: 0 examples [00:00, ? examples/s]

First example:
{'prompt': 'Email from inbox folder, filename: 1. Read the content below:\n', 'completion': 'Message-ID: <16159836.1075855377439.JavaMail.evans@thyme>\nDate: Fri, 7 Dec 2001 10:06:42 -0800 (PST)\nFrom: heather.dunton@enron.com\nTo: k..allen@enron.com\nSubject: RE: West Position\nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Dunton, Heather </O=ENRON/OU=NA/CN=RECIPIENTS/CN=HDUNTON>\nX-To: Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Pallen>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\Inbox\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\n \nPlease let me know if you still need Curve Shift.\n\nThanks,\nHeather\n -----Original Message-----\nFrom: \tAllen, Phillip K.  \nSent:\tFriday, December 07, 2001 5:14 AM\nTo:\tDunton, Heather\nSubject:\tRE: West Position\n\nHeather,\n\nDid you attach the file to this email?\n\n -----Original Message-----\nFrom: \tDunton, Heather 

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class EmailDataset(Dataset):
    """Dataset of tokenized ``prompt + completion`` strings.

    Every record of ``dataset`` is tokenized once, in ``__init__``, and the
    resulting tensors are kept in ``self.examples``.

    Args:
        dataset: Iterable of mappings with ``'prompt'`` and ``'completion'``
            string entries.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding="max_length")``
            whose result is indexable by ``'input_ids'`` and
            ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer's ``max_length``.
    """

    def __init__(self, dataset, tokenizer, max_length=512):
        def encode(record):
            # Prompt and completion are joined with no separator in between.
            encoded = tokenizer(
                record['prompt'] + record['completion'],
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            return {
                key: torch.tensor(encoded[key])
                for key in ('input_ids', 'attention_mask')
            }

        self.examples = [encode(record) for record in dataset]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the dict of ``input_ids`` / ``attention_mask`` tensors at ``idx``."""
        return self.examples[idx]

# Tokenize every record of the loaded split up front.
email_dataset = EmailDataset(dataset, tokenizer)

# Shuffle with an explicitly seeded generator so the batch order is the same
# on every fresh run of the notebook; batches hold a single example each.
SHUFFLE_SEED = 42
data_loader = DataLoader(
    email_dataset,
    batch_size=1,
    shuffle=True,
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)


In [5]:
from transformers import AutoModelForCausalLM

# Instantiate the causal language model from the pretrained checkpoint named
# by `model_name` (assigned in the tokenizer cell).
model = AutoModelForCausalLM.from_pretrained(model_name)


In [6]:
import torch.optim as optim

# AdamW over every parameter of the model, with a fixed learning rate.
LEARNING_RATE = 5e-5
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Put the model in training mode. As the cell's last expression, the returned
# module is also echoed as the cell output.
model.train()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Fine-tuning loop: one optimizer step per batch, for `epochs` passes over
# `data_loader`. Loss is printed for the first batch of each epoch and every
# 100th batch after that.
epochs = 3  # You can increase this if needed
device = torch.device("cpu")  # CPU setup

model.to(device)
# Ensure training mode here as well, so this cell does not depend on the
# mode left behind by whichever cell last touched the model.
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for batch_idx, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Every sequence is padded to a fixed length, so positions with
        # attention_mask == 0 are padding. Label them -100 so the loss ignores
        # them; otherwise the model is trained mostly to predict the pad token.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Clear gradients from the previous step before back-propagating.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % 100 == 0 or batch_idx == 0:
            print(f"Batch {batch_idx+1}, Loss: {loss.item():.4f}")
print("Training complete!")
