# Email Wizard Assistant: Data Preparation

This notebook demonstrates the data preparation process for the Email Wizard Assistant. We'll create a synthetic dataset of emails, preprocess them, and prepare them for embedding.

In [None]:
# Import necessary libraries
import os
import sys
import json
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Add the project root to the Python path.
# Path().resolve() is the current working directory, so this prepends the
# directory one level above it. Unless `src` is already importable some other
# way, the imports below therefore only resolve when the notebook is run from
# a folder that sits directly under the project root.
sys.path.insert(0, str(Path().resolve().parent))

# Import project modules
from src.data.dataset import create_synthetic_dataset, load_dataset, split_dataset
from src.data.preprocessing import preprocess_emails, save_preprocessed_emails

## 1. Create Synthetic Dataset

We'll create a synthetic dataset of 60 emails with diverse topics, senders, and content.

In [None]:
# Ensure both output directories are present; exist_ok makes re-running safe.
for data_dir in ("../data/raw", "../data/processed"):
    os.makedirs(data_dir, exist_ok=True)

# Build the synthetic emails. preprocess=False keeps them raw here because
# preprocessing is carried out manually afterwards for demonstration.
# NOTE(review): output_path is presumably where the helper writes the raw
# emails -- confirm against src.data.dataset.
emails = create_synthetic_dataset(
    num_emails=60,
    output_path="../data/raw/synthetic_emails.json",
    preprocess=False,
)

print(f"Created {len(emails)} synthetic emails")

## 2. Explore the Dataset

Let's examine the synthetic emails to understand their structure and content.

In [None]:
# Load the email records into a DataFrame so they are easier to inspect
emails_df = pd.DataFrame(emails)

# Report the table dimensions followed by one line per column name
print("Dataset shape:", emails_df.shape)
print("\nColumns:")
for column_name in emails_df.columns:
    print(f"- {column_name}")

# Promote selected metadata entries to their own columns.
# A missing key falls back to an empty string; `key=field` binds the current
# field name into the lambda so each column reads its own key.
for field in ('subject', 'sender', 'recipient', 'topic'):
    emails_df[field] = emails_df['metadata'].apply(
        lambda meta, key=field: meta.get(key, '')
    )

# Count how many emails fall under each topic
topic_counts = emails_df['topic'].value_counts()
print("\nTopic distribution:")
print(topic_counts)

# Draw the topic counts as a bar chart on an explicit figure/axes pair,
# keeping the x tick labels horizontal (rot=0)
fig, ax = plt.subplots(figsize=(10, 6))
topic_counts.plot(kind='bar', ax=ax, rot=0)
ax.set_title('Email Topic Distribution')
ax.set_xlabel('Topic')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

Let's examine a sample email to understand its structure:

In [None]:
# Choose which email to inspect; the index is kept in its own variable so
# the same position can be looked up again further down the notebook.
sample_email_idx = 0
sample_email = emails[sample_email_idx]

# Identifier first, then the metadata pretty-printed as indented JSON
print(f"Email ID: {sample_email['id']}")
sample_metadata = sample_email['metadata']
print(f"Metadata: {json.dumps(sample_metadata, indent=2)}")

# Finally the email body, preceded by a blank line and a heading
print("\nContent:")
sample_content = sample_email['content']
print(sample_content)

## 3. Preprocess the Emails

Now, let's preprocess the emails to clean and normalize the text, extract metadata, and chunk long emails if necessary.

In [None]:
# Run the project's preprocessing over the emails created earlier
processed_emails = preprocess_emails(emails)

# Persist the result under the processed-data directory
processed_output_path = "../data/processed/processed_emails.json"
save_preprocessed_emails(processed_emails, processed_output_path)

print(f"Preprocessed and saved {len(processed_emails)} emails")

Let's examine a preprocessed email to see the changes:

In [None]:
# Look up the preprocessed record at the same index as the raw sample above
sample_processed_email = processed_emails[sample_email_idx]

# Identifier and metadata (metadata rendered as indented JSON)
print(f"Email ID: {sample_processed_email['id']}")
processed_metadata = sample_processed_email['metadata']
print(f"Metadata: {json.dumps(processed_metadata, indent=2)}")

# Raw text followed by its cleaned counterpart
print("\nRaw Text:")
print(sample_processed_email['raw_text'])
print("\nCleaned Text:")
print(sample_processed_email['cleaned_text'])

# Each chunk gets a 1-based heading and is followed by one blank line
print("\nChunks:")
for chunk_number, chunk_text in enumerate(sample_processed_email['chunks'], start=1):
    print(f"Chunk {chunk_number}:")
    print(chunk_text, end="\n\n")

## 4. Split the Dataset

Let's split the dataset into training (80%), validation (10%), and test (10%) sets for model evaluation, passing a fixed random seed so the split is reproducible.

In [None]:
# Share of the data assigned to each subset (80% / 10% / 10%)
split_ratios = {
    "train_ratio": 0.8,
    "val_ratio": 0.1,
    "test_ratio": 0.1,
}

# Partition the preprocessed emails; random_seed=42 is passed as a fixed seed
train_emails, val_emails, test_emails = split_dataset(
    processed_emails,
    random_seed=42,
    **split_ratios,
)

# Report the size of every subset
for subset_label, subset in (
    ("Train", train_emails),
    ("Validation", val_emails),
    ("Test", test_emails),
):
    print(f"{subset_label} set: {len(subset)} emails")

## 5. Save the Split Dataset

Let's save the split dataset for later use.

In [None]:
# Make sure the destination folder for the split files exists
os.makedirs("../data/processed/split", exist_ok=True)

# Pair each subset with its output file, in train -> validation -> test order
split_outputs = (
    (train_emails, "../data/processed/split/train_emails.json"),
    (val_emails, "../data/processed/split/val_emails.json"),
    (test_emails, "../data/processed/split/test_emails.json"),
)

# Write every subset with the same helper used for the full processed set
for subset, destination in split_outputs:
    save_preprocessed_emails(subset, destination)

print("Saved split dataset")

## 6. Summary

In this notebook, we've:

1. Created a synthetic dataset of 60 emails with diverse topics and content
2. Explored the dataset to understand its structure and distribution
3. Preprocessed the emails to clean and normalize the text and to chunk long emails where necessary
4. Split the dataset into training, validation, and test sets
5. Saved the processed and split dataset for later use

The preprocessed emails are now ready for embedding and retrieval in the next notebook.