In [46]:
import pandas as pd
import re

In [47]:
df = pd.read_csv(r'dataset/emails.csv')

In [48]:
df.shape

(517401, 2)

In [49]:
# Sample 200,000 rows randomly
sampled_df = df.sample(n=200000, random_state=42)

# Save the sampled data for training
sampled_df.to_csv('dataset/sampled_enron_emails.csv', index=False)

In [50]:
sampled_df.shape

(200000, 2)

In [51]:
sampled_df.head(3)

Unnamed: 0,file,message
427616,shackleton-s/sent/1912.,Message-ID: <21013688.1075844564560.JavaMail.e...
108773,farmer-d/logistics/1066.,Message-ID: <22688499.1075854130303.JavaMail.e...
355471,parks-j/deleted_items/202.,Message-ID: <27817771.1075841359502.JavaMail.e...


In [52]:
def preprocess_text(text):
    # Extract the main message content by removing the headers
    message_start = re.search(r'\n\n', text)  # Look for the first double newline, which typically separates headers from the body
    if message_start:
        text = text[message_start.end():]
    
    # Perform further cleaning on the message body
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    text = re.sub(r'\n', ' ', text)  # Remove newlines
    text = re.sub(r'\w+@\w+\.\w+', '', text)  # Remove email addresses
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = text.strip()  # Remove leading/trailing whitespace
    return text

In [53]:
sampled_df['cleaned_text'] = sampled_df['message'].apply(preprocess_text)

In [54]:
# Initialize the category column
df['category'] = None

In [55]:
# Define the keywords for each category
keywords = {
    'support': ["check", "update", "error", "issue", "problem", "please", "forwarded"],
    'billing': ["invoice", "payment", "billing", "charge", "receipt"],
    'complaint': ["complain", "rip-off", "refund", "bad", "poor", "unhappy"],
    'feedback': ["thanks", "thank you", "info", "suggest", "idea", "opinion"]
}

In [56]:
# Function to label the emails based on keywords
def label_email(text):
    text = text.lower()
    for category, words in keywords.items():
        if any(word in text for word in words):
            return category
    return 'unlabeled'

In [57]:
sampled_df['category'] = sampled_df['cleaned_text'].apply(label_email)

In [58]:
sampled_df.to_csv('dataset/sampled_enron_emails.csv', index=False)

In [59]:
sampled_df['category'].value_counts()

category
support      127481
unlabeled     47536
feedback      20528
billing        2806
complaint      1649
Name: count, dtype: int64