In [1]:
from google.colab import drive

# This will prompt you to authorize access
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import re
import pandas as pd
from typing import List, Dict, Any
from datetime import datetime
import html

Dataset Definition

In [3]:

file_path = '/content/drive/My Drive/NLP_DATASET/email_thread_summaries.csv'
email_thread_summaries_dataset = pd.read_csv(file_path)

# Check the first 5 rows
email_thread_summaries_dataset.head()


Unnamed: 0,thread_id,summary
0,1,The email thread discusses the Master Terminat...
1,2,A lunch meeting has been scheduled for May 5th...
2,3,Ben is updating a friend on his progress with ...
3,4,The recipient of the email thread initially ex...
4,5,The email thread discusses the long form confi...


In [4]:

file_path = '/content/drive/My Drive/NLP_DATASET/email_thread_details.csv'
email_thread_details_dataset = pd.read_csv(file_path)

# Check the first 5 rows
email_thread_details_dataset.head()

Unnamed: 0,thread_id,subject,timestamp,from,to,body
0,1,FW: Master Termination Log,2002-01-29 11:23:42,"Gossett, Jeffrey C. JGOSSET","['Giron', 'Darron C. Dgiron', 'Love', 'Phillip...",\n\n -----Original Message-----\nFrom: =09Ther...
1,1,FW: Master Termination Log,2002-01-31 12:50:00,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Gossett', 'Jeff...",\n\n -----Original Message-----\nFrom: =09Panu...
2,1,FW: Master Termination Log,2002-02-05 15:03:35,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Anderson', 'Dia...",Note to Stephanie Panus....\n\nStephanie...ple...
3,1,FW: Master Termination Log,2002-02-05 15:06:25,"Theriot, Kim S. KTHERIO","['Hall', 'D. Todd Thall', 'Sweeney', 'Kevin Ks...",\n\n -----Original Message-----\nFrom: =09Panu...
4,1,FW: Master Termination Log,2002-05-28 07:20:35,"Kelly, Katherine L. KKELLY","['Germany', 'Chris Cgerman']",\n\n -----Original Message-----\nFrom: =09McMi...


In [5]:
len(email_thread_details_dataset)

21684

Explore the Datasets

In [6]:
print(email_thread_details_dataset['timestamp'].dtype)

object


In [7]:
email_thread_details_dataset['timestamp']=pd.to_datetime(email_thread_details_dataset['timestamp'])

In [8]:
FilteredDataset= email_thread_details_dataset[(email_thread_details_dataset['thread_id'] ==27)].sort_values(by='timestamp', ascending=True)
FilteredDataset.head(20)

Unnamed: 0,thread_id,subject,timestamp,from,to,body
147,27,RE: Admission Visit,2000-02-10 06:36:00,Benjamin Rogers,['Meg Brooks <Meg.Brooks@bus.utexas.edu'],Thanks for the fast reply. I changed it onlin...
148,27,RE: Admission Visit,2000-02-21 11:23:00,Benjamin Rogers,['Meg Brooks <Meg.Brooks@bus.utexas.edu> @ ENR...,Thanks for the informative infomation session...
149,27,RE: Admission Visit,2000-03-05 23:49:00,Benjamin Rogers,['Meg Brooks <Meg.Brooks@bus.utexas.edu> @ ENR...,Meg:\nI was wondering if you are able to give ...
150,27,RE: Admission Visit,2000-03-06 04:48:00,Benjamin Rogers,['Meg Brooks <Meg.Brooks@bus.utexas.edu> @ ENR...,Thanks for your fast response. I really hope ...
151,27,RE: Admission Visit,2000-03-07 11:10:00,Benjamin Rogers,['Meg Brooks <Meg.Brooks@bus.utexas.edu> @ ENR...,Meg:\nI would like to make sure that I have fu...


Data Cleaning

In [9]:
# email_thread_details_dataset = email_thread_details_dataset.sort_values(by='timestamp').drop_duplicates(
#         subset=['thread_id', 'from', 'timestamp','to'],
#         keep='first'
#     )

In [10]:
def threads_preprocess(email_thread_details_dataset):

  # 1. Normalize subject
  # ------------------------------
  email_thread_details_dataset['subject'] = (
      email_thread_details_dataset['subject']
      .str.replace(r'^\s*((re|fw|fwd)\s*:\s*)+', '', regex=True, case=False)
      .str.strip()
  )

  # 2. Convert timestamp and sort
  # ------------------------------
  email_thread_details_dataset['timestamp'] = pd.to_datetime(email_thread_details_dataset['timestamp'])
  email_thread_details_dataset = email_thread_details_dataset.sort_values(['thread_id', 'timestamp'])

  # ------------------------------
  # 3. De-duplicate by sender+timestamp+recipient
  # ------------------------------
  email_thread_details_dataset = email_thread_details_dataset.drop_duplicates(
      subset=['thread_id', 'from', 'timestamp', 'to'],
      keep='first'
  )

  # ------------------------------
  # 4. Remove threads with only repeated content
  # ------------------------------
  threads = email_thread_details_dataset.groupby('thread_id', group_keys=False)
  email_thread_details_dataset = threads.filter(lambda x: x['body'].nunique() > 1)

  # 7. Anonymization
  # ------------------------------
  # def anonymize_text(text):
  #     # Keep only first names
  #     text = re.sub(r'\b([A-Z][a-z]+)\s+[A-Z][a-z]+\b', r'\1', text)
  #     # Remove sensitive words
  #     text = re.sub(r'\b(password|pwd|confidential)\b', '', text, flags=re.I)
  #     # Replace email, phone, URL, IP, path, numbers
  #     text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', 'USERNAME@DOMAIN.COM', text)
  #     text = re.sub(r'\b\d{10,}\b', 'PHONENUMBER', text)
  #     text = re.sub(r'\b\d+\b', 'NUMBER', text)
  #     text = re.sub(r'http[s]?://\S+', 'HTTP://LINK', text)
  #     text = re.sub(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', 'IPADDRESS', text)
  #     text = re.sub(r'(?:[A-Za-z]:)?[/\\][\w/\\.-]+', 'PATH', text)
  #     return text

  # email_thread_details_dataset['body'] = email_thread_details_dataset['body'].apply(anonymize_text)


  #email_thread_details_dataset=email_thread_details_dataset.groupby("thread_id")

  return email_thread_details_dataset




In [11]:
class EmailBodyPreprocessor:
    def __init__(self):
        # Common email patterns
        self.forward_patterns = [
            r'----- Forwarded by .*? on \d{2}/\d{2}/\d{4}.*?-----',
            r'-----Original Message-----',
            r'----- Forwarded Message -----',
            r'From:.*?Sent:.*?To:.*?Subject:',
        ]

        self.signature_patterns = [
            r'\nStephanie Panus.*?\d{3}\.\d{3}\.\d{4}',
            r'\nBrian.*?\n\n',
            r'\nThanks,?\n.*',
            r'\nBest regards,?\n.*',
            r'\nSincerely,?\n.*',
            r'\nRegards,?\n.*',
            r'\n-\s*\n.*',
            r'ph:\s*\d{3}\.\d{3}\.\d{4}.*?fax:\s*\d{3}\.\d{3}\.\d{4}',
        ]

        # Email header patterns
        self.header_patterns = [
            r'From:\s*(.*?)\n',
            r'Sent:\s*(.*?)\n',
            r'To:\s*(.*?)\n',
            r'Cc:\s*(.*?)\n',
            r'Subject:\s*(.*?)\n',
        ]

    def clean_encoding_artifacts(self, text: str) -> str:
        """Clean encoding issues from emails."""
        # Remove =09, =20, etc.
        text = re.sub(r'=\d{2}', ' ', text)
        # Fix line breaks with = at end
        text = re.sub(r'=\n', '', text)
        # Remove non-breaking spaces and other special chars
        text = re.sub(r'[\xa0\u200b\u200c\u200d]', ' ', text)
        # HTML entities
        text = html.unescape(text)
        return text

    def extract_email_headers(self, text: str) -> Dict[str, Any]:
        """Extract email header information from body."""
        headers = {
            'from': None,
            'sent_date': None,
            'to': [],
            'cc': [],
            'subject': None,
            'is_forwarded': False,
            'is_replied': False
        }

        # Check if this is forwarded/replied email
        if '-----Original Message-----' in text or '----- Forwarded by' in text:
            headers['is_forwarded'] = True

        if 'Re:' in text[:100] or 'RE:' in text[:100] or 'Fwd:' in text[:100] or 'FW:' in text[:100]:
            headers['is_replied'] = True

        # Try to extract headers
        for pattern in self.header_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                key = pattern.split(':')[0].lower()
                value = match.group(1).strip()
                if key == 'from':
                    headers['from'] = value
                elif key == 'sent':
                    headers['sent_date'] = value
                elif key == 'to':
                    # Split multiple recipients
                    recipients = re.split(r'[;,]\s*', value)
                    headers['to'] = [r.strip() for r in recipients if r.strip()]
                elif key == 'cc':
                    recipients = re.split(r'[;,]\s*', value)
                    headers['cc'] = [r.strip() for r in recipients if r.strip()]
                elif key == 'subject':
                    headers['subject'] = value

        return headers

    def remove_quoted_text(self, text: str) -> str:
        """Remove quoted/replied text from email body."""
        lines = text.split('\n')
        cleaned_lines = []
        in_quoted_section = False
        quote_depth = 0

        for line in lines:
            # Check for forwarded/quote markers
            if any(pattern in line for pattern in [
                '-----Original Message-----',
                '----- Forwarded by',
                'From: ',
                'Sent: ',
                'To: ',
                'Subject: '
            ]):
                if '-----Original Message-----' in line or '----- Forwarded by' in line:
                    in_quoted_section = True
                    quote_depth += 1
                continue

            # Check for email header lines in quotes
            if in_quoted_section:
                if line.strip() == '' and quote_depth == 1:
                    # Empty line might end the header section
                    continue
                elif re.match(r'^\s*On.*wrote:$', line):
                    # Common reply pattern
                    continue
                elif line.strip().startswith('>') or line.strip().startswith('|'):
                    # Quoted text markers
                    continue
                elif quote_depth > 0 and not line.strip():
                    # Decrease depth on empty lines in quotes
                    quote_depth -= 1
                    if quote_depth == 0:
                        in_quoted_section = False
                    continue

            if not in_quoted_section:
                cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)

    def remove_signatures(self, text: str) -> str:
        """Remove email signatures."""
        for pattern in self.signature_patterns:
            text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)

        # Remove common closing patterns
        closing_patterns = [
            r'\n\s*--\s*\n.*',
            r'\n\s*---\s*\n.*',
            r'\nSent from my.*',
            r'\nConfidentiality Notice.*',
        ]

        for pattern in closing_patterns:
            text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)

        return text

    def clean_email_body(self, text: str) -> str:
        """Main cleaning function for email body."""
        if not isinstance(text, str):
            return ""

        # Step 1: Clean encoding artifacts
        text = self.clean_encoding_artifacts(text)

        # Step 2: Extract headers (store separately if needed)
        headers = self.extract_email_headers(text)

        # Step 3: Remove quoted/replied text
        text = self.remove_quoted_text(text)

        # Step 4: Remove signatures
        text = self.remove_signatures(text)

        # Step 5: Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\n\s*\n', '\n\n', text)  # Preserve paragraph breaks

        # Step 6: Clean up common email artifacts
        text = re.sub(r'\s*<\s*File:.*?>\s*', ' [ATTACHMENT] ', text)
        text = re.sub(r'\[.*?@.*?\]', '', text)

        # Step 7: Trim and return
        text = text.strip()

        return text


    def preprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the entire dataframe."""
        processed_df = df.copy()

        processed_df['body'] = processed_df['body'].apply(self.clean_email_body)

        return processed_df

    def preprocess_threads(self, df: pd.Series) -> pd.Series:
        """Preprocess the entire dataframe."""
        processed_df = df.copy()

        processed_df = processed_df.apply(self.clean_email_body)

        return processed_df


def clean_email(df_email_details):
    df = df_email_details.copy()

    preprocessor = EmailBodyPreprocessor()

    preprocess_df = preprocessor.preprocess_dataframe(df)

    return preprocess_df

def clean_thread_series(thread):

    preprocessor = EmailBodyPreprocessor()

    preprocess_df = preprocessor.preprocess_threads(thread)

    return preprocess_df

In [12]:
cleaned_threads = threads_preprocess(email_thread_details_dataset)
cleaned_threads

Unnamed: 0,thread_id,subject,timestamp,from,to,body
0,1,Master Termination Log,2002-01-29 11:23:42,"Gossett, Jeffrey C. JGOSSET","['Giron', 'Darron C. Dgiron', 'Love', 'Phillip...",\n\n -----Original Message-----\nFrom: =09Ther...
1,1,Master Termination Log,2002-01-31 12:50:00,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Gossett', 'Jeff...",\n\n -----Original Message-----\nFrom: =09Panu...
2,1,Master Termination Log,2002-02-05 15:03:35,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Anderson', 'Dia...",Note to Stephanie Panus....\n\nStephanie...ple...
3,1,Master Termination Log,2002-02-05 15:06:25,"Theriot, Kim S. KTHERIO","['Hall', 'D. Todd Thall', 'Sweeney', 'Kevin Ks...",\n\n -----Original Message-----\nFrom: =09Panu...
4,1,Master Termination Log,2002-05-28 07:20:35,"Kelly, Katherine L. KKELLY","['Germany', 'Chris Cgerman']",\n\n -----Original Message-----\nFrom: =09McMi...
...,...,...,...,...,...,...
21679,4166,vacation,2000-10-04 11:32:00,Sara Shackleton,"['Gary Hickerson', 'Sheila Glover', 'Laurel Ad...",I will be on vacation from October 6- 13. Als...
21680,4167,web file,2001-03-18 22:57:00,Matt Smith,['Amanda Huble'],"Amanda,\n\nCan you put this file in the approp..."
21681,4167,web file,2001-03-19 04:42:00,Matt Smith,['Amanda Huble'],"Amanda,\n\nPlease move the file i sent you fro..."
21682,4167,web file,2001-03-19 09:57:00,Matt Smith,['Amanda Huble <Amanda Huble/NA/Enron@Enron'],"Amanda,\n\nCan you put this file in the approp..."


In [13]:

cleaned_df = clean_email(cleaned_threads)

cleaned_df

Unnamed: 0,thread_id,subject,timestamp,from,to,body
0,1,Master Termination Log,2002-01-29 11:23:42,"Gossett, Jeffrey C. JGOSSET","['Giron', 'Darron C. Dgiron', 'Love', 'Phillip...",
1,1,Master Termination Log,2002-01-31 12:50:00,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Gossett', 'Jeff...",
2,1,Master Termination Log,2002-02-05 15:03:35,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Anderson', 'Dia...",Note to Stephanie Panus.... Stephanie...please...
3,1,Master Termination Log,2002-02-05 15:06:25,"Theriot, Kim S. KTHERIO","['Hall', 'D. Todd Thall', 'Sweeney', 'Kevin Ks...",
4,1,Master Termination Log,2002-05-28 07:20:35,"Kelly, Katherine L. KKELLY","['Germany', 'Chris Cgerman']",
...,...,...,...,...,...,...
21679,4166,vacation,2000-10-04 11:32:00,Sara Shackleton,"['Gary Hickerson', 'Sheila Glover', 'Laurel Ad...",I will be on vacation from October 6- 13. Also...
21680,4167,web file,2001-03-18 22:57:00,Matt Smith,['Amanda Huble'],"Amanda, Can you put this file in the appropria..."
21681,4167,web file,2001-03-19 04:42:00,Matt Smith,['Amanda Huble'],"Amanda, Please move the file i sent you from t..."
21682,4167,web file,2001-03-19 09:57:00,Matt Smith,['Amanda Huble <Amanda Huble/NA/Enron@Enron'],"Amanda, Can you put this file in the appropria..."


In [14]:
thread_cleaned_grouped=email_thread_details_dataset.groupby("thread_id")
thread_cleaned_grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ee6b9f3ba70>

In [15]:
# Combine emails per thread
def thread_to_text(thread_df):
    parts = []
    for _, row in thread_df.iterrows():
        part = f"From: {row['from']} To: {''.join(row['to'])} Time: {row['timestamp']} Body: {row['body']} "
        parts.append(part)
    return " ".join(parts)

thread_texts = thread_cleaned_grouped.apply(thread_to_text)

  thread_texts = thread_cleaned_grouped.apply(thread_to_text)


In [16]:
thread_texts

Unnamed: 0_level_0,0
thread_id,Unnamed: 1_level_1
1,"From: Gossett, Jeffrey C. JGOSSET To: ['Giron'..."
2,From: Tana Jones To: ['Suzanne Adams'] Time: 2...
3,"From: Benjamin Rogers To: ['""CHOBY', 'C."" <G7P..."
4,From: Phillip M Love To: ['Julie Ferrara'] Tim...
5,From: Kay Mann To: ['Reagan Rorschach'] Time: ...
...,...
4163,"From: Kay Mann To: ['Sheila Tweed', 'Dale Rasm..."
4164,From: Elizabeth Sager To: ['Genia FitzGerald']...
4165,"From: Watson, Kimberly KWATSON To: [""'john.wat..."
4166,"From: Susan Scott To: ['Drew Fossum@ENRON', 'J..."


In [17]:
def merge_thread_summary(thread_texts,df):# Convert Series to DataFrame with a column name
    thread_df = thread_texts.to_frame('thread_text')

    # Reset index if needed (if Series has an index you want to keep)
    thread_df = thread_texts.reset_index()

    thread_df['thread_text'] = thread_df[0]
    thread_df = thread_df.drop(columns=[0])

    # Join using merge
    result = pd.merge(df, thread_df,  how='left' , on = 'thread_id')

    return result


In [18]:
merged_df = merge_thread_summary(thread_texts,email_thread_summaries_dataset)
merged_df

Unnamed: 0,thread_id,summary,thread_text
0,1,The email thread discusses the Master Terminat...,"From: Gossett, Jeffrey C. JGOSSET To: ['Giron'..."
1,2,A lunch meeting has been scheduled for May 5th...,From: Tana Jones To: ['Suzanne Adams'] Time: 2...
2,3,Ben is updating a friend on his progress with ...,"From: Benjamin Rogers To: ['""CHOBY', 'C."" <G7P..."
3,4,The recipient of the email thread initially ex...,From: Phillip M Love To: ['Julie Ferrara'] Tim...
4,5,The email thread discusses the long form confi...,From: Kay Mann To: ['Reagan Rorschach'] Time: ...
...,...,...,...
4162,4163,Peter Thompson has sent a memo to Kay Mann and...,"From: Kay Mann To: ['Sheila Tweed', 'Dale Rasm..."
4163,4164,The email thread revolves around the sharing a...,From: Elizabeth Sager To: ['Genia FitzGerald']...
4164,4165,Susan asks Emily about her plans for the weeke...,"From: Watson, Kimberly KWATSON To: [""'john.wat..."
4165,4166,Several employees will be on vacation during d...,"From: Susan Scott To: ['Drew Fossum@ENRON', 'J..."


Model:


In [19]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample data
#texts = ["Dear team, please review the attached report on vulnerabilities. We need feedback by Friday."]
#summaries = ["Review report on vulnerabilities by Friday."]
texts = merged_df['thread_text']
summaries = merged_df['summary']

# Tokenizer for input
src_tokenizer = Tokenizer(filters='', oov_token='<unk>')
src_tokenizer.fit_on_texts(texts)
src_sequences = src_tokenizer.texts_to_sequences(texts)
src_sequences = pad_sequences(src_sequences, padding='post')

# Tokenizer for target (summary)
trg_tokenizer = Tokenizer(filters='', oov_token='<unk>')
trg_tokenizer.fit_on_texts(summaries)
trg_sequences = trg_tokenizer.texts_to_sequences(summaries)
trg_sequences = pad_sequences(trg_sequences, padding='post')

# Vocabulary sizes
src_vocab_size = len(src_tokenizer.word_index) + 1
trg_vocab_size = len(trg_tokenizer.word_index) + 1


In [20]:
from tensorflow.keras import layers

embedding_dim = 128
hidden_units = 256

# Encoder
encoder_inputs = tf.keras.Input(shape=(None,))
enc_emb = layers.Embedding(src_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = layers.LSTM(hidden_units, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]


In [21]:
decoder_inputs = tf.keras.Input(shape=(None,))
dec_emb = layers.Embedding(trg_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = layers.LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = layers.Dense(trg_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [22]:
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [23]:
import numpy as np

trg_input = trg_sequences[:, :-1]   # all tokens except last
trg_output = trg_sequences[:, 1:]   # all tokens except first
trg_output = np.expand_dims(trg_output, -1)  # required for sparse_categorical_crossentropy


In [None]:
model.fit(
    [src_sequences, trg_input],
    trg_output,
    batch_size=16,
    epochs=50
)


Epoch 1/50
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m477s[0m 2s/step - accuracy: 0.5392 - loss: 4.9267
Epoch 2/50
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 2s/step - accuracy: 0.5802 - loss: 3.1683
Epoch 3/50
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 2s/step - accuracy: 0.5910 - loss: 3.0204
Epoch 4/50
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m478s[0m 2s/step - accuracy: 0.6009 - loss: 2.9031
Epoch 5/50
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m479s[0m 2s/step - accuracy: 0.6162 - loss: 2.7612
Epoch 6/50
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m480s[0m 2s/step - accuracy: 0.6241 - loss: 2.6462
Epoch 7/50
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m480s[0m 2s/step - accuracy: 0.6313 - loss: 2.5346
Epoch 8/50
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m480s[0m 2s/step - accuracy: 0.6360 - loss: 2.4398
Epoch 9/50
[1m261/261[0m [32m