In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from tqdm.notebook import tqdm
# Load the message table that the rest of the notebook works on.
df = pd.read_csv('main_data.csv')

# Remove every pandas display limit (row count, column count, line width,
# cell width) so frames are rendered without truncation.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)


In [2]:
# Summarise column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480062 entries, 0 to 480061
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   Message-ID                 480062 non-null  object
 1   Date                       480062 non-null  object
 2   Time                       480062 non-null  int64 
 3   From                       480062 non-null  object
 4   To                         480062 non-null  object
 5   Subject                    480062 non-null  object
 6   X-cc                       480062 non-null  object
 7   X-bcc                      480062 non-null  object
 8   Content                    480062 non-null  object
 9   Job_Title                  480062 non-null  object
 10  Total_Sentence_Word_Count  480062 non-null  int64 
dtypes: int64(2), object(9)
memory usage: 40.3+ MB


In [8]:
# df.head(1)

In [3]:
def extract_names(email):
    """Turn the local part of every email address in ``email`` into a readable name.

    Each ``local@domain`` occurrence is located, and every run of non-letter
    characters in the local part (dots, digits, underscores, hyphens) is
    replaced by a single space, e.g. ``'david.l.johnson@enron.com'`` becomes
    ``'david l johnson'``.

    Parameters
    ----------
    email : str or missing
        Text containing zero or more email addresses.

    Returns
    -------
    str or None
        The extracted names joined by ``', '``; ``None`` when ``email`` is
        missing, contains no address, or no address yields any letters.
    """
    if pd.isna(email):
        return None

    # Capture only the part before '@' of each address found in the text.
    local_parts = re.findall(r'([\w\.-]+)@[\w\.-]+', email)

    names = []
    for local_part in local_parts:
        # '+' collapses consecutive separators ("john..doe", "j.2.smith") into
        # one space instead of leaving runs of spaces inside the name.
        name = re.sub(r'[^a-zA-Z]+', ' ', local_part).strip()
        # A local part without letters (e.g. "12345") has no name; skipping it
        # avoids empty entries and stray ", " separators in the joined result.
        if name:
            names.append(name)

    return ', '.join(names) if names else None

tqdm.pandas()  # registers .progress_apply on pandas objects

# (source column, derived column) pairs, processed in this order.
_name_columns = (('From', 'From_Names'), ('To', 'To_Names'))

# Pass 1: split each address field on commas and run extract_names on every
# stripped piece, leaving a list of names (or None entries) per row.
for _source, _target in _name_columns:
    df[_target] = df[_source].str.split(',').progress_apply(
        lambda parts: [extract_names(part.strip()) for part in parts]
    )

# Pass 2: discard the pieces that produced no name and flatten each list into
# one comma-separated string for readability.
for _source, _target in _name_columns:
    df[_target] = df[_target].progress_apply(
        lambda names: ', '.join(name for name in names if name)
    )


  0%|          | 0/480062 [00:00<?, ?it/s]

  0%|          | 0/480062 [00:00<?, ?it/s]

  0%|          | 0/480062 [00:00<?, ?it/s]

  0%|          | 0/480062 [00:00<?, ?it/s]

In [10]:
# df.head()

In [4]:


# # Download required NLTK data (run once if not already downloaded)
# nltk.download('punkt')
# nltk.download('stopwords')

# Preprocessing function with your custom stopwords included
# Corpus-specific noise words removed in addition to NLTK's English stopwords.
CUSTOM_STOPWORDS = frozenset({
    "enron", "email", "subject", "company", "corporate", "mary", "hain", "hou", "ect",
    "mark", "hainhouect", "haedickehouectect", "please", "would", "pm", "cc",
    "may", "e", "forwarded", "attached", "attach", "thanks", "could", "mail", "mailing",
    "bcc", "dear", "thru", "hi", "hello", "much", "really", "susan", "j",
    "q", "p", "pls", "thank", "ps", "sorry", "also", "might", "must", "call", "fw",
    "fwd", "date", "sincerely", "sent", "http", "list", "asap", "corp",
})

# Filled on first use, so defining this cell does not touch the NLTK corpus.
_STOP_WORDS = None


def _get_stop_words():
    """Return NLTK's English stopwords merged with CUSTOM_STOPWORDS, built once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Reading the corpus is the expensive part; do it a single time instead
        # of once per processed document.
        _STOP_WORDS = frozenset(stopwords.words('english')) | CUSTOM_STOPWORDS
    return _STOP_WORDS


def preprocess_text(text, additional_stopwords=None):
    """Lowercase ``text``, strip punctuation, tokenize it and drop stopwords.

    Parameters
    ----------
    text : str
        Raw text. Missing values (None / NaN / pd.NA) give an empty string;
        any other non-string value is converted with ``str()`` first.
    additional_stopwords : iterable of str, optional
        Extra words to remove for this call, matched in lowercase.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Normalize whitespace, lowercase, and strip
    text = re.sub(r'\s+', ' ', text.lower().strip())

    # Remove punctuation (keeps letters, numbers, underscores and whitespace).
    # NOTE(review): apostrophes are removed here, so contractions reach the
    # stopword filter as e.g. "dont"; presumably these no longer match NLTK's
    # apostrophised entries - confirm whether that is intended.
    text = re.sub(r'[^\w\s]', '', text)

    # Nothing left to tokenize (blank or punctuation-only input).
    if not text.strip():
        return ''

    stop_words = _get_stop_words()
    if additional_stopwords:
        stop_words = stop_words | {word.lower() for word in additional_stopwords}

    tokens = word_tokenize(text)

    # Join the surviving tokens back into a string
    return ' '.join(word for word in tokens if word not in stop_words)


# Optional: Add more stopwords if needed
# extra_stopwords = ['world', 'prepare']

# Enable tqdm for pandas in Jupyter
tqdm.pandas()

# Clean every message body with a progress bar; preprocess_text is handed over
# directly and receives one 'Content' value per call.
df['Cleaned_Content'] = df['Content'].progress_apply(preprocess_text)


  0%|          | 0/480062 [00:00<?, ?it/s]

In [12]:
# df.head(1)

In [5]:
# Function to convert text to tokens (BoW)
def text_to_tokens(text):
    """Return the list of word tokens that ``word_tokenize`` produces for ``text``."""
    return word_tokenize(text)

# Enable tqdm for pandas in Jupyter
tqdm.pandas()

# Tokenize the already-cleaned text row by row (with a progress bar) and keep
# the resulting token lists in a new 'BoW' column.
df['BoW'] = df['Cleaned_Content'].progress_apply(text_to_tokens)


  0%|          | 0/480062 [00:00<?, ?it/s]

In [8]:
# Clean the 'From' column
# Trim whitespace around the sender, treat blank cells as missing, and discard
# rows that end up without a sender.
df['From'] = df['From'].str.strip().replace('', pd.NA)
df = df.dropna(subset=['From'])

# Sender must end with '@enron.com' ...
_ends_with_enron = df['From'].str.contains(r'@enron\.com$', na=False, regex=True)
# ... and must not have digits directly in front of '@enron.com'.
_digits_before_at = df['From'].str.contains(r'\d+@enron\.com$', na=False, regex=True)

df = df[_ends_with_enron & ~_digits_before_at]

In [11]:
# Take an independent copy. A plain `new_df = df` only binds a second name to
# the same filtered frame, so columns added to new_df would also appear on df,
# and assigning into a frame obtained by boolean filtering can make pandas
# emit SettingWithCopyWarning.
new_df = df.copy()

In [12]:
# Summarise the frame after the sender filter: row count, columns and dtypes.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392971 entries, 0 to 480061
Data columns (total 15 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   Message-ID                 392971 non-null  object
 1   Date                       392971 non-null  object
 2   Time                       392971 non-null  int64 
 3   From                       392971 non-null  object
 4   To                         392971 non-null  object
 5   Subject                    392971 non-null  object
 6   X-cc                       392971 non-null  object
 7   X-bcc                      392971 non-null  object
 8   Content                    392971 non-null  object
 9   Job_Title                  392971 non-null  object
 10  Total_Sentence_Word_Count  392971 non-null  int64 
 11  From_Names                 392971 non-null  object
 12  To_Names                   392971 non-null  object
 13  Cleaned_Content            392971 non-null  object

In [13]:
# Function to combine Date and Time, removing weekday
def combine_date_time(date, time):
    """Join ``date`` (minus any leading weekday) and ``time`` with one space.

    A leading three-letter weekday followed by a comma, such as ``'Fri, '``,
    is removed from the stripped ``date``; ``time`` is appended unchanged.
    """
    stripped = date.strip()
    without_weekday = re.sub(r'^(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s*', '', stripped)
    return f"{without_weekday} {time}"

# Enable tqdm for pandas in Jupyter
tqdm.pandas()


def _date_time_for_row(row):
    """Build the DateTime string from one row's 'Date' and 'Time' values."""
    return combine_date_time(row['Date'], row['Time'])


# Row-wise apply (axis=1) with a progress bar; the result becomes 'DateTime'.
new_df['DateTime'] = new_df.progress_apply(_date_time_for_row, axis=1)

  0%|          | 0/392971 [00:00<?, ?it/s]

In [14]:
# 'Date' and 'Time' are now merged into 'DateTime'; 'Message-ID' is dropped as
# well. errors='ignore' lets the cell be re-run after the columns are gone.
_columns_to_drop = ['Date', 'Time', 'Message-ID']
new_df = new_df.drop(columns=_columns_to_drop, errors='ignore')

In [18]:
# new_df.head()

In [19]:
# new_df.Job_Title.value_counts()

In [15]:
# List of email addresses to filter out
# Addresses to exclude, lowercased up front so the comparison below is
# case-insensitive.
emails_to_filter = [
    address.lower()
    for address in (
        'all.worldwide@enron.com',
        'enron_announcements@enron.com',
        'issuealert@scientech.com',
        'outlook.team@enron.com',
        'Worldwide@ENRON',
        'dl-ga-all_enron_worldwide2@enron',
        'no.address@enron.com',
    )
]

# Step 1: drop rows whose whole 'From' value equals one of the listed addresses.
_sender_listed = new_df['From'].str.lower().isin(emails_to_filter)
new_df2 = new_df.loc[~_sender_listed].copy()

# Step 2: from what is left, drop rows whose whole 'To' value equals one of them.
_recipient_listed = new_df2['To'].str.lower().isin(emails_to_filter)
new_df3 = new_df2.loc[~_recipient_listed].copy()

In [16]:
# Summarise the frame that remains after removing the listed addresses.
new_df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 384846 entries, 0 to 480061
Data columns (total 13 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   From                       384846 non-null  object
 1   To                         384846 non-null  object
 2   Subject                    384846 non-null  object
 3   X-cc                       384846 non-null  object
 4   X-bcc                      384846 non-null  object
 5   Content                    384846 non-null  object
 6   Job_Title                  384846 non-null  object
 7   Total_Sentence_Word_Count  384846 non-null  int64 
 8   From_Names                 384846 non-null  object
 9   To_Names                   384846 non-null  object
 10  Cleaned_Content            384846 non-null  object
 11  BoW                        384846 non-null  object
 12  DateTime                   384846 non-null  object
dtypes: int64(1), object(12)
memory usage: 41.1+ MB


In [17]:
# Persist the filtered frame without the index column.
# NOTE(review): 'BoW' holds Python lists, which a CSV stores as their text
# representation - confirm that downstream readers parse them back.
_final_csv_path = 'finalv1_data.csv'
new_df3.to_csv(_final_csv_path, index=False)

In [18]:
# Sanity checks on the filtered frame: its size, then how many senders are
# missing (NaN/None) or are empty strings.
_senders = new_df3['From']

row_count = new_df3.shape[0]
missing_count = _senders.isna().sum()
empty_string_count = _senders.eq('').sum()
total_empty_or_missing = missing_count + empty_string_count

print(f"Number of rows in new_df3: {row_count}")
print(f"Number of missing values (NaN/None) in 'From': {missing_count}")
print(f"Number of empty strings ('') in 'From': {empty_string_count}")
print(f"Total rows with empty or missing 'From': {total_empty_or_missing}")

Number of rows in new_df3: 384846
Number of missing values (NaN/None) in 'From': 0
Number of empty strings ('') in 'From': 0
Total rows with empty or missing 'From': 0


In [19]:
# Keep only the first 50,000 rows (by position) of the filtered frame.
df_first_50k = new_df3.head(50000)

In [20]:
# Confirm the subset's size and column layout.
df_first_50k.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 0 to 68343
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   From                       50000 non-null  object
 1   To                         50000 non-null  object
 2   Subject                    50000 non-null  object
 3   X-cc                       50000 non-null  object
 4   X-bcc                      50000 non-null  object
 5   Content                    50000 non-null  object
 6   Job_Title                  50000 non-null  object
 7   Total_Sentence_Word_Count  50000 non-null  int64 
 8   From_Names                 50000 non-null  object
 9   To_Names                   50000 non-null  object
 10  Cleaned_Content            50000 non-null  object
 11  BoW                        50000 non-null  object
 12  DateTime                   50000 non-null  object
dtypes: int64(1), object(12)
memory usage: 5.3+ MB


In [21]:
# Write the 50,000-row subset to disk without the index column.
_subset_csv_path = 'filter_50krows.csv'
df_first_50k.to_csv(_subset_csv_path, index=False)

In [22]:
# Preview the first rows of the subset.
# NOTE(review): the rendered output contains raw message bodies (names,
# addresses, login details) - clear it before sharing the notebook.
df_first_50k.head()

Unnamed: 0,From,To,Subject,X-cc,X-bcc,Content,Job_Title,Total_Sentence_Word_Count,From_Names,To_Names,Cleaned_Content,BoW,DateTime
0,phillip.allen@enron.com,john.lavorato@enron.com,Re:,X-bcc:,"X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail","Traveling to have a business meeting takes the fun out of the trip. Especially if you have to prepare a presentation. I would suggest holding the business plan meetings here then take a trip without any formal business meetings. I would even try and get some honest opinions on whether a trip is even desired or necessary.\n\nAs far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not. Too often the presenter speaks and the others are quiet just waiting for their turn. The meetings might be better if held in a round table discussion format. \n\nMy suggestion for where to go is Austin. Play golf and rent a ski boat and jet ski's. Flying somewhere takes too much time.",Unknown,139,phillip allen,john lavorato,traveling business meeting takes fun trip especially prepare presentation suggest holding business plan meetings take trip without formal business meetings even try get honest opinions whether trip even desired necessary far business meetings think productive try stimulate discussions across different groups working often presenter speaks others quiet waiting turn meetings better held round table discussion format suggestion go austin play golf rent ski boat jet skis flying somewhere takes time,"[traveling, business, meeting, takes, fun, trip, especially, prepare, presentation, suggest, holding, business, plan, meetings, take, trip, without, formal, business, meetings, even, try, get, honest, opinions, whether, trip, even, desired, necessary, far, business, meetings, think, productive, try, stimulate, discussions, across, different, groups, working, often, presenter, speaks, others, quiet, waiting, turn, meetings, better, held, round, table, discussion, format, suggestion, go, austin, play, golf, rent, ski, boat, jet, skis, flying, 
somewhere, takes, time]",4 May 2001
1,phillip.allen@enron.com,randall.gay@enron.com,Mime-Version: 1.0,X-bcc:,X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail,"Randy,\n\n Can you send me a schedule of the salary and level of everyone in the \nscheduling group. Plus your thoughts on any changes that need to be made. \n(Patti S for example)\n\nPhillip",Unknown,33,phillip allen,randall gay,randy send schedule salary level everyone scheduling group plus thoughts changes need made patti example phillip,"[randy, send, schedule, salary, level, everyone, scheduling, group, plus, thoughts, changes, need, made, patti, example, phillip]",23 Oct 2000
2,phillip.allen@enron.com,"david.l.johnson@enron.com, john.shafer@enron.com",Mime-Version: 1.0,X-bcc:,X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail,Please cc the following distribution list with updates:\n\nPhillip Allen (pallen@enron.com)\nMike Grigsby (mike.grigsby@enron.com)\nKeith Holst (kholst@enron.com)\nMonique Sanchez\nFrank Ermis\nJohn Lavorato\n\n\nThank you for your help\n\nPhillip Allen,Unknown,22,phillip allen,"david l johnson, john shafer",following distribution updates phillip allen pallenenroncom mike grigsby mikegrigsbyenroncom keith holst kholstenroncom monique sanchez frank ermis john lavorato help phillip allen,"[following, distribution, updates, phillip, allen, pallenenroncom, mike, grigsby, mikegrigsbyenroncom, keith, holst, kholstenroncom, monique, sanchez, frank, ermis, john, lavorato, help, phillip, allen]",22 Aug 2000
3,phillip.allen@enron.com,mark.scott@enron.com,Re: High Speed Internet Access,X-bcc:,X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail,1. login: pallen pw: ke9davis\n\n I don't think these are required by the ISP \n\n 2. static IP address\n\n IP: 64.216.90.105\n Sub: 255.255.255.248\n gate: 64.216.90.110\n DNS: 151.164.1.8\n\n 3. Company: 0413\n RC: 105891,Unknown,31,phillip allen,mark scott,1 login pallen pw ke9davis dont think required isp 2 static ip address ip 6421690105 sub 255255255248 gate 6421690110 dns 15116418 3 0413 rc 105891,"[1, login, pallen, pw, ke9davis, dont, think, required, isp, 2, static, ip, address, ip, 6421690105, sub, 255255255248, gate, 6421690110, dns, 15116418, 3, 0413, rc, 105891]",17 Oct 2000
4,phillip.allen@enron.com,zimam@enron.com,FW: fixed forward or other Collar floor gas price terms,X-bcc:,X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail,"---------------------- Forwarded by Phillip K Allen/HOU/ECT on 10/16/2000 \n01:42 PM ---------------------------\n\n\n""Buckner, Buck"" <buck.buckner@honeywell.com> on 10/12/2000 01:12:21 PM\nTo: ""'Pallen@Enron.com'"" <Pallen@Enron.com>\ncc: \nSubject: FW: fixed forward or other Collar floor gas price terms\n\n\nPhillip,\n\n> As discussed during our phone conversation, In a Parallon 75 microturbine\n> power generation deal for a national accounts customer, I am developing a\n> proposal to sell power to customer at fixed or collar/floor price. To do\n> so I need a corresponding term gas price for same. Microturbine is an\n> onsite generation product developed by Honeywell to generate electricity\n> on customer site (degen). using natural gas. In doing so, I need your\n> best fixed price forward gas price deal for 1, 3, 5, 7 and 10 years for\n> annual/seasonal supply to microturbines to generate fixed kWh for\n> customer. We have the opportunity to sell customer kWh 's using\n> microturbine or sell them turbines themselves. kWh deal must have limited/\n> no risk forward gas price to make deal work. 
Therein comes Sempra energy\n> gas trading, truly you.\n>\n> We are proposing installing 180 - 240 units across a large number of\n> stores (60-100) in San Diego.\n> Store number varies because of installation hurdles face at small percent.\n>\n> For 6-8 hours a day Microturbine run time:\n> Gas requirement for 180 microturbines 227 - 302 MMcf per year\n> Gas requirement for 240 microturbines 302 - 403 MMcf per year\n>\n> Gas will likely be consumed from May through September, during peak\n> electric period.\n> Gas price required: Burnertip price behind (LDC) San Diego Gas & Electric\n> Need detail breakout of commodity and transport cost (firm or\n> interruptible).\n>\n> Should you have additional questions, give me a call.\n> Let me assure you, this is real deal!!\n>\n> Buck Buckner, P.E., MBA\n> Manager, Business Development and Planning\n> Big Box Retail Sales\n> Honeywell Power Systems, Inc.\n> 8725 Pan American Frwy\n> Albuquerque, NM 87113\n> 505-798-6424\n> 505-798-6050x\n> 505-220-4129\n> 888/501-3145\n>",Unknown,300,phillip allen,zimam,phillip k allenhouect 10162000 0142 buckner buck buckbucknerhoneywellcom 10122000 011221 pallenenroncom pallenenroncom fixed forward collar floor gas price terms phillip discussed phone conversation parallon 75 microturbine power generation deal national accounts customer developing proposal sell power customer fixed collarfloor price need corresponding term gas price microturbine onsite generation product developed honeywell generate electricity customer site degen using natural gas need best fixed price forward gas price deal 1 3 5 7 10 years annualseasonal supply microturbines generate fixed kwh customer opportunity sell customer kwh using microturbine sell turbines kwh deal limited risk forward gas price make deal work therein comes sempra energy gas trading truly proposing installing 180 240 units across large number stores 60100 san diego store number varies installation hurdles face small percent 68 hours day 
microturbine run time gas requirement 180 microturbines 227 302 mmcf per year gas requirement 240 microturbines 302 403 mmcf per year gas likely consumed september peak electric period gas price required burnertip price behind ldc san diego gas electric need detail breakout commodity transport cost firm interruptible additional questions give let assure real deal buck buckner pe mba manager business development planning big box retail sales honeywell power systems inc 8725 pan american frwy albuquerque nm 87113 5057986424 5057986050x 5052204129 8885013145,"[phillip, k, allenhouect, 10162000, 0142, buckner, buck, buckbucknerhoneywellcom, 10122000, 011221, pallenenroncom, pallenenroncom, fixed, forward, collar, floor, gas, price, terms, phillip, discussed, phone, conversation, parallon, 75, microturbine, power, generation, deal, national, accounts, customer, developing, proposal, sell, power, customer, fixed, collarfloor, price, need, corresponding, term, gas, price, microturbine, onsite, generation, product, developed, honeywell, generate, electricity, customer, site, degen, using, natural, gas, need, best, fixed, price, forward, gas, price, deal, 1, 3, 5, 7, 10, years, annualseasonal, supply, microturbines, generate, fixed, kwh, customer, opportunity, sell, customer, kwh, using, microturbine, sell, turbines, kwh, deal, limited, risk, forward, gas, price, make, deal, work, therein, comes, ...]",16 Oct 2000
