<a href="https://colab.research.google.com/github/Nahrawen/AIChatBot/blob/main/AIChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import re
from email.parser import BytesParser
from bs4 import BeautifulSoup
from email import message_from_string

In [3]:
# Load only the first 70,000 rows of emails.csv from the mounted Drive folder.
data = pd.read_csv(
    "/content/drive/MyDrive/emails.csv",
    nrows=70000,
)

In [9]:
# Show the raw text of the message stored under row label 0 as an example.
data.loc[0, 'message']

"Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n "

In [10]:
# Function to clean email data
def clean_email(email_str):
    """Parse one raw e-mail and return its key headers and body text.

    Parameters
    ----------
    email_str : str
        Full raw message: header lines, a blank line, then the body.

    Returns
    -------
    tuple
        ``(from_address, to_address, subject, date, body)``. Each header
        value is ``None`` when that header is absent. ``body`` is ``""``
        when no text could be extracted.

    Notes
    -----
    For multipart messages the first ``text/html`` part is preferred and
    converted to plain text. If there is none, the first non-attachment
    ``text/plain`` part is used instead of returning an empty body.
    """
    # Non-string cells (e.g. NaN produced by the CSV reader) cannot be parsed;
    # return an all-missing record so later filters can drop the row.
    if not isinstance(email_str, str):
        return None, None, None, None, ""

    msg = message_from_string(email_str)

    from_address = msg.get('From')
    to_address = msg.get('To')
    subject = msg.get('Subject')
    date = msg.get('Date')
    body = ""

    if msg.is_multipart():
        plain_fallback = None
        # walk() is depth-first, so parts nested inside e.g.
        # multipart/alternative containers are visited as well.
        for part in msg.walk():
            if part.is_multipart():
                # Containers hold no text of their own.
                continue
            content_type = part.get_content_type()
            if content_type == 'text/html':
                soup = BeautifulSoup(part.get_payload(), "html.parser")
                body = soup.get_text(separator=' ')
                break
            if (
                content_type == 'text/plain'
                and plain_fallback is None
                and part.get_content_disposition() != 'attachment'
            ):
                plain_fallback = part.get_payload()
        else:
            # Loop finished without finding HTML: fall back to plain text.
            if plain_fallback is not None:
                body = plain_fallback
    else:
        body = msg.get_payload()

    return from_address, to_address, subject, date, body

In [11]:
# Parse every raw message, in row order, and collect the extracted fields
# into a fresh DataFrame with one column per field.
cleaned_data = pd.DataFrame(
    list(map(clean_email, data["message"])),
    columns=['from_address', 'to_address', 'subject', 'date', 'body'],
)

In [12]:
# Keep only the rows that have a recipient ('to_address' is not missing).
cleaned_data = cleaned_data.dropna(subset=['to_address'])

In [13]:
# Keep only the rows whose body is not the empty string.
has_body = cleaned_data["body"].ne("")
cleaned_data = cleaned_data.loc[has_body]

In [14]:
# Remove rows that are exact duplicates across all columns (first one is kept).
cleaned_data = cleaned_data.drop_duplicates()

In [15]:
# Preview the first five rows after the basic filtering steps.
cleaned_data.head(n=5)

Unnamed: 0,from_address,to_address,subject,date,body
0,phillip.allen@enron.com,tim.belden@enron.com,,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",Here is our forecast\n\n
1,phillip.allen@enron.com,john.lavorato@enron.com,Re:,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Traveling to have a business meeting takes the...
2,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",test successful. way to go!!!
3,phillip.allen@enron.com,randall.gay@enron.com,,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)","Randy,\n\n Can you send me a schedule of the s..."
4,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",Let's shoot for Tuesday at 11:45.


In [16]:
# Function to extract valid email addresses
def extract_email(email_str):
    """Extract the first e-mail address found in ``email_str``.

    The string is split at its first ``@``. The domain is read forward from
    that point and the local part backward, each scan stopping at the first
    character that is not allowed in an address.

    Parameters
    ----------
    email_str : str or None
        Header value such as ``"Tim Belden <tim.belden@enron.com>"``.

    Returns
    -------
    str
        The extracted address, or the sentinel ``'@'`` when no usable address
        is present: non-string input, no ``@``, a dot directly next to the
        ``@``, or an empty local part or domain.
    """
    invalid = '@'

    # Missing headers arrive as None/NaN; treat them as "no address".
    if not isinstance(email_str, str) or '@' not in email_str:
        return invalid

    left, right = email_str.split('@', 1)

    # A dot directly adjacent to the '@' means the address is malformed.
    if left.endswith('.') or right.startswith('.'):
        return invalid

    def _scan(chars, extra):
        """Collect leading characters that are alphanumeric or in ``extra``."""
        kept = []
        for ch in chars:
            if ch.isalnum() or ch in extra:
                kept.append(ch)
            else:
                break
        return ''.join(kept)

    # Hyphens are legal in domain labels; '_', '+' and '-' are common in local
    # parts. Rejecting them would silently truncate otherwise valid addresses.
    domain = _scan(right, '.-')
    local = _scan(left[::-1], '._+-')[::-1]

    # Half an address ("@enron.com", "john@") is not an address.
    if not local or not domain:
        return invalid

    return local + '@' + domain

In [17]:
# Reduce both address columns to the single address extract_email returns.
for address_column in ('from_address', 'to_address'):
    cleaned_data[address_column] = cleaned_data[address_column].map(extract_email)

In [18]:
# Filter out invalid email addresses
def _has_edge_dot(addresses):
    """Flag addresses that begin or end with a dot."""
    return addresses.str.startswith('.') | addresses.str.endswith('.')


# A row survives only if neither address is the '@' sentinel and neither
# address starts or ends with '.'.
valid_rows = (
    (cleaned_data['from_address'] != '@')
    & (cleaned_data['to_address'] != '@')
    & ~_has_edge_dot(cleaned_data['from_address'])
    & ~_has_edge_dot(cleaned_data['to_address'])
)
cleaned_data = cleaned_data[valid_rows]

In [19]:
# Normalize text fields
def normalize_text(text):
    """Flatten a text field onto one line by replacing newlines with spaces.

    Parameters
    ----------
    text : str or None
        Field value. Headers that are absent from a message arrive as
        ``None`` (or NaN once stored in a DataFrame).

    Returns
    -------
    str
        ``text`` with every ``'\\n'`` replaced by a single space, or ``""``
        when ``text`` is not a string.
    """
    # A missing Subject header would otherwise raise AttributeError here.
    if not isinstance(text, str):
        return ""
    return text.replace('\n', ' ')

# Flatten newlines in both free-text columns.
for text_column in ('body', 'subject'):
    cleaned_data[text_column] = cleaned_data[text_column].map(normalize_text)

In [20]:
# Clean and convert dates
# Strip the trailing timezone abbreviation such as " (PDT)"; the numeric UTC
# offset in front of it (e.g. "-0700") is kept.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the suffix in place.
cleaned_data['date'] = cleaned_data['date'].str.replace(r"\s\([A-Z]{3}\)", "", regex=True)
# Parse to timezone-aware timestamps normalized to UTC.
cleaned_data["date"] = pd.to_datetime(cleaned_data['date'], utc=True)

  cleaned_data["date"] = pd.to_datetime(cleaned_data['date'], utc=True)


In [21]:
# Extract date-related features
# Each new column is read from the matching attribute of the datetime accessor
# of the UTC 'date' column.
timestamps = cleaned_data['date'].dt
for feature_name, dt_attribute in (
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('hour', 'hour'),
    ('minute', 'minute'),
    ('second', 'second'),
    ('day_of_week', 'dayofweek'),
):
    cleaned_data[feature_name] = getattr(timestamps, dt_attribute)
# day_name is a method rather than an attribute, so it is handled separately.
cleaned_data['day_name'] = timestamps.day_name()


In [22]:
# Save cleaned data to Google Drive
# NOTE(review): this snapshot is written before the HTML-stripping, ASCII and
# subject-cleaning cells further down, so the CSV does not reflect those steps.
cleaned_file_path = '/content/drive/MyDrive/cleaned_enron_emails.csv'
cleaned_data.to_csv(
    cleaned_file_path,
    index=False,  # do not write the row index as a column
)

In [23]:
# Show the first five rows, now including the derived date columns.
cleaned_data.head(n=5)

Unnamed: 0,from_address,to_address,subject,date,body,year,month,day,hour,minute,second,day_of_week,day_name
0,phillip.allen@enron.com,tim.belden@enron.com,,2001-05-14 23:39:00+00:00,Here is our forecast,2001,5,14,23,39,0,0,Monday
1,phillip.allen@enron.com,john.lavorato@enron.com,Re:,2001-05-04 20:51:00+00:00,Traveling to have a business meeting takes the...,2001,5,4,20,51,0,4,Friday
2,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,2000-10-18 10:00:00+00:00,test successful. way to go!!!,2000,10,18,10,0,0,2,Wednesday
3,phillip.allen@enron.com,randall.gay@enron.com,,2000-10-23 13:13:00+00:00,"Randy, Can you send me a schedule of the sal...",2000,10,23,13,13,0,0,Monday
4,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,2000-08-31 12:07:00+00:00,Let's shoot for Tuesday at 11:45.,2000,8,31,12,7,0,3,Thursday


In [24]:
def remove_html_tags(text):
    """Return the visible text of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` and the text
    nodes are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=' ')


# Strip any markup that is still present in the message bodies.
cleaned_data['body'] = cleaned_data['body'].map(remove_html_tags)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


In [25]:
def _ascii_only(text):
    """Return ``text`` with every non-ASCII character dropped."""
    return text.encode('ascii', errors='ignore').decode('ascii')


# Discard non-ASCII characters from every body.
cleaned_data['body'] = cleaned_data['body'].map(_ascii_only)

In [26]:
def clean_subject(subject):
    """Trim surrounding whitespace and capitalize a subject line.

    ``str.capitalize`` upper-cases the first character and lower-cases the
    rest, so ``"  re: Hello WORLD "`` becomes ``"Re: hello world"``.
    """
    return subject.strip().capitalize()
# Normalize every subject line with clean_subject.
cleaned_data['subject'] = cleaned_data['subject'].map(clean_subject)

In [27]:
# Keep only the rows whose subject is not the empty string.
has_subject = cleaned_data['subject'].ne("")
cleaned_data = cleaned_data.loc[has_subject]

In [28]:
# Sanity checks: missing values per column, then the number of duplicated rows.
# NOTE(review): duplicates can be non-zero here even though drop_duplicates()
# ran earlier, because addresses, subjects and bodies were normalized after it.
print(cleaned_data.isnull().sum())
print(cleaned_data.duplicated().sum())
# Fixed seed so the displayed sample is the same on every Restart & Run All.
cleaned_data.sample(5, random_state=42)

from_address    0
to_address      0
subject         0
date            0
body            0
year            0
month           0
day             0
hour            0
minute          0
second          0
day_of_week     0
day_name        0
dtype: int64
6


Unnamed: 0,from_address,to_address,subject,date,body,year,month,day,hour,minute,second,day_of_week,day_name
5488,swl@winelibrary.com,jarnold@enron.com,"95 pointer, 34% off wine, and super coupon ins...",2001-11-15 13:40:12+00:00,To Place an order . . . PLEASE CALL 973-376-00...,2001,11,15,13,40,12,3,Thursday
23079,charlene.jackson@enron.com,sally.beck@enron.com,Meeting,2000-04-13 13:25:00+00:00,"Sally, Thanks for the voice-mail. While we ha...",2000,4,13,13,25,0,3,Thursday
29530,marla.barnard@enron.com,sally.beck@enron.com,Re: fw: you asked for questions,2001-11-15 23:11:15+00:00,I will work to get you these answers tomorrow....,2001,11,15,23,11,15,3,Thursday
53659,shelley.corman@enron.com,don.vignaroli@dynegy.com,Fw: restated 2002 capex plan,2002-02-26 21:22:11+00:00,Don - I never heard anything back. Is everyon...,2002,2,26,21,22,11,1,Tuesday
59227,dhunter@smithandkempton.com,athomas@newenergy.com,Important -- letter to governor davis,2000-09-12 17:02:00+00:00,"Folks- At today's meeting, those Group members...",2000,9,12,17,2,0,1,Tuesday
