## **Classifier POC - Urgent vs. Non-Urgent Messages**
### Data preparation

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('enron_emails.csv')
df.head(3)

Unnamed: 0,file,message
0,arnold-j/sent_items/315.,Message-ID: <31189540.1075852718882.JavaMail.e...
1,lenhart-m/sent/1210.,Message-ID: <932989.1075858142302.JavaMail.eva...
2,symes-k/sent/1136.,Message-ID: <20692876.1075841905289.JavaMail.e...


In [2]:
#extract subject from message
df['subject'] = df['message'].str.extract(r'Subject: (.*)\n')
#remove any RE: or FW: from the subject
df.subject.apply(lambda x: x.replace('RE:','').replace('FW:',''))

0                                                        
1                             Re: suicide pool - nfl 2000
2                                 Re: 4/10 checkout con't
3                                           Energy Issues
4                                             2001 budget
                              ...                        
1495                                   TW Weekly 11-17-00
1496                                       New York, etc.
1497     Letter re Unpaid Invoice for Post petition De...
1498                                                     
1499                    Tasters next thursday! (11-16-00)
Name: subject, Length: 1500, dtype: object

In [3]:
df.sample(10)

Unnamed: 0,file,message,subject
495,skilling-j/inbox/155.,Message-ID: <25099574.1075840154211.JavaMail.e...,Policy Committee Review - Analyst/Associate Pr...
1437,stepenovitch-j/all_documents/119.,Message-ID: <23458571.1075851830753.JavaMail.e...,24 Hour Group: NIGHT RESPONSIBILITIES
1204,white-s/deleted_items/163.,Message-ID: <1596411.1075841985758.JavaMail.ev...,Canceled: E&Y meeting on reconciliation of 113...
137,dasovich-j/sent/405.,Message-ID: <32473520.1075843193721.JavaMail.e...,Re: WSJ: PG&E's Huge losses...
176,mcconnell-m/sent/824.,Message-ID: <24504655.1075863625833.JavaMail.e...,Re: Need your help representing your Analysts
841,steffes-j/sent_items/844.,Message-ID: <4963991.1075852521483.JavaMail.ev...,Letter from Leadership Council to Bush WH
943,mims-thurston-p/sent/191.,Message-ID: <3130121.1075858213026.JavaMail.ev...,Re: Small business supply assistance
935,rogers-b/deleted_items/229.,Message-ID: <17655397.1075860725886.JavaMail.e...,
1144,allen-p/_sent_mail/484.,Message-ID: <10151683.1075855726699.JavaMail.e...,Re: Weekly Status Meeting
1194,horton-s/sent_items/46.,Message-ID: <26266749.1075855412636.JavaMail.e...,RE: Happy Holidays


### Feature Engineering

In [4]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to', 'subject']
    for line in lines:
        if ':' not in line:
            message += line.replace('-----Original Message-----', '. ').strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[-1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': map_to_list(emails, 'body'),
        'to': map_to_list(emails, 'to'),
        'from': map_to_list(emails, 'from'),
        'subject': map_to_list(emails, 'subject')
    }
    
def map_to_list(emails, key):
    results = []
    for email in emails:
        if key not in email:
            results.append('')
        else:
            results.append(email[key])
    return results

In [5]:
email_df = pd.DataFrame(parse_into_emails(df.message))
email_df.sample(10)
email_df.shape

(1500, 4)

In [6]:
# drop any records with both subject and body missing
missing = (email_df.body == '') & (email_df.subject == '')
email_df = email_df[~missing]
email_df.shape

(1499, 4)

### Data Analysis based on keywords

In [7]:
import re

# Keywords to search for
keywords = [
    "System Down", 
    "Emergency",
    "Alert"
    "Critical",
    "Urgent",
    "ASAP",
    "Important",
    "Action Required",
    "Deadline Approaching",
    "Immediate",
    "Immediately"
    "Quick",
    "Error",
    "EOD",
    "End of Day",
    "High Priority",
    "High Severity",
    "Overdue",
    "Due",
    "Quick"
    ]

# Create a regex pattern for the keywords
pattern = "(" + "|".join(keywords) + ")"
re = re.compile(pattern, flags=re.IGNORECASE)

# Search for the keywords in the subject and body
urgent_emails_df = email_df[email_df["subject"].str.contains(re) | email_df["body"].str.contains(re)]
urgent_emails_df.shape


  urgent_emails_df = email_df[email_df["subject"].str.contains(re) | email_df["body"].str.contains(re)]


(246, 4)

In [8]:
# Add column urgent/ non-urgent to the dataframe based on urgent_emails_df
email_df['urgent'] = email_df.index.isin(urgent_emails_df.index)

In [9]:
email_df.sample(10)
# export the dataframe to a csv file
email_df.to_csv('emails_cleaned.csv', index=False)

### Data Visualization