## **Classifier POC - Urgent vs. Non-Urgent Messages**
### Data preparation

In [516]:
import pandas as pd
import numpy as np

df = pd.read_csv('emails_new.csv')
df.head(3)

Unnamed: 0,file,message
0,arnold-j/sent_items/315.,Message-ID: <31189540.1075852718882.JavaMail.e...
1,lenhart-m/sent/1210.,Message-ID: <932989.1075858142302.JavaMail.eva...
2,symes-k/sent/1136.,Message-ID: <20692876.1075841905289.JavaMail.e...


In [517]:
#extract subject from message
df['subject'] = df['message'].str.extract(r'Subject: (.*)\n')
#remove any RE: or FW: from the subject
df.subject.apply(lambda x: x.replace('RE:','').replace('FW:',''))

0                                                        
1                             Re: suicide pool - nfl 2000
2                                 Re: 4/10 checkout con't
3                                           Energy Issues
4                                             2001 budget
                              ...                        
1495                                   TW Weekly 11-17-00
1496                                       New York, etc.
1497     Letter re Unpaid Invoice for Post petition De...
1498                                                     
1499                    Tasters next thursday! (11-16-00)
Name: subject, Length: 1500, dtype: object

In [518]:
df.sample(10)

Unnamed: 0,file,message,subject
856,mann-k/_sent_mail/4045.,Message-ID: <3014784.1075846127593.JavaMail.ev...,Re: Agency Agreement
1298,stclair-c/all_documents/1001.,Message-ID: <31458590.1075842105073.JavaMail.e...,Re: Important United Way Reminder
842,jones-t/all_documents/5063.,Message-ID: <32757033.1075847038247.JavaMail.e...,Destec Properties Limited Partnership
1054,skilling-j/all_documents/169.,Message-ID: <4290768.1075840070871.JavaMail.ev...,Keith Dodson Hiring Form
892,germany-c/bankrupt/cleburne_lone_star/38.,Message-ID: <10437684.1075840493489.JavaMail.e...,FW: Lone Star for the 10th
234,rodrique-r/discussion_threads/228.,Message-ID: <4032859.1075858346313.JavaMail.ev...,VAR
1214,hernandez-j/notes_inbox/302.,Message-ID: <18575276.1075855558637.JavaMail.e...,FW: A Perfect World
1150,beck-s/eol/3.,Message-ID: <17379575.1075849852146.JavaMail.e...,RE: EOL Average Deal Count as of 3-14-01
1409,weldon-c/all_documents/88.,Message-ID: <23436220.1075851679788.JavaMail.e...,Re:
270,williams-w3/schedule_crawler/87.,Message-ID: <27427682.1075839950595.JavaMail.e...,Start Date: 2/3/02; HourAhead hour: 12;


### Feature Engineering

In [519]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to', 'subject']
    for line in lines:
        if ':' not in line:
            message += line.replace('-----Original Message-----', '. ').strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[-1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': map_to_list(emails, 'body'),
        'to': map_to_list(emails, 'to'),
        'from': map_to_list(emails, 'from'),
        'subject': map_to_list(emails, 'subject')
    }
    
def map_to_list(emails, key):
    results = []
    for email in emails:
        if key not in email:
            results.append('')
        else:
            results.append(email[key])
    return results

In [520]:
email_df = pd.DataFrame(parse_into_emails(df.message))
email_df.sample(10)
email_df.shape

(1500, 4)

In [521]:
# drop any records with both subject and body missing
missing = (email_df.body == '') & (email_df.subject == '')
email_df = email_df[~missing]
email_df.shape

(1499, 4)

## Data Analysis based on keywords

In [522]:
import re

# Keywords to search for
keywords = [
    "System Down", 
    "Emergency",
    "Alert"
    "Critical",
    "Urgent",
    "ASAP",
    "Important",
    "Action Required",
    "Deadline",
    "Immediate",
    "Immediately"
    "Quick",
    "Error",
    "EOD",
    "End of Day",
    "High Priority",
    "High Severity",
    "Overdue",
    "Due",
    "Quick"
    ]

# Create a regex pattern for the keywords
pattern = "(" + "|".join(keywords) + ")"
re = re.compile(pattern, flags=re.IGNORECASE)

# Search for the keywords in the subject and body
urgent_emails_df = email_df[email_df["subject"].str.contains(re) | email_df["body"].str.contains(re)]
urgent_emails_df.shape


  urgent_emails_df = email_df[email_df["subject"].str.contains(re) | email_df["body"].str.contains(re)]


(257, 4)

In [None]:
# Add column urgent/ non-urgent to the dataframe based on urgent_emails_df
email_df['urgent'] = email_df.index.isin(urgent_emails_df.index)

In [524]:
email_df.sample(10)

Unnamed: 0,body,to,from,subject,urgent
1078,Scud incoming....---------------------- Forwar...,"William S Bradford/HOU/ECT@ECT, Tanya Rohauer/...",30 AM,Imperial Holly Corporation,False
1010,"Daren - meter 5192 flowed 8 dth on 12/19, 33 d...",daren.farmer@enron.com,aimee.lannou@enron.com,Dec 00,False
141,"Joe,We are extremely pleased with the support ...",Vince J Kaminski/HOU/ECT@ECT,31 AM,Recent Hardware Repair,True
1215,Bob -Attached is the final version of the BETA...,bob.shults@enron.com,mark.greenberg@enron.com,BETA,False
1164,<<CSFB_EnergyTech_dec7.pdf>>* Siemens Westingh...,,owner-eqr_etcorporate@fir.fbc.com,Energy Technology Bulletin - Issue 40,False
527,"Tim,Thanks for the update..Here's the forecast...","Allen, Phillip K.","Heizenrader, Timothy",pnw-march1-forecast-gas-update-summary.xls,False
452,Is this okay for you - what hotel did you deci...,"Wadle, Susan",Marianne Long <marianne.long@travelpark.com>@E...,plz review and approve travel for kevin hyatt,False
233,"richard.sanders@enron.com, james.studdert@enro...",britt.davis@enron.com,paul.henking@enron.com,In re M/V PACIFIC VIRGO,True
658,Deal 312610 purchase at CentanaDeal 312613 Sal...,Daren J Farmer/HOU/ECT@ECT,daren.farmer@enron.com,PanEnergy Marketing March 2000 Production,False
1057,"Lousie,My group gathers the gas and power phys...","=09Hodges, Georganne; Helton, Susan","=09Guerra, Ricardo =20",=09North America Physical Volumes,False


### Data Visualization