In [285]:
import json

import pandas as pd
import spacy

In [286]:
enron_df = pd.read_pickle('enron_students.pkl')
enron_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Date            517401 non-null  object
 1   From            517401 non-null  object
 2   To              495554 non-null  object
 3   X-To            517372 non-null  object
 4   X-From          517372 non-null  object
 5   X-cc            517372 non-null  object
 6   X-bcc           517372 non-null  object
 7   Subject         517401 non-null  object
 8   email_body      517401 non-null  object
 9   verdict         517401 non-null  object
 10  violated_rules  517401 non-null  object
dtypes: object(11)
memory usage: 43.4+ MB


In [287]:
# constans for run

SHOULD_RUN_ON_SAMPLE = False # Allow quick run for testing purposes todo maor: change to False
SAMPLE_SIZE = 5000
RANDOM_STATE = 3
MAX_DOC_SIZE_FOR_SPACY = 2000

In [288]:
# pre process
def remove_attachment_text(text):
    keywords = [
        'This is a multi-part message in MIME format',
        'Content-Disposition: attachment;',
        'Content-Type: multipart/mixed;'
    ]
    
    for keyword in keywords:
        index = text.find(keyword)
        if index != -1:
            return text[:index]  # Return the text up to the found keyword
    return text  # Return the original text if no keywords are found
    

def parse_contacts(data):
    # count the number of '~' in the data to check if its notes
    count = data.count('~')
    if count < 30:
        return data

    # Normalize the data by removing line continuation characters
    data = data.replace("=\n", "")  # Assumes `=` at the end of the line followed by a newline

    # Split the data into individual records on '#'
    records = data.split('#')

    # Initialize a list to store parsed contacts
    contacts = []

    # Iterate through each record
    for record in records:
        # Split the record into fields using '~'
        fields = record.split('~')

        # TODO: need to check relevant fields
        if len(fields) > 21:  # Check to ensure it's a valid record
            contact = {
                'first_name': fields[1].strip(),
                'last_name': fields[3].strip(),
                'phone_numbers': fields[11:14],
                'position': fields[15].strip(),
                'company': fields[18].strip(),
                'email': fields[21].strip() if len(fields) > 21 else None  # Safeguard for missing email
            }
            contacts.append(contact)
            # print('contact:', contact)

    if not contacts:
        return ' '
    return json.dumps(contacts)


def remove_substring(text, start, end):
    # Regex pattern to match a substring that starts with 'X-d' and ends with 'subject'
    pattern = rf'{start}.*?{end}'
    # Replace the matching substring with an empty string
    cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL)
    return cleaned_text

# remove forward text
def remove_forward_text(text: str):
    text = text.replace('FW:', '')
    text = text.replace('RE:', '')
    text = remove_substring(text, '----------- Forwarded', 'Subject')
    text = text.replace('-', '')
    text = text.replace('\\n', ' ')
    text = text.replace(':', ' ')
    return text
    

def data_cleaning(enron_df):
    # enron_df['email_text'] = enron_df['email_text'].apply(remove_attachment_text)
    # print('removed attachments')

    # remove attachment file if remove_attachment_text is true
    # Remove rows where remove_attachment_text returns True
    enron_df['email_text'] = enron_df['email_text'].apply(remove_attachment_text)

    # print('removed attachments')

    enron_df['email_text'] = enron_df['email_text'].apply(remove_forward_text)
    print('removed foward text')

    enron_df['email_text'] = enron_df['email_text'].apply(parse_contacts)
    print('parsed contacts')

    enron_df['email_text'] = enron_df['email_text'].fillna(' ')
    
enron_df['email_text'] = enron_df['Subject'] + ' ' + enron_df['email_body']
data_cleaning(enron_df)


removed foward text
parsed contacts


In [289]:
violating_rule_2_df = enron_df[enron_df['violated_rules'].str.contains('2.', na=False)]
non_violating_rule_2_df = enron_df[enron_df['violated_rules'].str.contains('2.', na=False) == False]


nlp = spacy.load('en_core_web_sm')
nlp.max_length = 1500000

def print_lables(email_body):
    doc = nlp(email_body)
    print(f'Email body: {email_body} | Entities: {[(ent.text, ent.label_) for ent in doc.ents]}')

counter = 0
truncated = 0
def get_entities(document):
    global counter
    global truncated
    counter+= 1
    if counter % 200 == 0:
        print(f'Processed {counter} documents')
    try:
        doc_len = len(document)
        if doc_len > MAX_DOC_SIZE_FOR_SPACY:
            document = document[:MAX_DOC_SIZE_FOR_SPACY]
            truncated+= 1
        doc = nlp(document)
        return [(ent.text, ent.label_) for ent in doc.ents]

    except ValueError as e :
        print(f'Error in document. Error: {e}')
        return []

if SHOULD_RUN_ON_SAMPLE:
    test_df = violating_rule_2_df.sample(SAMPLE_SIZE, random_state = RANDOM_STATE).copy()
else:
    test_df = violating_rule_2_df.copy()
    
print(f"I've copied the DF of size: {test_df.shape}")
test_df.head()
test_df['entities'] = test_df['email_text'].apply(get_entities)
print(f'Truncated {truncated} documents')
test_df.head(200)





I've copied the DF of size: (34893, 12)
Processed 200 documents
Processed 400 documents
Processed 600 documents
Processed 800 documents
Processed 1000 documents
Processed 1200 documents
Processed 1400 documents
Processed 1600 documents
Processed 1800 documents
Processed 2000 documents
Processed 2200 documents
Processed 2400 documents
Processed 2600 documents
Processed 2800 documents
Processed 3000 documents
Processed 3200 documents
Processed 3400 documents
Processed 3600 documents
Processed 3800 documents
Processed 4000 documents
Processed 4200 documents
Processed 4400 documents
Processed 4600 documents
Processed 4800 documents
Processed 5000 documents
Processed 5200 documents
Processed 5400 documents
Processed 5600 documents
Processed 5800 documents
Processed 6000 documents
Processed 6200 documents
Processed 6400 documents
Processed 6600 documents
Processed 6800 documents
Processed 7000 documents
Processed 7200 documents
Processed 7400 documents
Processed 7600 documents
Processed 7800

Unnamed: 0,Date,From,To,X-To,X-From,X-cc,X-bcc,Subject,email_body,verdict,violated_rules,email_text,entities
8,"Tue, 17 Oct 2000 02:26:00 -0700 (PDT)",phillip.allen@enron.com,mark.scott@enron.com,Mark Scott,Phillip K Allen,,,Re: High Speed Internet Access,1. login: pallen pw: ke9davis\n\n I don't thi...,BLOCK,2.3,Re High Speed Internet Access 1. login pall...,"[(ISP \n\n 2, ORG), (IP, ORG), (IP, ORG), (25..."
9,"Mon, 16 Oct 2000 06:44:00 -0700 (PDT)",phillip.allen@enron.com,zimam@enron.com,zimam@enron.com,Phillip K Allen,,,FW: fixed forward or other Collar floor gas pr...,---------------------- Forwarded by Phillip K ...,BLOCK,2.1,fixed forward or other Collar floor gas price...,"[(Phillip, PERSON), (Microturbine, PERSON), (H..."
10,"Mon, 16 Oct 2000 06:42:00 -0700 (PDT)",phillip.allen@enron.com,buck.buckner@honeywell.com,"""Buckner, Buck"" <buck.buckner@honeywell.com> @...",Phillip K Allen,,,Re: FW: fixed forward or other Collar floor ga...,"Mr. Buckner,\n\n For delivered gas behind San ...",BLOCK,"1.3,2.1",Re fixed forward or other Collar floor gas p...,"[(Buckner, PERSON), (San Diego, GPE), (Enron E..."
41,"Tue, 19 Sep 2000 09:35:00 -0700 (PDT)",phillip.allen@enron.com,pallen70@hotmail.com,pallen70@hotmail.com,Phillip K Allen,,,Westgate Proforma-Phillip Allen.xls,---------------------- Forwarded by Phillip K ...,BLOCK,"1.3,2.1,2.3",Westgate ProformaPhillip Allen.xls Westgate ...,"[(Westgate ProformaPhillip Allen.xls, PERSON),..."
51,"Mon, 11 Sep 2000 09:57:00 -0700 (PDT)",phillip.allen@enron.com,keith.holst@enron.com,Keith Holst,Phillip K Allen,,,Westgate Proforma-Phillip Allen.xls,---------------------- Forwarded by Phillip K ...,BLOCK,2.1,Westgate ProformaPhillip Allen.xls Westgate ...,"[(Westgate ProformaPhillip Allen.xls, PERSON),..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2239,"Tue, 14 Aug 2001 14:43:26 -0700 (PDT)",k..allen@enron.com,tec@editingco.com,'tec@editingco.com',"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",'pallen70@hotmail.com',,,"Richard,\n\nHere is the power point presentati...",BLOCK,"1.3,2.3","Richard,\n\nHere is the power point presentat...","[(Richard, PERSON), (tomorrow, DATE), (8 AM, T..."
2240,"Wed, 15 Aug 2001 07:26:01 -0700 (PDT)",k..allen@enron.com,rickm@wt.net,'rickm@wt.net',"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,FW:,\n\n -----Original Message-----\nFrom: \tAllen...,BLOCK,"1.3,2.3","\n\n Original Message\nFrom \tAllen, Phillip...","[(Phillip K. \nSent, PERSON), (Tuesday, DATE)..."
2277,"Mon, 10 Sep 2001 06:56:07 -0700 (PDT)",k..allen@enron.com,mery.l.brown@accenture.com,'mery.l.brown@accenture.com@ENRON' <IMCEANOTES...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,RE: Simulation Common Mistakes,Can you send me a schedule of meetings? Is th...,BLOCK,"1.3,2.2",Simulation Common Mistakes Can you send me a ...,"[(today, DATE), (Phillip, PERSON), (IMCEANOTES..."
2281,"Mon, 10 Sep 2001 11:55:36 -0700 (PDT)",k..allen@enron.com,yevgeny.frolov@enron.com,"Frolov, Yevgeny </O=ENRON/OU=NA/CN=RECIPIENTS/...","Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,RE: Outing Event on Ultra Sailing Yacht set up...,"Yevgeny,\n\nCan you send me a schedule of meet...",BLOCK,2.3,Outing Event on Ultra Sailing Yacht set up fo...,"[(Ultra Sailing Yacht, ORG), (Enron/Accenture\..."


In [290]:
test_df.head(200)

QUIDS = ['ORG', 'GPE', 'LOW', 'FAC', 'LOC']
SENSITIVE = ['MONEY', 'PERCENT', 'NORP', 'PRODUCT', 'EVENT']
POTENTIALLY_SENSITIVE = ['DATE', 'TIME', 'QUANTITY', 'ORDINAL', 'CARDINAL', 'PERSON']
unique_entities = set()

for entities in test_df['entities']:
    for ent in entities:
        unique_entities.add(ent[1])
        
test_df['sensitive'] = test_df['entities'].apply(lambda x: [(ent[0], ent[1]) for ent in x if ent[1] in SENSITIVE])
test_df['quids'] = test_df['entities'].apply(lambda x: [(ent[0], ent[1]) for ent in x if ent[1] in QUIDS])
test_df['potentially_sensitive'] = test_df['entities'].apply(lambda x: [(ent[0], ent[1]) for ent in x if ent[1] in POTENTIALLY_SENSITIVE])

test_df.head(200)


Unnamed: 0,Date,From,To,X-To,X-From,X-cc,X-bcc,Subject,email_body,verdict,violated_rules,email_text,entities,sensitive,quids,potentially_sensitive
8,"Tue, 17 Oct 2000 02:26:00 -0700 (PDT)",phillip.allen@enron.com,mark.scott@enron.com,Mark Scott,Phillip K Allen,,,Re: High Speed Internet Access,1. login: pallen pw: ke9davis\n\n I don't thi...,BLOCK,2.3,Re High Speed Internet Access 1. login pall...,"[(ISP \n\n 2, ORG), (IP, ORG), (IP, ORG), (25...",[],"[(ISP \n\n 2, ORG), (IP, ORG), (IP, ORG), (DN...","[(255.255.255.248, CARDINAL), (151.164.1.8, CA..."
9,"Mon, 16 Oct 2000 06:44:00 -0700 (PDT)",phillip.allen@enron.com,zimam@enron.com,zimam@enron.com,Phillip K Allen,,,FW: fixed forward or other Collar floor gas pr...,---------------------- Forwarded by Phillip K ...,BLOCK,2.1,fixed forward or other Collar floor gas price...,"[(Phillip, PERSON), (Microturbine, PERSON), (H...","[(Burnertip, NORP)]","[(Honeywell, ORG), (San Diego, GPE), (San Dieg...","[(Phillip, PERSON), (Microturbine, PERSON), (1..."
10,"Mon, 16 Oct 2000 06:42:00 -0700 (PDT)",phillip.allen@enron.com,buck.buckner@honeywell.com,"""Buckner, Buck"" <buck.buckner@honeywell.com> @...",Phillip K Allen,,,Re: FW: fixed forward or other Collar floor ga...,"Mr. Buckner,\n\n For delivered gas behind San ...",BLOCK,"1.3,2.1",Re fixed forward or other Collar floor gas p...,"[(Buckner, PERSON), (San Diego, GPE), (Enron E...",[],"[(San Diego, GPE), (Enron Energy Services, ORG...","[(Buckner, PERSON), (7138537107, DATE), (Phill..."
41,"Tue, 19 Sep 2000 09:35:00 -0700 (PDT)",phillip.allen@enron.com,pallen70@hotmail.com,pallen70@hotmail.com,Phillip K Allen,,,Westgate Proforma-Phillip Allen.xls,---------------------- Forwarded by Phillip K ...,BLOCK,"1.3,2.1,2.3",Westgate ProformaPhillip Allen.xls Westgate ...,"[(Westgate ProformaPhillip Allen.xls, PERSON),...","[(over 40%, PERCENT), (680,000, MONEY)]","[(Austin, GPE), (the ""Smart Growth Corridor, O...","[(Westgate ProformaPhillip Allen.xls, PERSON),..."
51,"Mon, 11 Sep 2000 09:57:00 -0700 (PDT)",phillip.allen@enron.com,keith.holst@enron.com,Keith Holst,Phillip K Allen,,,Westgate Proforma-Phillip Allen.xls,---------------------- Forwarded by Phillip K ...,BLOCK,2.1,Westgate ProformaPhillip Allen.xls Westgate ...,"[(Westgate ProformaPhillip Allen.xls, PERSON),...","[(over 40%, PERCENT), (680,000, MONEY)]","[(Austin, GPE), (the ""Smart Growth Corridor, O...","[(Westgate ProformaPhillip Allen.xls, PERSON),..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2239,"Tue, 14 Aug 2001 14:43:26 -0700 (PDT)",k..allen@enron.com,tec@editingco.com,'tec@editingco.com',"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",'pallen70@hotmail.com',,,"Richard,\n\nHere is the power point presentati...",BLOCK,"1.3,2.3","Richard,\n\nHere is the power point presentat...","[(Richard, PERSON), (tomorrow, DATE), (8 AM, T...",[],[],"[(Richard, PERSON), (tomorrow, DATE), (8 AM, T..."
2240,"Wed, 15 Aug 2001 07:26:01 -0700 (PDT)",k..allen@enron.com,rickm@wt.net,'rickm@wt.net',"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,FW:,\n\n -----Original Message-----\nFrom: \tAllen...,BLOCK,"1.3,2.3","\n\n Original Message\nFrom \tAllen, Phillip...","[(Phillip K. \nSent, PERSON), (Tuesday, DATE)...",[],[],"[(Phillip K. \nSent, PERSON), (Tuesday, DATE)..."
2277,"Mon, 10 Sep 2001 06:56:07 -0700 (PDT)",k..allen@enron.com,mery.l.brown@accenture.com,'mery.l.brown@accenture.com@ENRON' <IMCEANOTES...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,RE: Simulation Common Mistakes,Can you send me a schedule of meetings? Is th...,BLOCK,"1.3,2.2",Simulation Common Mistakes Can you send me a ...,"[(today, DATE), (Phillip, PERSON), (IMCEANOTES...",[],"[(Cc \tFrolov, ORG), (tim.orourke@enron.com, O...","[(today, DATE), (Phillip, PERSON), (IMCEANOTES..."
2281,"Mon, 10 Sep 2001 11:55:36 -0700 (PDT)",k..allen@enron.com,yevgeny.frolov@enron.com,"Frolov, Yevgeny </O=ENRON/OU=NA/CN=RECIPIENTS/...","Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,RE: Outing Event on Ultra Sailing Yacht set up...,"Yevgeny,\n\nCan you send me a schedule of meet...",BLOCK,2.3,Outing Event on Ultra Sailing Yacht set up fo...,"[(Ultra Sailing Yacht, ORG), (Enron/Accenture\...","[(Dutch, NORP), (Reese, NORP), (Solis, NORP)]","[(Ultra Sailing Yacht, ORG), (Enron/Accenture\...","[(Yevgeny, PERSON), (today, DATE), (this after..."


In [291]:
class PII:
    def __init__(self, entity_type, score, text):
        self.entity_type = entity_type
        self.score = score
        self.text = text
        
    def __str__(self):
        return f'(Entity: {self.entity_type} | Score: {self.score} | Text: {self.text})'

In [292]:
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()

counter = 0
def analyze_pii(email_body_text):
    global counter
    counter+= 1
    if counter % 200 == 0:
        print(f'Processed {counter} documents')
    pii_list = []
    email_body_text = email_body_text[:2000]
    
    try:
        results = analyzer.analyze(text=email_body_text, language='en')
    except Exception as e:
        print(f'Error in document. Error: {e}')
        return []
    
    for result in results:
        pii_text = build_pii_text(email_body_text, result)
        pii = PII(result.entity_type, result.score, pii_text)
        pii_list.append(pii)

    return pii_list


def build_pii_text(email_body_text, result):
    start = result.start
    end = result.end
    pii_text = email_body_text[start:end]
    return pii_text

In [293]:
new_df = test_df.copy()
new_df['pii'] = new_df['email_text'].apply(lambda x: analyze_pii(x))


Processed 200 documents
Processed 400 documents
Processed 600 documents
Processed 800 documents
Processed 1000 documents
Processed 1200 documents
Processed 1400 documents
Processed 1600 documents
Processed 1800 documents
Processed 2000 documents
Processed 2200 documents
Processed 2400 documents
Processed 2600 documents
Processed 2800 documents
Processed 3000 documents
Processed 3200 documents
Processed 3400 documents
Processed 3600 documents
Processed 3800 documents
Processed 4000 documents
Processed 4200 documents
Processed 4400 documents
Processed 4600 documents
Processed 4800 documents
Processed 5000 documents
Processed 5200 documents
Processed 5400 documents
Processed 5600 documents
Processed 5800 documents
Processed 6000 documents
Processed 6200 documents
Processed 6400 documents
Processed 6600 documents
Processed 6800 documents
Processed 7000 documents
Processed 7200 documents
Processed 7400 documents
Processed 7600 documents
Processed 7800 documents
Processed 8000 documents
Proc

In [294]:
new_df.head(200)

Unnamed: 0,Date,From,To,X-To,X-From,X-cc,X-bcc,Subject,email_body,verdict,violated_rules,email_text,entities,sensitive,quids,potentially_sensitive,pii
8,"Tue, 17 Oct 2000 02:26:00 -0700 (PDT)",phillip.allen@enron.com,mark.scott@enron.com,Mark Scott,Phillip K Allen,,,Re: High Speed Internet Access,1. login: pallen pw: ke9davis\n\n I don't thi...,BLOCK,2.3,Re High Speed Internet Access 1. login pall...,"[(ISP \n\n 2, ORG), (IP, ORG), (IP, ORG), (25...",[],"[(ISP \n\n 2, ORG), (IP, ORG), (IP, ORG), (DN...","[(255.255.255.248, CARDINAL), (151.164.1.8, CA...",[(Entity: IP_ADDRESS | Score: 0.95 | Text: 64....
9,"Mon, 16 Oct 2000 06:44:00 -0700 (PDT)",phillip.allen@enron.com,zimam@enron.com,zimam@enron.com,Phillip K Allen,,,FW: fixed forward or other Collar floor gas pr...,---------------------- Forwarded by Phillip K ...,BLOCK,2.1,fixed forward or other Collar floor gas price...,"[(Phillip, PERSON), (Microturbine, PERSON), (H...","[(Burnertip, NORP)]","[(Honeywell, ORG), (San Diego, GPE), (San Dieg...","[(Phillip, PERSON), (Microturbine, PERSON), (1...","[(Entity: PERSON | Score: 0.85 | Text: degen),..."
10,"Mon, 16 Oct 2000 06:42:00 -0700 (PDT)",phillip.allen@enron.com,buck.buckner@honeywell.com,"""Buckner, Buck"" <buck.buckner@honeywell.com> @...",Phillip K Allen,,,Re: FW: fixed forward or other Collar floor ga...,"Mr. Buckner,\n\n For delivered gas behind San ...",BLOCK,"1.3,2.1",Re fixed forward or other Collar floor gas p...,"[(Buckner, PERSON), (San Diego, GPE), (Enron E...",[],"[(San Diego, GPE), (Enron Energy Services, ORG...","[(Buckner, PERSON), (7138537107, DATE), (Phill...",[(Entity: UK_NHS | Score: 1.0 | Text: 71385371...
41,"Tue, 19 Sep 2000 09:35:00 -0700 (PDT)",phillip.allen@enron.com,pallen70@hotmail.com,pallen70@hotmail.com,Phillip K Allen,,,Westgate Proforma-Phillip Allen.xls,---------------------- Forwarded by Phillip K ...,BLOCK,"1.3,2.1,2.3",Westgate ProformaPhillip Allen.xls Westgate ...,"[(Westgate ProformaPhillip Allen.xls, PERSON),...","[(over 40%, PERCENT), (680,000, MONEY)]","[(Austin, GPE), (the ""Smart Growth Corridor, O...","[(Westgate ProformaPhillip Allen.xls, PERSON),...",[(Entity: LOCATION | Score: 0.85 | Text: Austi...
51,"Mon, 11 Sep 2000 09:57:00 -0700 (PDT)",phillip.allen@enron.com,keith.holst@enron.com,Keith Holst,Phillip K Allen,,,Westgate Proforma-Phillip Allen.xls,---------------------- Forwarded by Phillip K ...,BLOCK,2.1,Westgate ProformaPhillip Allen.xls Westgate ...,"[(Westgate ProformaPhillip Allen.xls, PERSON),...","[(over 40%, PERCENT), (680,000, MONEY)]","[(Austin, GPE), (the ""Smart Growth Corridor, O...","[(Westgate ProformaPhillip Allen.xls, PERSON),...",[(Entity: LOCATION | Score: 0.85 | Text: Austi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2239,"Tue, 14 Aug 2001 14:43:26 -0700 (PDT)",k..allen@enron.com,tec@editingco.com,'tec@editingco.com',"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",'pallen70@hotmail.com',,,"Richard,\n\nHere is the power point presentati...",BLOCK,"1.3,2.3","Richard,\n\nHere is the power point presentat...","[(Richard, PERSON), (tomorrow, DATE), (8 AM, T...",[],[],"[(Richard, PERSON), (tomorrow, DATE), (8 AM, T...",[(Entity: UK_NHS | Score: 1.0 | Text: 71346386...
2240,"Wed, 15 Aug 2001 07:26:01 -0700 (PDT)",k..allen@enron.com,rickm@wt.net,'rickm@wt.net',"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,FW:,\n\n -----Original Message-----\nFrom: \tAllen...,BLOCK,"1.3,2.3","\n\n Original Message\nFrom \tAllen, Phillip...","[(Phillip K. \nSent, PERSON), (Tuesday, DATE)...",[],[],"[(Phillip K. \nSent, PERSON), (Tuesday, DATE)...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: t...
2277,"Mon, 10 Sep 2001 06:56:07 -0700 (PDT)",k..allen@enron.com,mery.l.brown@accenture.com,'mery.l.brown@accenture.com@ENRON' <IMCEANOTES...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,RE: Simulation Common Mistakes,Can you send me a schedule of meetings? Is th...,BLOCK,"1.3,2.2",Simulation Common Mistakes Can you send me a ...,"[(today, DATE), (Phillip, PERSON), (IMCEANOTES...",[],"[(Cc \tFrolov, ORG), (tim.orourke@enron.com, O...","[(today, DATE), (Phillip, PERSON), (IMCEANOTES...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: m...
2281,"Mon, 10 Sep 2001 11:55:36 -0700 (PDT)",k..allen@enron.com,yevgeny.frolov@enron.com,"Frolov, Yevgeny </O=ENRON/OU=NA/CN=RECIPIENTS/...","Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,RE: Outing Event on Ultra Sailing Yacht set up...,"Yevgeny,\n\nCan you send me a schedule of meet...",BLOCK,2.3,Outing Event on Ultra Sailing Yacht set up fo...,"[(Ultra Sailing Yacht, ORG), (Enron/Accenture\...","[(Dutch, NORP), (Reese, NORP), (Solis, NORP)]","[(Ultra Sailing Yacht, ORG), (Enron/Accenture\...","[(Yevgeny, PERSON), (today, DATE), (this after...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: e...


In [295]:
unique_entities_presidio = set()
for index, row in new_df.iterrows():
    for pii in row['pii']:
        unique_entities_presidio.add(pii.entity_type)
        
print(unique_entities_presidio)
print(f'Number of unique entities: {len(unique_entities_presidio)}')

{'EMAIL_ADDRESS', 'IP_ADDRESS', 'AU_ACN', 'URL', 'US_ITIN', 'IN_AADHAAR', 'LOCATION', 'UK_NHS', 'AU_TFN', 'US_BANK_NUMBER', 'IN_PAN', 'PERSON', 'DATE_TIME', 'US_DRIVER_LICENSE', 'IN_VEHICLE_REGISTRATION', 'SG_NRIC_FIN', 'AU_MEDICARE', 'US_SSN', 'US_PASSPORT', 'MEDICAL_LICENSE', 'NRP', 'AU_ABN', 'CREDIT_CARD', 'PHONE_NUMBER'}
Number of unique entities: 24


In [296]:
import re


def find_all_ssn(text):
    return re.findall(r'\d{3}-\d{2}-\d{4}', text)

def find_all_credit_cards(text):
    return re.findall(r'\d{4}-\d{4}-\d{4}-\d{4}', text) 

def find_all_phone_numbers(text):
    return re.findall(r'\(?\d{3}\)?\s*-\s*\d{3}\s*-\s*\d{4}', text) 

def find_sensitive_words(text):
    return re.findall(r'password|attach|confidential', text.lower())


text = "Wow. my my password is: abc31234 . Go and buy flight tickets with my credit card: 9591-9878-3320-0031. SSN: 123-45-6789. My phone number is (531)-223-7709', '(531) - 223 - 7709' or 531 -223-7709', 713-646-3490 '(531) - 223 - 7709'. I live in New York. I am 25 years old. I was born on 12/12/1996. I am 1.75m tall. attached as a file my password: 20212. attachment"

print(find_all_ssn(text))
print(find_all_credit_cards(text))
print(find_all_phone_numbers(text))
print(find_sensitive_words(text))

['123-45-6789']
['9591-9878-3320-0031']
['(531)-223-7709', '(531) - 223 - 7709', '531 -223-7709', '713-646-3490', '(531) - 223 - 7709']
['password', 'attach', 'password', 'attach']


In [297]:
NON_SENSITIVE = ['EMAIL_ADDRESS', 'URL', 'ORG']
SENSITIVE = ['IP_ADDRESS', 'AU_ACN', 'US_ITIN', 'UK_NHS', 'AU_TFN', 'US_BANK_NUMBER', 'IN_PAN', 'US_DRIVER_LICENSE', 'IN_VEHICLE_REGISTRATION', 'SG_NRIC_FIN', 'US_SSN', 'US_PASSPORT', 'MEDICAL_LICENSE', 'PHONE_NUMBER']
QUASI_SENSITIVE = ['NRP', 'LOCATION', 'PERSON', 'DATE_TIME', 'GPE']

def predict_presidio_verdict(row):
    persons_counter = 0
    if row is None:
        return 'Non-sensitive'
    
    email_text = row['email_text']
    if email_text is None:
        return 'Non-sensitive'
    if len(find_sensitive_words(row['email_text'])) > 0 or len(find_all_phone_numbers(email_text)) > 0 or len(
            find_all_credit_cards(email_text)) > 0 or len(find_all_ssn(email_text)) > 0:
        return 'Sensitive'

    if row['pii'] is None:
        return 'Non-sensitive'
    for pii in row['pii']:
        if pii.score < 0.5:
            continue
        if pii.entity_type.startswith('IN_'):
            continue
        if pii.entity_type in NON_SENSITIVE:
            continue
        if pii.entity_type in SENSITIVE:
            return 'Sensitive'
        if pii.entity_type in QUASI_SENSITIVE: # Todo maor - if we have 2 or more QUIDS, we consider it sensitive in 2.3
            if pii.entity_type == 'PERSON':
                persons_counter += 1
                if persons_counter > 2:
                    return 'Sensitive'
            else:
                return 'Sensitive'
            
        else:
            print(f'MA ZE: {ent}')
        
    return 'Non-sensitive'
    

def predict_spacy_verdict(row):
    count_persons = 0
    if len(find_sensitive_words(row['email_text'])) > 0:
        return 'Sensitive'
    if row['quids'] and len(row['quids']) > 0: # Todo maor - if we have 2 or more QUIDS, we consider it sensitive in 2.3
        return 'Sensitive'
    elif row['sensitive'] and len(row['sensitive']) > 0:
        return 'Sensitive'
    elif row['potentially_sensitive'] and len(row['potentially_sensitive']) > 0:
        for ent in row['potentially_sensitive']:
            if ent[1] == 'DATE':
                continue
            elif ent[1] == 'TIME':
                continue
            elif ent[1] == 'QUANTITY':
                continue
            elif ent[1] == 'ORDINAL':
                continue
            elif ent[1] == 'CARDINAL':
                if len(find_all_phone_numbers(ent[0])) > 0 or len(find_all_credit_cards(ent[0])) > 0 or len(find_all_ssn(ent[0])) > 0:
                    return 'Sensitive'
            elif ent[1] == 'PERSON':
                count_persons += 1
                if count_persons > 2:
                    return 'Sensitive'
                continue
        return 'Non-sensitive'
    else:
        return 'Non-sensitive'
    
new_df['spacy_verdict'] = new_df.apply(predict_spacy_verdict, axis=1)
new_df.head(10)
new_df['presidio_verdict'] = new_df.apply(predict_presidio_verdict, axis=1)
spacy_didnt_blocked_but_blocked_in_dataset = new_df[new_df['spacy_verdict'] == 'Non-sensitive']
spacy_didnt_blocked_but_blocked_in_dataset.head(200)

presidio_didnt_blocked_but_blocked_in_dataset = new_df[new_df['presidio_verdict'] == 'Non-sensitive']
presidio_didnt_blocked_but_blocked_in_dataset.head(200)

presidio_accuracy = 100 - (100 * len(presidio_didnt_blocked_but_blocked_in_dataset) / len(new_df))
spacy_accuracy = 100 -(100 * len(spacy_didnt_blocked_but_blocked_in_dataset) / len(new_df))
print(f'Presidio accuracy: {presidio_accuracy}% with {len(presidio_didnt_blocked_but_blocked_in_dataset)} emails left out of {len(new_df)}')
print(f'Spacy accuracy: {spacy_accuracy}% with {len(spacy_didnt_blocked_but_blocked_in_dataset)} emails left out of {len(new_df)}')

MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chicago', 'GPE')
MA ZE: ('Chi

In [298]:
presidio_didnt_blocked_but_blocked_in_dataset.head(200)

Unnamed: 0,Date,From,To,X-To,X-From,X-cc,X-bcc,Subject,email_body,verdict,violated_rules,email_text,entities,sensitive,quids,potentially_sensitive,pii,spacy_verdict,presidio_verdict
251,"Mon, 7 Feb 2000 08:53:00 -0800 (PST)",phillip.allen@enron.com,george.rahal@acnpower.com,"""George Rahal"" <george.rahal@acnpower.com> @ E...",Phillip K Allen,,,Re: W basis quotes,"George,\n\n Can you please call my credit desk...",BLOCK,"1.3,2.1","Re W basis quotes George,\n\n Can you please ...","[(George, PERSON), (7138531803, DATE), (ACN Po...",[],"[(ACN Power, ORG)]","[(George, PERSON), (7138531803, DATE), (Philli...",[(Entity: PERSON | Score: 0.85 | Text: George)...,Sensitive,Non-sensitive
599,"Mon, 23 Oct 2000 08:55:00 -0700 (PDT)",phillip.allen@enron.com,jedglick@hotmail.com,Jedglick@hotmail.com,Phillip K Allen,,,Enron,"Jed,\n\n I understand you have been contacted ...",BLOCK,"1.3,2.1","Enron Jed,\n\n I understand you have been cont...","[(Enron Jed, PERSON), (Enron, ORG), (Phillip A...",[],"[(Enron, ORG)]","[(Enron Jed, PERSON), (Phillip Allen, PERSON),...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: p...,Sensitive,Non-sensitive
849,"Mon, 7 Feb 2000 08:53:00 -0800 (PST)",phillip.allen@enron.com,george.rahal@acnpower.com,"""George Rahal"" <george.rahal@acnpower.com> @ E...",Phillip K Allen,,,Re: W basis quotes,"George,\n\n Can you please call my credit desk...",BLOCK,"1.3,2.1","Re W basis quotes George,\n\n Can you please ...","[(George, PERSON), (7138531803, DATE), (ACN Po...",[],"[(ACN Power, ORG)]","[(George, PERSON), (7138531803, DATE), (Philli...",[(Entity: PERSON | Score: 0.85 | Text: George)...,Sensitive,Non-sensitive
1222,"Mon, 23 Oct 2000 08:55:00 -0700 (PDT)",phillip.allen@enron.com,jedglick@hotmail.com,Jedglick@hotmail.com,Phillip K Allen,,,Enron,"Jed,\n\n I understand you have been contacted ...",BLOCK,"1.3,2.1","Enron Jed,\n\n I understand you have been cont...","[(Enron Jed, PERSON), (Enron, ORG), (Phillip A...",[],"[(Enron, ORG)]","[(Enron Jed, PERSON), (Phillip Allen, PERSON),...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: p...,Sensitive,Non-sensitive
1665,"Mon, 23 Oct 2000 08:55:00 -0700 (PDT)",phillip.allen@enron.com,jedglick@hotmail.com,Jedglick@hotmail.com,Phillip K Allen,,,Enron,"Jed,\n\n I understand you have been contacted ...",BLOCK,"1.3,2.1","Enron Jed,\n\n I understand you have been cont...","[(Enron Jed, PERSON), (Enron, ORG), (Phillip A...",[],"[(Enron, ORG)]","[(Enron Jed, PERSON), (Phillip Allen, PERSON),...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: p...,Sensitive,Non-sensitive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316995,"Fri, 1 Jun 2001 00:54:00 -0700 (PDT)",lorie.leigh@enron.com,kay.mann@enron.com,Kay Mann,Lorie Leigh,,,FPL construction letter agreement - Midway Pha...,I sent a print out of this letter on letterhea...,BLOCK,2.1,FPL construction letter agreement Midway Phas...,"[(FPL, ORG), (Midway Phase, ORG), (Ben, PERSON...",[],"[(FPL, ORG), (Midway Phase, ORG)]","[(Ben, PERSON), (Lorie Leigh\nEast Power Gener...","[(Entity: PERSON | Score: 0.85 | Text: Ben), (...",Sensitive,Non-sensitive
323216,"Thu, 3 Jan 2002 15:02:46 -0800 (PST)",danielle.marcinkowski@enron.com,"rudy.acevedo@enron.com, k..allen@enron.com, to...","Acevedo, Rudy </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Marcinkowski, Danielle </O=ENRON/OU=NA/CN=RECI...",,,Reutes Kobra Changes,Market Data has recently made changes to the R...,BLOCK,"1.3,2.1",Reutes Kobra Changes Market Data has recently ...,"[(Reutes Kobra Changes Market Data, PERSON), (...",[],[],"[(Reutes Kobra Changes Market Data, PERSON), (...",[(Entity: PERSON | Score: 0.85 | Text: Reutes ...,Sensitive,Non-sensitive
323431,"Mon, 4 Feb 2002 12:15:08 -0800 (PST)",gary.bryan@enron.com,mary.grif.gray@enron.com,'mary.grif.gray@enron.com',"Bryan, Gary </O=ENRON/OU=NA/CN=RECIPIENTS/CN=G...","Vickers, Frank </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,TXU Energy Trading Company,The contacts we previously had listed for TXU ...,BLOCK,2.1,TXU Energy Trading Company The contacts we pre...,"[(TXU Energy Trading Company The, ORG), (TXU, ...",[],"[(TXU Energy Trading Company The, ORG), (TXU, ...","[(three, CARDINAL), (Michael Adams, Risk Contr...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: m...,Sensitive,Non-sensitive
324390,"Thu, 3 Jan 2002 15:02:46 -0800 (PST)",danielle.marcinkowski@enron.com,"rudy.acevedo@enron.com, k..allen@enron.com, to...","Acevedo, Rudy </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Marcinkowski, Danielle </O=ENRON/OU=NA/CN=RECI...",,,Reutes Kobra Changes,Market Data has recently made changes to the R...,BLOCK,"1.3,2.1",Reutes Kobra Changes Market Data has recently ...,"[(Reutes Kobra Changes Market Data, PERSON), (...",[],[],"[(Reutes Kobra Changes Market Data, PERSON), (...",[(Entity: PERSON | Score: 0.85 | Text: Reutes ...,Sensitive,Non-sensitive


In [299]:
spacy_didnt_blocked_but_blocked_in_dataset.head(200)

Unnamed: 0,Date,From,To,X-To,X-From,X-cc,X-bcc,Subject,email_body,verdict,violated_rules,email_text,entities,sensitive,quids,potentially_sensitive,pii,spacy_verdict,presidio_verdict
443,"Fri, 16 Feb 2001 02:15:00 -0800 (PST)",phillip.allen@enron.com,andrew_m_ozuna@mail.bankone.com,andrew_m_ozuna@mail.bankone.com,Phillip K Allen,,,,"Andrew,\n\nHere is an asset statement. I will...",BLOCK,2.3,"Andrew,\n\nHere is an asset statement. I wil...","[(Andrew, PERSON), (98, CARDINAL), (2000, DATE...",[],[],"[(Andrew, PERSON), (98, CARDINAL), (2000, DATE...",[(Entity: UK_NHS | Score: 1.0 | Text: 71346386...,Non-sensitive,Sensitive
448,"Thu, 15 Feb 2001 05:33:00 -0800 (PST)",phillip.allen@enron.com,lodonnell@spbank.com,lodonnell@spbank.com,Phillip K Allen,,,,"Lee,\n\nMy fax number is 713-646-2391. Please...",BLOCK,2.3,"Lee,\n\nMy fax number is 7136462391. Please ...","[(Lee, PERSON), (7136462391, DATE), (Phillip A...",[],[],"[(Lee, PERSON), (7136462391, DATE), (Phillip A...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: p...,Non-sensitive,Sensitive
1065,"Fri, 16 Feb 2001 02:15:00 -0800 (PST)",phillip.allen@enron.com,andrew_m_ozuna@mail.bankone.com,andrew_m_ozuna@mail.bankone.com,Phillip K Allen,,,,"Andrew,\n\nHere is an asset statement. I will...",BLOCK,2.3,"Andrew,\n\nHere is an asset statement. I wil...","[(Andrew, PERSON), (98, CARDINAL), (2000, DATE...",[],[],"[(Andrew, PERSON), (98, CARDINAL), (2000, DATE...",[(Entity: UK_NHS | Score: 1.0 | Text: 71346386...,Non-sensitive,Sensitive
1070,"Thu, 15 Feb 2001 05:33:00 -0800 (PST)",phillip.allen@enron.com,lodonnell@spbank.com,lodonnell@spbank.com,Phillip K Allen,,,,"Lee,\n\nMy fax number is 713-646-2391. Please...",BLOCK,2.3,"Lee,\n\nMy fax number is 7136462391. Please ...","[(Lee, PERSON), (7136462391, DATE), (Phillip A...",[],[],"[(Lee, PERSON), (7136462391, DATE), (Phillip A...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: p...,Non-sensitive,Sensitive
1827,"Thu, 15 Feb 2001 05:33:00 -0800 (PST)",phillip.allen@enron.com,lodonnell@spbank.com,lodonnell@spbank.com,Phillip K Allen,,,,"Lee,\n\nMy fax number is 713-646-2391. Please...",BLOCK,2.3,"Lee,\n\nMy fax number is 7136462391. Please ...","[(Lee, PERSON), (7136462391, DATE), (Phillip A...",[],[],"[(Lee, PERSON), (7136462391, DATE), (Phillip A...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: p...,Non-sensitive,Sensitive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288346,"Mon, 25 Mar 2002 08:47:08 -0800 (PST)",audrey.robertson@enron.com,"kevin.hyatt@enron.com, lindy.donoho@enron.com,...","Hyatt, Kevin </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...","Robertson, Audrey </O=ENRON/OU=NA/CN=RECIPIENT...","Winters, Ricki </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,Strategy Meeting - TW Commercial Team,\nPer discussions in Shelley's introductory me...,BLOCK,"1.3,2.3",Strategy Meeting TW Commercial Team \nPer dis...,"[(Shelley, PERSON), (this morning, TIME), (Thu...",[],[],"[(Shelley, PERSON), (this morning, TIME), (Thu...",[(Entity: EMAIL_ADDRESS | Score: 1.0 | Text: a...,Non-sensitive,Sensitive
291845,"Mon, 2 Apr 2001 05:50:00 -0700 (PDT)",phillip.love@enron.com,oxkd@aol.com,oxkd@aol.com,Phillip M Love,,,,How are things going? Good I hope. Give me a...,BLOCK,2.3,How are things going? Good I hope. Give me ...,[],[],[],[],[(Entity: DATE_TIME | Score: 0.85 | Text: 7138...,Non-sensitive,Sensitive
292993,"Mon, 2 Apr 2001 05:50:00 -0700 (PDT)",phillip.love@enron.com,oxkd@aol.com,oxkd@aol.com,Phillip M Love,,,,How are things going? Good I hope. Give me a...,BLOCK,2.3,How are things going? Good I hope. Give me ...,[],[],[],[],[(Entity: DATE_TIME | Score: 0.85 | Text: 7138...,Non-sensitive,Sensitive
294358,"Mon, 2 Apr 2001 05:50:00 -0700 (PDT)",phillip.love@enron.com,oxkd@aol.com,oxkd@aol.com,Phillip M Love,,,,How are things going? Good I hope. Give me a...,BLOCK,2.3,How are things going? Good I hope. Give me ...,[],[],[],[],[(Entity: DATE_TIME | Score: 0.85 | Text: 7138...,Non-sensitive,Sensitive


In [300]:
# add a column of Sensitive into Block or Not
new_df['presidio_prediction']=new_df['presidio_verdict'].apply(lambda x: 'BLOCK' if x == 'Sensitive' else 'ALLOW')
new_df['spacy_prediction']=new_df['spacy_verdict'].apply(lambda x: 'BLOCK' if x == 'Sensitive' else 'ALLOW')
import sklearn.metrics


presidio_classification_report = sklearn.metrics.classification_report(new_df['verdict'], new_df['presidio_prediction'])
spacy_classification_report = sklearn.metrics.classification_report(new_df['verdict'], new_df['spacy_prediction'])
print('Spacy classification report:\n', spacy_classification_report)
print('\nPresidio classification report:\n', presidio_classification_report)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Spacy classification report:
               precision    recall  f1-score   support

       ALLOW       0.00      0.00      0.00         0
       BLOCK       1.00      0.99      0.99     34893

    accuracy                           0.99     34893
   macro avg       0.50      0.49      0.50     34893
weighted avg       1.00      0.99      0.99     34893


Presidio classification report:
               precision    recall  f1-score   support

       ALLOW       0.00      0.00      0.00         0
       BLOCK       1.00      0.99      0.99     34893

    accuracy                           0.99     34893
   macro avg       0.50      0.49      0.50     34893
weighted avg       1.00      0.99      0.99     34893



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
