 ### Classes `Email` and `Conversation`
 
  - outsourced all pre-processing, parsing and structuring functionality to `process_emails.py`
  - `Email` (obviously) represents an e-mail, currently the fields of the header is parsed (just like in `parse_headers.ipynb?`) and the e-mail body is used as the raw text
  
  - `Conversation` takes a list of `Email` instances (it can also construct the list of e-mails which form a conversation from the data set) and from them extracts and structures lists of itnerlocutors, mentioned docs, etc
  

In [3]:
import os
import pickle
from tqdm import tqdm

import re
import html

# from collections import Counter
# import matplotlib.pyplot as plt

from class_declarations import Email, Conversation, Ledger

In [4]:
ledger = Ledger()

In [5]:
mails = []
for f in tqdm(os.listdir("files")):
    with open("files/"+f) as handle:
        mails.append(list(map(html.unescape, handle.readlines())))

100%|██████████| 161645/161645 [01:00<00:00, 2654.43it/s]


In [6]:
email_objs = [Email(m, ledger) for m in tqdm(mails)]
fltrd_emails = [m for m in email_objs if m.has_header]

with open("emails.pkl", "wb") as handle:
    pickle.dump(fltrd_emails, handle)

 83%|████████▎ | 134636/161645 [01:48<00:20, 1294.73it/s]

Timezone outside of 24 hours:  -2 days, 17:00:00


 98%|█████████▊| 157693/161645 [02:06<00:02, 1362.07it/s]

Timezone outside of 24 hours:  -2 days, 20:00:00


100%|██████████| 161645/161645 [02:09<00:00, 1248.65it/s]


In [7]:
load = False
if load:
    with open("emails.pkl", "rb") as handle:
        fltrd_emails = pickle.load(handle)

mails_srtd = sorted(fltrd_emails, key=lambda m: m.sent)
convos = Conversation.conversations_from_sorted_emails(mails_srtd, ledger)

with open("conversations.pkl", "wb") as handle:
    pickle.dump(convos, handle)

---
## Collect relation instances from Conversation objects

In [10]:
conv_dict = {hash(c): c for c in convos}

i = 31347

# EvidencedBy(Conv, mail_1, ..., mail_n) => represented as EvidencedBy(Conv, [mail_1, ..., mail_n])
evidencedBy = [(hash(c), c.emails) for c in convos]
print(evidencedBy[i], "\n")

# StartsAt(Conv, time)-
startsAt = [(hash(c), c.start_time) for c in convos]
print(startsAt[i], "\n")

# EndsAt(Conv, time)
endsAt = [(hash(c), c.end_time) for c in convos]
print(endsAt[i], "\n")

# Interlocutor(Conv, Person) => use set to get rid of duplicates
hasInterlocutor = [(hash(c), p) for c in convos for p in (c.interlocutors or [[]])]
print(hasInterlocutor[i], "\n")


# RefersTo(Conv, doc) where doc is Link or Address
refersTo = [(hash(c), d) for c in convos for d in (c.mentioned_links+c.mentioned_addresses or [[]])]
print(refersTo[i], "\n")


# BelongsTo(Person, Org)
belongsTo = {p: p.org for c in convos for p in c.interlocutors if p.org}

(-7941893237740906599, ('AMEPKEBLDJJCCDEJHAMIAEJMCHAA.ejw@cse.ucsc.edu',)) 

(-7941893237740906599, datetime.datetime(2001, 1, 18, 11, 2, 1, tzinfo=tzoffset(None, -28800))) 

(-7941893237740906599, datetime.datetime(2001, 1, 18, 11, 2, 1, tzinfo=tzoffset(None, -28800))) 

(2522554939845907609, WAI Interest Group <w3c-wai-ig@w3.org>) 

(1970517829089622015, <process_emails_provenance.Address object at 0x7f616d24ca90>) 



In [11]:
len(convos), len(startsAt), len(hasInterlocutor), len(refersTo), len(fltrd_emails), len(belongsTo),\
len(set({p for c in convos for p in c.interlocutors})),\
len(set({p.org for c in convos for p in c.interlocutors if p.org}))

(90605, 90605, 287926, 462011, 143963, 28041, 28097, 11847)

In [12]:
print("--- Example Conversation ---\n\n")

c = convos[737]

print("Interlocutors:")
for p in set(c.interlocutors):
    print("\t", p, "\t\t\t belongs to org: ", belongsTo[p])
    
print("\n__________________")
print("Docs mentioned:")

for l in set(c.mentioned_links):
    print(l.url, "\t\t domain: ", l.domain, "\t path: ", l.path)
    
for a in set(c.mentioned_addresses):
    print(a.address, "\t part of org: ", a.org) 

--- Example Conversation ---


Interlocutors:
	 Tim Bray <tbray@textuality.com> 			 belongs to org:  textuality.com
	 Rick Jelliffe <ricko@allette.com.au> 			 belongs to org:  allette.com.au
	 Gavin Nicol <gtn@ebt.com> 			 belongs to org:  ebt.com
	  <w3c-sgml-wg@w3.org> 			 belongs to org:  w3.org
	  <ricko@allette.com.au> 			 belongs to org:  allette.com.au

__________________
Docs mentioned:
http://www.textuality.com/ 		 domain:  www.textuality.com 	 path:  /
http://www.allette.com.au 		 domain:  www.allette.com.au 	 path:  
http://www.allette.com.au 		 domain:  www.allette.com.au 	 path:  
http://www.allette.com.au 		 domain:  www.allette.com.au 	 path:  
http://www.allette.com.au 		 domain:  www.allette.com.au 	 path:  
http://www.allette.com.au/allette/ricko 		 domain:  www.allette.com.au 	 path:  /allette/ricko
http://www.allette.com.au 		 domain:  www.allette.com.au 	 path:  
http://www.allette.com.au/allette/ricko 		 domain:  www.allette.com.au 	 path:  /allette/ricko
http://w

In [13]:
len(ledger)

609389