 ### Classes `Email` and `Conversation`
 
  - outsourced all pre-processing, parsing and structuring functionality to `process_emails.py`
  - `Email` (obviously) represents an e-mail; currently the header fields are parsed (just like in `parse_headers.ipynb`?) and the e-mail body is used as the raw text
  
  - `Conversation` takes a list of `Email` instances (it can also construct the list of e-mails which form a conversation from the data set) and from them extracts and structures lists of interlocutors, mentioned docs, etc.
  

In [1]:
import os
import pickle
from tqdm import tqdm

import re
import html

# from collections import Counter
# import matplotlib.pyplot as plt

from process_emails import Email, Conversation

In [2]:
# Read every file in ./files into a list of HTML-unescaped lines (one list per file).
# os.listdir() yields names in arbitrary order, so the names are sorted to make
# the order of `mails` -- and of everything derived from it -- reproducible.
# NOTE(review): open() is called without an explicit encoding, so decoding relies
# on the platform default -- confirm the corpus encoding and pin it if known.
mails = []
for f in tqdm(sorted(os.listdir("files"))):
    with open(os.path.join("files", f)) as handle:
        mails.append(list(map(html.unescape, handle.readlines())))

100%|██████████| 161645/161645 [00:12<00:00, 12660.13it/s]


In [3]:
# Turn each raw message (a list of lines) into an Email object; tqdm shows progress.
email_objs = list(map(Email, tqdm(mails)))
# Keep only the e-mails whose `has_header` attribute is truthy.
fltrd_emails = list(filter(lambda mail: mail.has_header, email_objs))

# Serialize the filtered e-mails to emails.pkl.
with open("emails.pkl", "wb") as handle:
    pickle.dump(fltrd_emails, handle)

 83%|████████▎ | 134595/161645 [01:15<00:17, 1563.16it/s]

Timezone outside of 24 hours:  -2 days, 17:00:00


 98%|█████████▊| 157777/161645 [01:27<00:01, 1981.14it/s]

Timezone outside of 24 hours:  -2 days, 20:00:00


100%|██████████| 161645/161645 [01:29<00:00, 1812.09it/s]


In [4]:
# Order a copy of the filtered e-mails by their `sent` attribute (ascending, stable sort).
mails_srtd = list(fltrd_emails)
mails_srtd.sort(key=lambda email: email.sent)
# Derive the conversations from the sorted e-mails (logic lives in process_emails.Conversation).
convos = Conversation.conversations_from_sorted_emails(mails_srtd)

---
## Collect relation instances from Conversation objects

In [16]:
# Index conversations by their hash; the hash serves as the conversation id
# in every relation tuple built below.
# NOTE(review): two conversations with equal hashes would silently overwrite
# each other in this dict -- confirm Conversation.__hash__ is suitable as an id.
conv_dict = {hash(c): c for c in convos}

# EvidencedBy(Conv, mail_1, ..., mail_n) => represented as EvidencedBy(Conv, [mail_1, ..., mail_n])
evidencedBy = [(hash(c), c.emails) for c in convos]
print(evidencedBy[0], "\n")

# StartsAt(Conv, time)
startsAt = [(hash(c), c.start_time) for c in convos]
print(startsAt[0], "\n")

# EndsAt(Conv, time)
endsAt = [(hash(c), c.end_time) for c in convos]
print(endsAt[0], "\n")

# Interlocutor(Conv, Person) => one tuple per distinct person in a conversation.
# dict.fromkeys() drops repeated persons while keeping first-occurrence order
# (a plain set would lose the order); persons are hashable, as they are also
# used as dict keys for belongsTo below.
hasInterlocutor = [(hash(c), p) for c in convos for p in dict.fromkeys(c.interlocutors)]
print(hasInterlocutor[0], "\n")


# RefersTo(Conv, doc) where doc is Link or Address
refersTo = [(hash(c), d) for c in convos for d in c.mentioned_links+c.mentioned_addresses]
print(refersTo[0], "\n")


# BelongsTo(Person, Org) => only persons with a truthy `org` get an entry,
# so lookups for other persons must use .get() rather than indexing.
belongsTo = {p: p.org for c in convos for p in c.interlocutors if p.org}

(4885062280232913288, ('F0B6E827-1DD2-11B2-B516-000393556882@ontologicon.com',)) 

(4885062280232913288, datetime.datetime(1970, 1, 1, 2, 6, 11, tzinfo=tzoffset(None, 7200))) 

(4885062280232913288, datetime.datetime(1970, 1, 1, 2, 6, 11, tzinfo=tzoffset(None, 7200))) 

(4885062280232913288, Dimitris Dimitriadis <dimitris@ontologicon.com>) 

(-2500555230501122677, <process_emails.Link object at 0x7f180ce37f28>) 



In [18]:
# Print a human-readable summary of one example conversation:
# its interlocutors (with organisation) and the documents it mentions.
print("--- Example Conversation ---\n\n")

# Arbitrary example; raises IndexError if fewer than 738 conversations exist.
c = convos[737]

print("Interlocutors:")
for p in c.interlocutors:
    # belongsTo only contains persons with a truthy org, so indexing it directly
    # would raise KeyError for anyone else; fall back to a placeholder instead.
    print("\t", p, "\t\t\t belongs to org: ", belongsTo.get(p, "n/a"))

print("\n__________________")
print("Docs mentioned:")

# Links mentioned in the conversation, with their domain and path components.
for l in c.mentioned_links:
    print(l.url, "\t\t domain: ", l.domain, "\t path: ", l.path)

# E-mail addresses mentioned in the conversation, with their organisation.
for a in c.mentioned_addresses:
    print(a.address, "\t part of org: ", a.org)

--- Example Conversation ---


Interlocutors:
	 Tim Bray <tbray@textuality.com> 			 belongs to org:  textuality.com
	 Rick Jelliffe <ricko@allette.com.au> 			 belongs to org:  allette.com.au
	 Gavin Nicol <gtn@ebt.com> 			 belongs to org:  ebt.com
	 Rick Jelliffe <ricko@allette.com.au> 			 belongs to org:  allette.com.au
	 Rick Jelliffe <ricko@allette.com.au> 			 belongs to org:  allette.com.au
	 Rick Jelliffe <ricko@allette.com.au> 			 belongs to org:  allette.com.au
	 Gavin Nicol <gtn@ebt.com> 			 belongs to org:  ebt.com
	 Gavin Nicol <gtn@ebt.com> 			 belongs to org:  ebt.com
	 Gavin Nicol <gtn@ebt.com> 			 belongs to org:  ebt.com
	 Rick Jelliffe <ricko@allette.com.au> 			 belongs to org:  allette.com.au
	 Gavin Nicol <gtn@ebt.com> 			 belongs to org:  ebt.com
	  <w3c-sgml-wg@w3.org> 			 belongs to org:  w3.org
	 Tim Bray <tbray@textuality.com> 			 belongs to org:  textuality.com
	  <ricko@allette.com.au> 			 belongs to org:  allette.com.au
	 Gavin Nicol <gtn@ebt.com> 			 belongs 