In [1]:
import pickle
import os
# from datetime import datetime
from tqdm import tqdm
import re
import numpy as np
# from collections import Counter

# import html
# import email.utils as mailutil

# from dateutil.parser import parse as du_parse
# import datetime
# from datetime import timedelta

In [2]:
with open("headers_processed.pkl", "rb") as handle:
    headers_srtd, mails_srtd = pickle.load(handle)

## Sort emails into conversation structure

 - first: sort by time sent (field `sent`)
 - second: group into conversations by doing for each email:
   1. if email has `inreplyto=None`, start new conversation
   2. else look up email with ID corresponding to `inreplyto` value
     - if there is no other email with the `id` that was given by `inreplyto`, also start a new conversation
     
   -> each separate list of emails is a 'conversation', i.e. all in reply to each other and in correct temporal order
   

 - sorting by time sent seems to work, original corpus was randomly ordered
 - grouping into conversations by `inreplyto` does not work well, as extracted conversations <br>
   turn out to be very short (overwhelming majority consists of a single email) <br>
   -> refrain from this for now, later steps are only sorted according to time sent 

In [3]:
convos = []
ids = {}

i = 0
id_not_found = []            
for h in tqdm(headers_srtd):
    if h["inreplyto"] is None or h["inreplyto"] not in ids:
        convos.append([h])
        ids[h["id"]] = i # need to check if ID actually exists! -> runs fine without (weird?)
        i += 1
        
        if h["inreplyto"] not in ids:
            id_not_found.append(h)
    else:
        convos[ids[h["inreplyto"]]].append(h)
        ids[h["id"]] = ids[h["inreplyto"]] # need to check if ID actually exists!        

100%|██████████| 143963/143963 [00:00<00:00, 379557.64it/s]


In [30]:
real_convos = [c for c in convos if len(c) > 1]
with open("real_conversations.pkl", "wb") as handle:
    pickle.dump(real_convos, handle)

In [4]:
print("number of emails: ", len(headers_srtd))
print("number of extracted conversations: ", len(convos))

lens = list(map(len, convos))
print("mean, median and standard dev. of conversation length (in number of emails)\n", 
      round(np.mean(lens), 3), np.median(lens), round(np.var(lens)**.5, 3))

import matplotlib.pyplot as plt
plt.hist([l for l in lens if l < 20], bins=20)
plt.title("'Conversation' length histogram")
plt.xlabel("conversation length in number of emails")
plt.ylabel("number of conversations")
plt.show()

number of emails:  143963
number of extracted conversations:  90605
mean, median and standard dev. of conversation length (in number of emails)
 1.589 1.0 1.9


<Figure size 640x480 with 1 Axes>

---

## Using Snorkel to learn the function which sort e-mails into conversations

In [37]:
from snorkel.labeling import labeling_function
from snorkel.labeling import LFApplier
from snorkel.labeling import LFAnalysis


CONV = 1
NOT_CONV = 0
ABSTAIN = -1


In [9]:
@labeling_function()
def sent_earlier_than(mail_tuple):
    mail1, mail2 = mail_tuple
    return ABSTAIN if mail1["sent"] < mail2["sent"] else NOT_CONV

@labeling_function()
def is_inreplyto(mail_tuple):
    mail1, mail2 = mail_tuple
    return CONV if mail1["inreplyto"] == mail2["id"] else ABSTAIN

applier = LFApplier([sent_earlier_than, is_inreplyto])

In [32]:
import numpy.random as rand

real_tups = [(c[i], c[i+1]) for c in real_convos for i in range(len(c)-1)]
rand_tups = [(h1, h2) for h1, h2 in zip(rand.choice(headers_srtd, 50000), rand.choice(headers_srtd, 50000))]
print(len(real_tups))

labels = applier.apply(real_tups + rand_tups)

13023it [00:00, 130223.60it/s]

53358


103358it [00:00, 111538.84it/s]


In [36]:
coverage_sent_earlier_than, coverage_is_inreplyto = (labels != ABSTAIN).mean(axis=0)
print(f"sent_earlier_than coverage: {coverage_sent_earlier_than * 100:.1f}%")
print(f"is_inreplyto coverage: {coverage_is_inreplyto * 100:.1f}%")


sent_earlier_than coverage: 25.1%
is_inreplyto coverage: 0.0%


In [38]:
LFAnalysis(L=labels, lfs=[sent_earlier_than, is_inreplyto]).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
sent_earlier_than,0,[0],0.250789,0.0,0.0
is_inreplyto,1,[],0.0,0.0,0.0
