In [12]:
import os
import pickle
from tqdm import tqdm
import numpy as np
import numpy.random as rand

from collections import Counter
import matplotlib.pyplot as plt


import snorkel
from snorkel.augmentation import transformation_function
from snorkel.augmentation import RandomPolicy, MeanFieldPolicy
from snorkel.augmentation import TFApplier, PandasTFApplier

In [2]:
# Enter the data directory only once. Without the guard, re-running this cell
# after the switch would try to enter a nested "data/data" directory and fail.
if os.path.basename(os.getcwd()) != "data":
    os.chdir("data")

# Preparation
---
## Load E-Mails & Converted E-Mails

In [3]:
# "../../../w3c-emails/emails.pkl"
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open("1000_emails.pkl", "rb") as handle:
    emails = list(pickle.load(handle))

# Sender of every e-mail, in the same order as `emails`.
senders = [mail.sender for mail in emails]
# Positional indices 0 .. len(emails) - 1.
rng = list(range(len(emails)))

In [4]:
# "emails_token_ids.pkl"
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open("1000_emails_token_ids.pkl", "rb") as handle:
    # Whatever object the pickle holds is bound to `email_ids` unchanged.
    email_ids = pickle.load(handle)

## Group E-Mails by Sender

In [5]:
@transformation_function()
def positive_example(e):
    """Pair an e-mail with a randomly chosen e-mail by the same author.

    Args:
        e: An ``(e_id, auth_label)`` pair, i.e. the same input shape that
            ``negative_example`` unpacks.

    Returns:
        ``((e_id, partner), 1)`` where ``partner`` is drawn with
        ``rand.choice`` from ``by_author[auth_label]`` and ``1`` is the label
        attached to the pair.
    """
    # The earlier body unpacked an undefined name and a one-element tuple;
    # unpack the actual argument instead.
    e_id, auth_label = e
    # NOTE(review): if by_author[auth_label] contains e_id itself, the e-mail
    # can be paired with itself — confirm whether that is acceptable.
    return (e_id, rand.choice(by_author[auth_label])), 1


@transformation_function()
def negative_example(x):
    """Pair an e-mail with a randomly chosen e-mail by a different author.

    Args:
        x: An ``(e_id, auth_label)`` pair.

    Returns:
        ``((e_id, partner), 0)`` where ``partner`` is drawn from the e-mails of
        a randomly selected author other than ``auth_label`` and ``0`` is the
        label attached to the pair.
    """
    mail_id, own_author = x
    # Every author key except the e-mail's own author.
    candidates = list(by_author.keys() - {own_author})
    foreign_author = rand.choice(candidates)
    partner = rand.choice(by_author[foreign_author])
    return (mail_id, partner), 0

# Transformation functions that operate on (e_id, auth_label) tuples.
tfs = [positive_example, negative_example]

In [6]:
import pandas as pd

In [7]:
# Column-oriented view of the e-mails: one equally long list per column.
# (A row-oriented list of dicts used to be built here first, but it was
# overwritten immediately and never used.)
mails_d = {
    "index": rng,
    "id": [e.id for e in emails],
    "author": [e.sender for e in emails],
    "body": [e.body_raw for e in emails],
}

In [8]:
# One row per e-mail, one column per key of `mails_d`.
mail_frame = pd.DataFrame(mails_d)

In [9]:
# Preview the first rows to sanity-check the columns.
mail_frame.head()

Unnamed: 0,index,id,author,body
0,0,000401bed467$49c94600$397138d4@omnibook1,jonathan chetwynd <jonathan@signbrowser.free-o...,\n\n\nI completely agree with your statement 1...
1,1,Pine.OSF.3.96.980222133016.7770B-100000@a5.ph....,Alan J. Flavell <flavell@a5.ph.gla.ac.uk>,\n\n\n\n[N.B This message is going to the WAI-...
2,2,613B3C619C9AD4118C4E00B0D03E7C3E3CAE3C@exchang...,Kevin Regan <kevinr@valicert.com>,\n\n\n\nI've been a way on other activities fo...
3,3,200105261428.f4QESdr24274@sophia.inria.fr,Roger McKenssy <casting@altavista.com>,\n\n\nWe are looking for new faces for TV & Mo...
4,4,3.0.5.32.19981007111638.009beb10@pophost.arbor...,Mike Champion <mcc@arbortext.com>,"\n\n\nAt 02:05 PM 10/7/98 +0100, \""Pasqualino ..."


In [11]:
@transformation_function()
def positive_example_pd(row):
    """Pair a DataFrame row's e-mail with a random e-mail by the same author.

    Args:
        row: One row of ``mail_frame`` (a pandas Series with at least the
            ``"index"`` and ``"author"`` entries).

    Returns:
        ``((e_id, partner), (auth_label, auth_label))`` where ``partner`` is
        drawn with ``rand.choice`` from ``by_author[auth_label]``.
    """
    # Use item access: `row.index` is the Series' axis-label Index object,
    # not the value stored in the "index" column.
    e_id, auth_label = row["index"], row["author"]
    # NOTE(review): PandasTFApplier may expect a Series-like return value
    # rather than a tuple — confirm against the Snorkel documentation.
    return (e_id, rand.choice(by_author[auth_label])), (auth_label, auth_label)

@transformation_function()
def negative_example_pd(x):
    """Pair a DataFrame row's e-mail with a random e-mail by another author.

    Args:
        x: One row of ``mail_frame`` (a pandas Series with at least the
            ``"index"`` and ``"author"`` entries).

    Returns:
        ``((e_id, partner), (auth_label, other_label))`` where ``other_label``
        is a randomly chosen author different from ``auth_label`` and
        ``partner`` is drawn from that author's e-mails.
    """
    # The body used to read an undefined name `row`; read the parameter.
    # Item access is required because `Series.index` is the axis-label Index,
    # not the value stored in the "index" column.
    e_id, auth_label = x["index"], x["author"]
    other_label = rand.choice(list(by_author.keys() - {auth_label}))
    return (e_id, rand.choice(by_author[other_label])), (auth_label, other_label)

# Register the row-based variants: the pandas applier hands DataFrame rows to
# the TFs, and the tuple-based functions cannot unpack a four-column row.
tfs = [positive_example_pd, negative_example_pd]

(0, index                                                     0
 id                 000401bed467$49c94600$397138d4@omnibook1
 author    jonathan chetwynd <jonathan@signbrowser.free-o...
 body      \n\n\nI completely agree with your statement 1...
 Name: 0, dtype: object)

In [None]:
# Policy over the registered TFs: sequences of length 1, per-TF weights
# 0.3 / 0.7 (in `tfs` order), two generated samples per input row, and the
# original rows are not kept.
# NOTE(review): parameter meanings follow the Snorkel MeanFieldPolicy docs — confirm.
mf_policy = MeanFieldPolicy(
    len(tfs),
    sequence_length=1,
    p=[0.3, 0.7],
    n_per_original=2,
    keep_original=False,
)

# Apply the policy to every row of the e-mail frame.
tf_applier = PandasTFApplier(tfs, mf_policy)
samples = tf_applier.apply(mail_frame)