In [1]:
import os
import pickle
from tqdm import tqdm
import numpy as np
import numpy.random as rand

from collections import Counter
import matplotlib.pyplot as plt

import pandas as pd

import snorkel
from snorkel.augmentation import transformation_function
from snorkel.augmentation import RandomPolicy, MeanFieldPolicy
from snorkel.augmentation import TFApplier, PandasTFApplier

from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier


In [2]:
os.chdir("data")

# Load Data

In [3]:
# "../../../w3c-emails/emails.pkl"
with open("1000_emails.pkl", "rb") as handle:
    emails = [e for e in pickle.load(handle)]

# rng = list(range(len(emails)))
for i, e in enumerate(emails):
    e.index = i

# senders = [e.sender for e in emails]


In [4]:
# "emails_token_ids.pkl"
with open("1000_emails_token_ids.pkl", "rb") as handle:
    email_ids = pickle.load(handle)

# Put into DF $\rightarrow$ AS COMBINATIONS

In [32]:
names = "index", "id", "sender", "body_raw"

# names2 = tuple(n + "_2" for n in names)

def by_author(emails):
    d = {}
    
    for e in emails:
        if not e.sender in d:
            d[e.sender] = []
        d[e.sender].append(e)
    return d

by_auths = by_author(emails)

        
def objs_to_frame(emails, min_size=1):
    rows = []
    for s, e_ls in by_auths.items():
        if len(e_ls) < min_size:
            continue
        e_ls2 = rand.permutation(e_ls)
        for e1, e2 in zip(e_ls, e_ls2):
            row = {n: str(getattr(e1, n)) for n in names}
            row.update({n + "_2": str(getattr(e2, n)) for n in names})
            rows.append(row)
    return pd.DataFrame(rows)

mail_frame = objs_to_frame(emails, min_size=10)#skip_singletons=True)

# Define Transformation Functions

In [33]:
@transformation_function()
def do_nothing(row):
    return row

@transformation_function()
def do_noise(row):
    e_id, auth_label = row["index"], row["sender"]
    other_label = rand.choice(list(by_auths.keys() - {auth_label}))
    other_email = rand.choice(by_auths[other_label])
    
    new_row = row.copy()
    for n in names:
        new_row[n+"_2"] = getattr(other_email, n)

    return new_row

tfs = [do_nothing, do_noise]

In [53]:
mf_policy = MeanFieldPolicy(
        len(tfs), sequence_length=1, p=[0.3, 0.7], n_per_original=4, keep_original=False
)

tf_applier = PandasTFApplier(tfs, mf_policy)
samples = tf_applier.apply(mail_frame)

100%|██████████| 94/94 [00:00<00:00, 131.11it/s]


In [54]:
samples[samples.sender == samples.sender_2].shape

(109, 8)

# Label Rows

In [44]:
@labeling_function()
def label(row):
    return row.sender == row.sender_2

In [55]:
lf_applier = PandasLFApplier([label])
labelled = lf_applier.apply(samples)

samples["is_same"] = labelled

100%|██████████| 376/376 [00:00<00:00, 10122.20it/s]


In [56]:
samples[samples["id"] == samples["id_2"]].shape[0]/samples.shape[0]

0.018617021276595744

In [57]:
1 - samples.drop_duplicates().shape[0]/samples.shape[0]

0.1063829787234043

In [62]:
samples[["index", "index_2", "is_same"]]

Unnamed: 0,index,index_2,is_same
0,16,283,0
0,16,867,0
0,16,219,0
0,16,933,0
1,198,998,0
...,...,...,...
92,787,684,0
93,834,787,1
93,834,432,0
93,834,860,0


In [59]:
sample_matrix = samples[["index", "index_2", "is_same"]].to_numpy().astype("int").tolist()

In [61]:
[(i1, i2) for i1, i2, _ in sample_matrix]

[(16, 283),
 (16, 867),
 (16, 219),
 (16, 933),
 (198, 998),
 (198, 589),
 (198, 997),
 (198, 997),
 (299, 768),
 (299, 620),
 (299, 768),
 (299, 309),
 (337, 886),
 (337, 299),
 (337, 411),
 (337, 299),
 (370, 27),
 (370, 655),
 (370, 122),
 (370, 268),
 (429, 16),
 (429, 656),
 (429, 327),
 (429, 16),
 (461, 257),
 (461, 718),
 (461, 767),
 (461, 807),
 (768, 370),
 (768, 686),
 (768, 633),
 (768, 370),
 (844, 708),
 (844, 15),
 (844, 899),
 (844, 205),
 (899, 337),
 (899, 229),
 (899, 149),
 (899, 896),
 (997, 344),
 (997, 844),
 (997, 844),
 (997, 988),
 (20, 886),
 (20, 613),
 (20, 613),
 (20, 100),
 (36, 692),
 (36, 692),
 (36, 722),
 (36, 692),
 (205, 109),
 (205, 358),
 (205, 100),
 (205, 498),
 (261, 288),
 (261, 36),
 (261, 547),
 (261, 791),
 (278, 670),
 (278, 296),
 (278, 408),
 (278, 707),
 (296, 617),
 (296, 617),
 (296, 266),
 (296, 491),
 (306, 811),
 (306, 811),
 (306, 427),
 (306, 168),
 (317, 205),
 (317, 888),
 (317, 953),
 (317, 205),
 (358, 3),
 (358, 726),
 (358

---

In [203]:
mail_frame.sort_values(by="sender")[mail_frame.sender == "Charles McCathieNevile <charles@w3.org>"][["index", "index_2","body_raw", "body_raw_2"]]

  """Entry point for launching an IPython kernel.


Unnamed: 0,index,index_2,body_raw,body_raw_2
48,317,626,\n\n\nForwarded from GL. PLease carry the disc...,\n\n\nAargh! it is not even readable - tries t...
47,306,617,\n\n\n\nI agree that browsers should be able t...,\n\n\n\n(My point here is trivial and adds not...
46,296,581,"\n\n\n\nSorry,\n\nI was trying to explain why ...",\n\n\nIt should create it as part odf a create...
57,692,317,\n\n\nHow about allow the user to request...\n...,\n\n\nForwarded from GL. PLease carry the disc...
56,649,205,\n\n\nWe have the bridge for meetings at 3pm U...,"\n\n\nYes, I think this claim is grossly over-..."
44,261,769,\n\n\nThere is an example of absolute position...,\n\n\nEspecially this bit - a good approach.\n...
43,205,296,"\n\n\nYes, I think this claim is grossly over-...","\n\n\n\nSorry,\n\nI was trying to explain why ..."
42,36,586,"\n\n\nIn terms of rendering, yes - it should j...",\n\n\nThe Guideline 4.4 - Ensure that conversi...
45,278,613,"\n\n\nWell, SVG does provide a solution, and i...","\n\n\n\nHi,\n\nis it possible to set a prefere..."
49,358,539,"\n\n\nYes, I agree that correct usage is impor...","\n\n\nWell, alt text is required by HTML, and ..."


# Create Combinations

In [153]:
by_author = mail_frame.groupby("sender")
by_author_filtered = by_author.filter(func=lambda f:(f.shape[0] > 1))
by_author2 = mail_frame.add_suffix("_2").groupby("sender_2")
by_author2 = by_author2.apply(lambda small_frame: small_frame.sample(frac=1))

In [126]:
by_author.head()

Unnamed: 0,id,sender,body_raw
0,000401bed467$49c94600$397138d4@omnibook1,jonathan chetwynd <jonathan@signbrowser.free-o...,\n\n\nI completely agree with your statement 1...
1,Pine.OSF.3.96.980222133016.7770B-100000@a5.ph....,Alan J. Flavell <flavell@a5.ph.gla.ac.uk>,\n\n\n\n[N.B This message is going to the WAI-...
2,613B3C619C9AD4118C4E00B0D03E7C3E3CAE3C@exchang...,Kevin Regan <kevinr@valicert.com>,\n\n\n\nI've been a way on other activities fo...
3,200105261428.f4QESdr24274@sophia.inria.fr,Roger McKenssy <casting@altavista.com>,\n\n\nWe are looking for new faces for TV & Mo...
4,3.0.5.32.19981007111638.009beb10@pophost.arbor...,Mike Champion <mcc@arbortext.com>,"\n\n\nAt 02:05 PM 10/7/98 +0100, \""Pasqualino ..."
...,...,...,...
994,80256A0D.003D2B91.00@d06mta07.portsmouth.uk.ib...,Tim_Ellison@uk.ibm.com <Tim_Ellison@uk.ibm.com>,\n\n\n\n\n> I just noticed (in draft 14) that ...
995,Pine.SOL.3.95q.990210113112.17932A-100000@red....,E.E. Mellor <eem21@cam.ac.uk>,"\n\n\nOn Wed, 10 Feb 1999 Irene.Vatton@inrialp..."
996,200205140329.XAA14254@www19.w3.org,xmailer@yahoo.com <xmailer@yahoo.com>,\n\n\n\n????????: http://www.vipmail.net.cn \n...
998,200204041156.GAA13153@www19.w3.org,bwfw@hotmail.com <bwfw@hotmail.com>,\n\n\n???????????????????? \n\n ???????????...


In [125]:
by_author2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,id_2,sender_2,body_raw_2
sender_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
. <smil-editors@w3.org>,711,200211140355.WAA18236@tux.w3.org,. <smil-editors@w3.org>,\n\n\n\n\n\ntext/html attachment: stored\n\n\n...
0 <??????@w3.org>,140,BE20020409124040390_10000024_,0 <??????@w3.org>,\n\n\n????????????????????????????????????????...
000@www.111.com <000@www.111.com>,819,200210251133.HAA31792@tux.w3.org,000@www.111.com <000@www.111.com>,\n\n\n\n??????????????????????????1????0.6????...
234345@ayi.com <234345@ayi.com>,967,200211260006.TAA06279@tux.w3.org,234345@ayi.com <234345@ayi.com>,"\n\n\n\n????????,?????????????????????????????..."
7788998@heinfo.net <7788998@heinfo.net>,237,200209111003.GAA15640@tux.w3.org,7788998@heinfo.net <7788998@heinfo.net>,\n\n\n\n???? ????????????????????????http://zz...


In [130]:
type(by_author), type(by_author2)

(pandas.core.groupby.generic.DataFrameGroupBy, pandas.core.frame.DataFrame)

In [138]:
mail_frame.head()

Unnamed: 0,id,sender,body_raw
0,000401bed467$49c94600$397138d4@omnibook1,jonathan chetwynd <jonathan@signbrowser.free-o...,\n\n\nI completely agree with your statement 1...
1,Pine.OSF.3.96.980222133016.7770B-100000@a5.ph....,Alan J. Flavell <flavell@a5.ph.gla.ac.uk>,\n\n\n\n[N.B This message is going to the WAI-...
2,613B3C619C9AD4118C4E00B0D03E7C3E3CAE3C@exchang...,Kevin Regan <kevinr@valicert.com>,\n\n\n\nI've been a way on other activities fo...
3,200105261428.f4QESdr24274@sophia.inria.fr,Roger McKenssy <casting@altavista.com>,\n\n\nWe are looking for new faces for TV & Mo...
4,3.0.5.32.19981007111638.009beb10@pophost.arbor...,Mike Champion <mcc@arbortext.com>,"\n\n\nAt 02:05 PM 10/7/98 +0100, \""Pasqualino ..."


In [155]:
by_author.get_group("Charles McCathieNevile <charles@w3.org>")

Unnamed: 0,id,sender,body_raw
20,Pine.LNX.4.30.0301060744540.26113-100000@tux.w...,Charles McCathieNevile <charles@w3.org>,"\n\n\n\nOn Mon, 6 Jan 2003, Irene Vatton wrote..."
36,Pine.LNX.4.10.9910030022350.22149-100000@tux.w...,Charles McCathieNevile <charles@w3.org>,"\n\n\nIn terms of rendering, yes - it should j..."
205,Pine.LNX.4.30.0109240230340.18735-100000@tux.w...,Charles McCathieNevile <charles@w3.org>,"\n\n\nYes, I think this claim is grossly over-..."
261,Pine.LNX.4.21.0009140856420.17125-100000@tux.w...,Charles McCathieNevile <charles@w3.org>,\n\n\nThere is an example of absolute position...
278,Pine.LNX.4.21.0009280542320.23123-100000@tux.w...,Charles McCathieNevile <charles@w3.org>,"\n\n\nWell, SVG does provide a solution, and i..."
296,Pine.LNX.4.30.0207072024420.2833-100000@tux.w3...,Charles McCathieNevile <charles@w3.org>,"\n\n\n\nSorry,\n\nI was trying to explain why ..."
306,Pine.LNX.4.55.0308231024460.6753@homer.w3.org,Charles McCathieNevile <charles@w3.org>,\n\n\n\nI agree that browsers should be able t...
317,Pine.LNX.4.20.0003160926520.6999-100000@tux.w3...,Charles McCathieNevile <charles@w3.org>,\n\n\nForwarded from GL. PLease carry the disc...
358,Pine.LNX.4.30.0111251932140.4222-100000@tux.w3...,Charles McCathieNevile <charles@w3.org>,"\n\n\nYes, I agree that correct usage is impor..."
539,Pine.LNX.4.30.0112061350590.16737-100000@tux.w...,Charles McCathieNevile <charles@w3.org>,"\n\n\nWell, alt text is required by HTML, and ..."


In [154]:
by_author2.get_group("Charles McCathieNevile <charles@w3.org>")

AttributeError: 'DataFrame' object has no attribute 'get_group'

In [142]:
by_author.apply(lambda f: f.shape[0]).sort_values()

sender
. <smil-editors@w3.org>                           1
Robert Neff <rcn@fenix2.dol-esa.gov>              1
Robert Hart <ab6rah@bath.ac.uk>                   1
Robert B. Yonaitis <ryonaitis@hisoftware.com>     1
Rob McCool <robm@robm.com>                        1
                                                 ..
Brian McBride <bwm@hplb.hpl.hp.com>              10
Clemm, Geoff <gclemm@rational.com>               11
CVS Update Notifier <nobody@w3.org>              11
Wendy A Chisholm <wendy@w3.org>                  14
Charles McCathieNevile <charles@w3.org>          28
Length: 614, dtype: int64

In [176]:
by_author_filtered

Unnamed: 0,id,sender,body_raw
2,613B3C619C9AD4118C4E00B0D03E7C3E3CAE3C@exchang...,Kevin Regan <kevinr@valicert.com>,\n\n\n\nI've been a way on other activities fo...
5,Pine.GSO.4.44.0302051145110.19532-100000@mail....,Jan Grant <Jan.Grant@bristol.ac.uk>,"\n\n\n\nOn Wed, 5 Feb 2003, Brian McBride wrot..."
6,NDBBIKLAGLCOPGKGADOJEEDICOAA.ejw@ics.uci.edu,Jim Whitehead <ejw@ics.uci.edu>,"\n\n\nMinutes from the February 16, 2000 bindi..."
8,27FF4FAEA8CDD211B97E00902745CBE2015B7AE7@seine...,Kevin Regan <kevinr@valicert.com>,\n\n\nThank you both for your responses.\n\n--...
10,34C79A46.A2CE9C3D@w3.org,Ian B. Jacobs <ij@w3.org>,\n\n\nChuck Letourneau wrote:\n> \n> Hello all...
...,...,...,...
975,3A007422.CAF2AB0D@w3.org,Philippe Le Hegaret <plh@w3.org>,\n\n\nJamshed Ahsan wrote:\n> \n> I do not und...
979,199809021957.MAA24807@netcom.com,Scott Luebking <phoenixl@netcom.com>,"\n\n\nHi,\nMaybe Raman could do a demo for 15 ..."
980,4.1.19990310131553.0094eda0@host.igs.net,Chuck Letourneau <cpl@starlingweb.com>,\n\n\nPlease note that we have a conference ca...
985,4.3.2.7.2.20000918173509.00d6e100@pop3.concent...,Gregory J. Rosmaita <unagi69@concentric.net>,\n\n\nin a recent post to the User Agent Acces...
