In [1]:
import snorkel
from snorkel.labeling import labeling_function
from snorkel.augmentation import transformation_function

import numpy as np
import numpy.random as rand

$\forall$ author label $X:$ let $E_X = \{e_1, ..., e_k\}$ s.t. $\forall e \in E_X: author(e) = X$

For author labels $X, Y$ we write $X = Y$ iff $\exists$ person $\gamma$ s.t. $X$ is a label for $\gamma$ and $Y$ is a label for $\gamma$ (i.e. both labels belong to the same person).

# Samples

 - any sample is $((e_1, e_2), b)$ with $e_1, e_2 \in E$ and $b \in \{0, 1\}$, where $E = \bigcup_X E_X$ is the set of all e-mails; $b = 1$ iff $\exists E_X$ s.t. $e_1, e_2 \in E_X$ (note that $E_X$ itself, or the membership $e_1, e_2 \in E_X$, may not be observed)

## positive samples $(b=1)$

 - number: $\sum_X \binom{|E_X|}{2}$ (unordered pairs of distinct e-mails, i.e. not counting the trivial samples below)
 - trivial positive sample $\forall e \in E: ((e, e), 1)$
 - for each $X$, randomly draw $k$ tuples $(e_1, e_2)$ with $e_1, e_2 \in E_X$ and $k$ proportional to $|E_X|$
 
 
## negative samples $(b=0)$

 - randomly sample $X$ and $Y$, then randomly sample $e_X$ from $E_X$ and $e_Y$ from $E_Y$ to form sample $((e_X, e_Y), 0)$
 - with overwhelming probability, $X \neq Y$, so the created sample is most likely a true negative

In [2]:
# Toy corpus: maps each author label to the ids of the e-mails under that label.
corpus = {
    "a": [1, 2, 3, 4],
    "b": [5, 6, 7],
    "c": [8, 9],
}

# Per-author id lists, in the corpus' insertion order.
emails = [*corpus.values()]

# Flat list of (email_id, author_label) pairs, one entry per e-mail.
emails2 = [
    (mail_id, label)
    for label, mail_ids in corpus.items()
    for mail_id in mail_ids
]

In [40]:
@transformation_function()
def positive_example(x):
    """Build a positive pair (label 1) for the given e-mail.

    Args:
        x: An ``(e_id, auth_label)`` tuple; ``auth_label`` must be a key of
            the module-level ``corpus``.

    Returns:
        ``((e_id, other_id), 1)`` where ``other_id`` is drawn at random from
        ``corpus[auth_label]``. The draw may return ``e_id`` itself, which
        yields the trivial pair ``(e, e)``.

    Raises:
        KeyError: If ``auth_label`` is not a key of ``corpus``.
    """
    e_id, auth_label = x
    # Look the author's e-mails up once and sample from that list.
    auth_mails = corpus[auth_label]
    return (e_id, rand.choice(auth_mails)), 1


@transformation_function()
def negative_example(x):
    """Build a (presumed) negative pair (label 0) for the given e-mail.

    Args:
        x: An ``(e_id, auth_label)`` tuple.

    Returns:
        ``((e_id, other_id), 0)`` where ``other_id`` is drawn at random from
        the e-mails of a randomly chosen label other than ``auth_label``.

    Raises:
        ValueError: If ``corpus`` has no label other than ``auth_label``.
    """
    e_id, auth_label = x
    # Iterate the dict instead of building a set difference: dict iteration
    # follows insertion order, whereas the iteration order of a set of strings
    # can change between interpreter runs, which would make the draw
    # irreproducible even with a fixed random seed.
    other_labels = [label for label in corpus if label != auth_label]
    if not other_labels:
        raise ValueError(f"no author label other than {auth_label!r} in corpus")
    other_label = rand.choice(other_labels)
    return (e_id, rand.choice(corpus[other_label])), 0

In [41]:
# Transformation functions available to the augmentation step.
tfs = [positive_example, negative_example]

In [42]:
from snorkel.augmentation import RandomPolicy, MeanFieldPolicy
from snorkel.augmentation import TFApplier

# Policy without explicit probabilities; it is not used by the applier below.
random_policy = RandomPolicy(
    len(tfs),
    sequence_length=1,
    n_per_original=2,
    keep_original=False,
)

# Policy with explicit probabilities `p`.
# NOTE(review): presumably one probability per entry of `tfs`, in list order
# (0.1 for positive_example, 0.9 for negative_example) — confirm against the
# snorkel documentation.
mf_policy = MeanFieldPolicy(
    len(tfs),
    sequence_length=1,
    p=[0.1, 0.9],
    n_per_original=2,
    keep_original=False,
)

# Apply the transformation functions to every (email_id, author_label) pair.
tf_applier = TFApplier(tfs, mf_policy)
samples = tf_applier.apply(emails2)

# Create real data (small)

In [11]:
import pickle
import numpy as np

In [2]:
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
with open("first_twenty.pkl", mode="rb") as pkl_file:
    first_twenty = pickle.load(pkl_file)

In [6]:
# Print the shape of every loaded array, one per line.
for array in first_twenty:
    print(array.shape)

(172, 768)
(424, 768)
(173, 768)
(417, 768)
(496, 768)
(135, 768)
(63, 768)
(200, 768)
(219, 768)
(97, 768)
(212, 768)
(509, 768)
(575, 768)
(108, 768)
(632, 768)
(8, 768)
(954, 768)
(291, 768)
(1442, 768)
(8, 768)


In [35]:
# Largest number of rows among the arrays; every array is padded up to it.
l = max(e.shape[0] for e in first_twenty)

# Truncated variant: keep only the first 8 rows of each array.
# NOTE(review): 8 equals the smallest row count in the shapes printed above,
# presumably chosen so that all slices have the same length — confirm.
cut = np.asarray([e[:8, :] for e in first_twenty])

# Zero-padded variant: append all-zero rows until each array has `l` rows.
# The pad width is read from the array itself rather than hard-coded as 768,
# so this keeps working if the number of columns changes.
padded = np.asarray([
    np.concatenate((e, np.zeros((l - e.shape[0], e.shape[1]))))
    for e in first_twenty
])