In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
import random

# Positive class: 2-D Gaussian with mean (3, 3) and identity covariance.
MEAN_P = [3, 3]
COV_P = [[1, 0], [0, 1]]

# Negative class: 2-D Gaussian with mean (0, 0) and identity covariance.
MEAN_N = [0, 0]
COV_N = [[1, 0], [0, 1]]

# Class prior: probability that a drawn example belongs to the positive class.
ALPHA = 0.8


def get_samples(n: int):
    """Draw ``n`` labelled points from the two-Gaussian mixture.

    Each point is positive (label 1) with probability ``ALPHA`` and is then
    sampled from N(MEAN_P, COV_P); otherwise it is negative (label 0) and is
    sampled from N(MEAN_N, COV_N).

    Args:
        n: Number of examples to draw; must be non-negative.

    Returns:
        A tuple ``(xs, ys)``: ``xs`` has shape ``(n, len(MEAN_P))`` and ``ys``
        has shape ``(n,)`` and holds the 0/1 class of each row of ``xs``.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if n == 0:
        # np.concatenate rejects an empty list, so build the empty result
        # explicitly instead of crashing.
        return np.empty((0, len(MEAN_P))), np.empty(0, dtype=int)

    x, y = [], []
    for _ in range(n):
        # The class is drawn with the stdlib RNG and the features with NumPy's
        # global RNG; both need seeding if reproducible data is required.
        if random.random() < ALPHA:
            x.append(np.random.multivariate_normal(MEAN_P, COV_P, 1))
            y.append(1)
        else:
            x.append(np.random.multivariate_normal(MEAN_N, COV_N, 1))
            y.append(0)
    # Each draw has shape (1, d); concatenating yields an (n, d) matrix.
    return np.concatenate(x), np.asarray(y)

In [None]:
# Sizes of the three synthetic splits.
N_TRAIN = 10_000  # training examples
N_VALID = 1_000  # validation examples
N_TEST = 1_000  # test examples

# Sample the splits with separate calls, in the order train, validation, test.
(train_xs, train_ys), (valid_xs, valid_ys), (test_xs, test_ys) = (
    get_samples(n) for n in (N_TRAIN, N_VALID, N_TEST)
)

In [None]:
from utils import plot_x_y

# Visualise the training features together with their true classes.
# plot_x_y comes from the project-local utils module; its rendering is not
# visible from this notebook.
plot_x_y(train_xs, train_ys)

In [None]:
# Label frequency: the constant probability with which a positive example
# receives a label.
c = 0.1


def propensity_score(ys):
    """Return a constant propensity score of ``c`` for every entry of ``ys``.

    Only the length of ``ys`` is used; the result is a plain list with one
    copy of ``c`` per example.
    """
    return [c for _ in range(len(ys))]

In [None]:
# Constant propensity scores, one per example, for each of the three splits.
train_es, valid_es, test_es = (
    propensity_score(split_ys) for split_ys in (train_ys, valid_ys, test_ys)
)

In [None]:
def get_label(y, e):
    """Sample the observed label for a single example.

    An example with ``y == 0`` is never labelled (returns 0). Any other example
    is labelled (returns 1) with probability ``e`` and stays unlabelled
    (returns 0) otherwise.
    """
    if y != 0:
        # Bernoulli(e) draw from the stdlib RNG; only made for non-zero y.
        return int(random.random() < e)
    return 0

In [None]:
# Observed labels s: each example's true class is paired with its own
# propensity score and passed through get_label.
train_ss = list(map(get_label, train_ys, train_es))
valid_ss = list(map(get_label, valid_ys, valid_es))
test_ss = list(map(get_label, test_ys, test_es))

In [None]:
from utils import plot_x_s

# Visualise the training features together with their observed labels s.
# plot_x_s comes from the project-local utils module; its rendering is not
# visible from this notebook.
plot_x_s(train_xs, train_ss)

In [None]:
def save_data(filename, xs, ys, ss, es):
    """Write one dataset split to ``filename`` as CSV.

    The file starts with the header ``x_0,x_1,y,s,e`` followed by one row per
    example. Missing parent directories are created first, so the notebook
    also runs on a checkout where the output folder does not exist yet.

    Args:
        filename: Destination path of the CSV file.
        xs: Feature vectors; only components 0 and 1 of each are written.
        ys: Values written to the ``y`` column.
        ss: Values written to the ``s`` column.
        es: Values written to the ``e`` column.

    The four sequences are zipped, so rows stop at the shortest one.
    """
    parent = os.path.dirname(filename)
    if parent:  # a bare file name has no directory to create
        os.makedirs(parent, exist_ok=True)
    with open(filename, "wt") as f:
        f.write("x_0,x_1,y,s,e\n")
        for x, y, s, e in zip(xs, ys, ss, es):
            f.write(f"{x[0]},{x[1]},{y},{s},{e}\n")


def save_c(filename, c):
    """Write the label frequency ``c`` to ``filename`` as plain text.

    The value is formatted with ``str``-style f-string formatting and written
    without a trailing newline. Missing parent directories are created first.

    Args:
        filename: Destination path of the text file.
        c: Value to store.
    """
    parent = os.path.dirname(filename)
    if parent:  # a bare file name has no directory to create
        os.makedirs(parent, exist_ok=True)
    with open(filename, "wt") as f:
        f.write(f"{c}")

In [None]:
# Persist the constant-propensity dataset under data/scar/.
# ("scar" presumably stands for Selected Completely At Random — confirm.)
data_id = "scar"

# One CSV per split with the columns x_0, x_1, y, s, e.
save_data(f"data/{data_id}/train.csv", train_xs, train_ys, train_ss, train_es)
save_data(f"data/{data_id}/valid.csv", valid_xs, valid_ys, valid_ss, valid_es)
save_data(f"data/{data_id}/test.csv", test_xs, test_ys, test_ss, test_es)

# Store the label frequency next to the splits.
save_c(f"data/{data_id}/c.txt", c)

In [None]:
class LabelingMechanism:
    """Feature-dependent labeling mechanism producing propensity scores.

    Each selected attribute is multiplied by its sign, min-max normalised with
    the range learned in ``fit``, raised to ``power`` and mapped linearly onto
    ``[min_prob, max_prob]``. The propensity score of an example is the
    geometric mean of these per-attribute values.

    Args:
        propensity_attributes: Column indices of the features that drive the
            propensity score.
        propensity_attributes_signs: One multiplier per attribute (e.g. ``1``
            or ``-1``) applied before normalisation.
        min_prob: Value assigned to the smallest fitted (signed) attribute.
        max_prob: Value assigned to the largest fitted (signed) attribute.
        power: Exponent applied to the normalised attribute.
    """

    def __init__(
        self,
        propensity_attributes,
        propensity_attributes_signs,
        min_prob=0.0,
        max_prob=1.0,
        power=1,
    ):
        assert len(propensity_attributes) == len(
            propensity_attributes_signs
        ), "size of attributes and signs must be same"
        self.propensity_attributes = np.array(propensity_attributes)
        self.propensity_attributes_signs = np.array(propensity_attributes_signs)
        self.min_prob = min_prob
        self.max_prob = max_prob
        self.power = power

        # Per-attribute range of the signed features; populated by fit().
        self.min_x = None
        self.max_x = None

    def _signed_attributes(self, xs):
        """Select the propensity columns of ``xs`` and apply their signs."""
        xs = np.asarray(xs)
        return xs[:, self.propensity_attributes] * self.propensity_attributes_signs

    def fit(self, xs):
        """Learn the per-attribute min/max used for normalisation.

        Args:
            xs: 2-D feature matrix, one row per example.

        Raises:
            ValueError: If a selected attribute is constant in ``xs``; its
                range would be zero and normalisation would divide by zero.
        """
        xs_ = self._signed_attributes(xs)
        min_x = xs_.min(0)
        max_x = xs_.max(0)
        if np.any(max_x == min_x):
            raise ValueError(
                "every propensity attribute must take at least two distinct values"
            )
        self.min_x = min_x
        self.max_x = max_x

    def propensity_score(self, xs):
        """Compute the propensity score of every row of ``xs``.

        Args:
            xs: 2-D feature matrix, one row per example.

        Returns:
            1-D array with one propensity score per row.
        """
        assert (
            self.min_x is not None and self.max_x is not None
        ), "run fit() before calculating propensity score"
        xs_ = self._signed_attributes(xs)
        # Rows outside the fitted range (e.g. validation/test points beyond the
        # training min/max) would normalise to < 0 or > 1 and yield invalid
        # probabilities, or NaN under a fractional root; clamp them instead.
        normalized = np.clip(
            (xs_ - self.min_x) / (self.max_x - self.min_x), 0.0, 1.0
        )
        scaled = self.min_prob + (normalized ** self.power) * (
            self.max_prob - self.min_prob
        )
        # Geometric mean over the attributes.
        es = (scaled ** (1 / len(self.propensity_attributes))).prod(1)
        return es

    @staticmethod
    def label_frequency(es, ys):
        """Return the mean propensity score over the examples with ``ys == 1``.

        Args:
            es: Propensity scores, one per example.
            ys: True classes, aligned with ``es``.

        Returns:
            The mean of ``es`` restricted to positives (NaN if there are none).
        """
        # Coerce to arrays so plain Python lists are masked element-wise too.
        es_pos = np.asarray(es)[np.asarray(ys) == 1]
        return es_pos.mean()

In [None]:
# Labeling mechanism driven by feature column 0 only (sign +1): the min-max
# normalised value is raised to the 4th power and mapped onto [0.0, 1.0].
lm = LabelingMechanism([0], [1], min_prob=0.0, max_prob=1.0, power=4)
# The normalisation range is taken from the training features.
lm.fit(train_xs)

In [None]:
# Per-example propensity scores for every split, all normalised with the range
# fitted on the training features.
# NOTE: this rebinds train_es / valid_es / test_es from the earlier
# constant-propensity section.
train_es = lm.propensity_score(train_xs)
valid_es = lm.propensity_score(valid_xs)
test_es = lm.propensity_score(test_xs)

# Label frequency: mean propensity score over the true positives of the
# training split (also rebinds the earlier constant c).
c = lm.label_frequency(train_es, train_ys)

In [None]:
# Sample the observed labels s for every split. Each split pairs its OWN true
# classes with its own propensity scores; zipping train_ys with the
# validation/test scores would silently truncate to the shorter sequence and
# label examples according to unrelated classes (even true negatives).
train_ss = [get_label(y, e) for y, e in zip(train_ys, train_es)]
valid_ss = [get_label(y, e) for y, e in zip(valid_ys, valid_es)]
test_ss = [get_label(y, e) for y, e in zip(test_ys, test_es)]

In [None]:
# Visualise the training features with the observed labels sampled from the
# feature-dependent propensity scores (plot_x_s is a project-local helper;
# its rendering is not visible from this notebook).
plot_x_s(train_xs, train_ss)

In [None]:
# Persist the dataset labelled by the single-attribute mechanism under
# data/sar/. ("sar" presumably stands for Selected At Random — confirm.)
data_id = "sar"

# One CSV per split with the columns x_0, x_1, y, s, e.
save_data(f"data/{data_id}/train.csv", train_xs, train_ys, train_ss, train_es)
save_data(f"data/{data_id}/valid.csv", valid_xs, valid_ys, valid_ss, valid_es)
save_data(f"data/{data_id}/test.csv", test_xs, test_ys, test_ss, test_es)

# Store the label frequency computed on the training split.
save_c(f"data/{data_id}/c.txt", c)

In [None]:
# Labeling mechanism driven by both feature columns (0 and 1, sign +1 each):
# every normalised attribute is raised to the 4th power, mapped onto
# [0.0, 1.0], and the two values are combined by a geometric mean.
# NOTE: this rebinds the lm object created for the single-attribute mechanism.
lm = LabelingMechanism([0, 1], [1, 1], min_prob=0.0, max_prob=1.0, power=4)
# The normalisation range is taken from the training features.
lm.fit(train_xs)

In [None]:
# Per-example propensity scores for every split under the two-attribute
# mechanism, all normalised with the range fitted on the training features.
# NOTE: this rebinds train_es / valid_es / test_es from the previous section.
train_es = lm.propensity_score(train_xs)
valid_es = lm.propensity_score(valid_xs)
test_es = lm.propensity_score(test_xs)

# Label frequency: mean propensity score over the true positives of the
# training split (rebinds c again).
c = lm.label_frequency(train_es, train_ys)

In [None]:
# Sample the observed labels s for every split. Each split pairs its OWN true
# classes with its own propensity scores; zipping train_ys with the
# validation/test scores would silently truncate to the shorter sequence and
# label examples according to unrelated classes (even true negatives).
train_ss = [get_label(y, e) for y, e in zip(train_ys, train_es)]
valid_ss = [get_label(y, e) for y, e in zip(valid_ys, valid_es)]
test_ss = [get_label(y, e) for y, e in zip(test_ys, test_es)]

In [None]:
# Visualise the training features with the observed labels sampled from the
# two-attribute propensity scores (plot_x_s is a project-local helper; its
# rendering is not visible from this notebook).
plot_x_s(train_xs, train_ss)

In [None]:
# Persist the dataset labelled by the two-attribute mechanism under data/pg/.
# NOTE(review): the meaning of the "pg" identifier is not evident from this
# notebook — confirm and spell it out here.
data_id = "pg"

# One CSV per split with the columns x_0, x_1, y, s, e.
save_data(f"data/{data_id}/train.csv", train_xs, train_ys, train_ss, train_es)
save_data(f"data/{data_id}/valid.csv", valid_xs, valid_ys, valid_ss, valid_es)
save_data(f"data/{data_id}/test.csv", test_xs, test_ys, test_ss, test_es)

# Store the label frequency computed on the training split.
save_c(f"data/{data_id}/c.txt", c)