In [1]:
# standard library
import os

# basic (built-in) Python packages
import numpy as np
from numpy.linalg import norm
import pandas as pd
import matplotlib.pyplot as plt

# advanced (built-in) Python packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize

# my implemented Python functions and classes
from data.generate_data import generate_data
from model.MLE_python import MLE_python
from model.MLE import MLE
from model.OS import OS
from model.ORACLE import ORACLE
from model.Initial import Initial



# Hyperparameters

In [18]:
# --- reproducibility -----------------------------------------------------
seed = 0
np.random.seed(seed=seed)   # seeds NumPy's legacy global random state

# --- sample sizes --------------------------------------------------------
N = 100_000   # the size of the unlabeled dataset
n = 10_000    # pilot sample size
alpha = 1     # alternative setting kept for reference: n**(-0.1)
print(f"alpha={alpha:.4f}")

# --- model dimensions ----------------------------------------------------
p = 10    # feature dimension
K = 2     # (K+1) classes
M = 500   # the size of the annotator pool

# n * alpha truncated to an integer; it appears in both summary lines below
n_alpha = int(n * alpha)
print(f"[n*alpha={n_alpha}/N={N}]")
print(f"[M={M}] vs [n*alpha={n_alpha}]")

alpha=1.0000
[n*alpha=10000/N=100000]
[M=500] vs [n*alpha=10000]


# Data Generation

In [19]:
# Draw one synthetic dataset with the hyperparameters above.
# generate_data also prints the true-label counts (see the cell output).
generated = generate_data(K, p, N, n, M, alpha, seed=0)
(beta, sigma_list, theta,
 X, Y,
 X1, X2, Y1, Y2,
 A1, AY1) = generated

True Labels 1    39315
0    33292
2    27393
dtype: int64 



In [20]:
# Display the `theta` array returned by generate_data.
# NOTE(review): from the printed values it looks like the K*p free beta
# coefficients followed by the annotator sigmas — confirm against generate_data.
theta

array([ 0.02814371,  0.28414075,  0.14869406,  0.02377327,  0.08672346,
        0.06519439,  0.29191809, -0.0400845 ,  0.0611682 , -0.16687604,
       -0.49881156,  0.12770615,  0.16889639, -0.14500665,  0.44347213,
       -0.28415875,  0.00894045, -0.0365726 ,  0.29947945,  0.28708815,
        0.1       ,  0.106     ,  0.112     ,  0.118     ,  0.124     ,
        0.13      ,  0.136     ,  0.142     ,  0.148     ,  0.154     ,
        0.16      ,  0.166     ,  0.172     ,  0.178     ,  0.184     ,
        0.19      ,  0.196     ,  0.202     ,  0.208     ,  0.214     ,
        0.22      ,  0.226     ,  0.232     ,  0.238     ,  0.244     ,
        0.25      ,  0.256     ,  0.262     ,  0.268     ,  0.274     ,
        0.28      ,  0.286     ,  0.292     ,  0.298     ,  0.304     ,
        0.31      ,  0.316     ,  0.322     ,  0.328     ,  0.334     ,
        0.34      ,  0.346     ,  0.352     ,  0.358     ,  0.364     ,
        0.37      ,  0.376     ,  0.382     ,  0.388     ,  0.39

# Initial Estimator

In [21]:
# Initial estimator: fit one logistic regression (no intercept) per annotator on
# the rows selected for that annotator, then pool the per-annotator fits.
init_b = np.zeros((M, K+1, p))     # per-annotator coefficients, class 0 as baseline
init_beta = np.zeros((K+1, p))     # pooled (averaged) normalised coefficients
init_sigma = np.zeros(M)           # per-annotator scale, 1 / ||coefficients||

for m in range(M):
    idx = (A1[:, m] != 0)          # rows with a nonzero entry in A1 for annotator m
    X_m = X1[idx]
    y_m = AY1[idx, m]
    clf = LogisticRegression(random_state=0, fit_intercept=False).fit(X_m, y_m)

    # With only two observed classes sklearn returns coef_ of shape (1, p), which
    # would silently broadcast into the (K+1, p) slot below. Fail loudly instead.
    if clf.coef_.shape != (K + 1, p):
        raise ValueError(
            f"annotator {m}: expected coef_ of shape {(K + 1, p)}, "
            f"got {clf.coef_.shape}; the annotator's labels probably do not "
            f"cover all K+1 classes"
        )

    # Use class 0 as the reference class: subtract its row from every row.
    # Done out of place so the result never depends on overlapping in-place views.
    coef_m = clf.coef_ - clf.coef_[0]
    init_b[m] = coef_m
    init_sigma[m] = 1 / norm(coef_m)
    init_beta += coef_m * init_sigma[m] / M

# Drop the all-zero reference row; init_beta is now (K, p).
init_beta = init_beta[1:]

- Raw:

In [22]:
# Estimation error of the raw initial estimator: norm of the difference
# to the true beta (without its first row) and to the true sigma_list.
init_beta_err = norm(init_beta - beta[1:])
init_sigma_err = norm(init_sigma - sigma_list)
init_beta_err, init_sigma_err

(0.008340973361465005, 1.8177646523736617)

- Rescale:

In [23]:
# Divide both parts of the initial estimate by the norm of its beta part,
# then report the same two error norms as for the raw estimate.
init_scale = norm(init_beta)
resc_init_beta, resc_init_sigma = init_beta / init_scale, init_sigma / init_scale
(
    norm(resc_init_beta - beta[1:]),
    norm(resc_init_sigma - sigma_list),
)

(0.004951624991006946, 1.7941770251084135)

# One Step Update

In [24]:
# One-step update started from the initial estimates of beta and sigma.
os_model = OS(X1, AY1, A1, K, init_beta, init_sigma)
os_estimates = os_model.one_step_update()
os_beta, os_sigma = os_estimates

- Raw:

In [25]:
# Error norms of the raw one-step estimate against the flattened true beta
# (first row dropped) and the true sigma_list.
os_beta_err = norm(os_beta - beta[1:].ravel())
os_sigma_err = norm(os_sigma - sigma_list)
os_beta_err, os_sigma_err

(0.00663003041749186, 1.410350860645621)

- Rescale:

In [26]:
# Rescale the one-step estimate by the norm of its beta part.
# New arrays are created instead of using in-place `/=`, so `os_beta` / `os_sigma`
# keep the raw estimate: the "Raw" cell above then reproduces the same numbers
# when re-run, and the arrays returned by the model are never overwritten.
# Naming follows the `resc_init_*` variables of the initial-estimator section.
os_scale = norm(os_beta)
print(os_scale)
resc_os_beta = os_beta / os_scale
resc_os_sigma = os_sigma / os_scale
norm(resc_os_beta - beta[1:].ravel()), norm(resc_os_sigma - sigma_list)

0.9952609194000794


(0.0046476646040228805, 1.4017330207718566)

# Oracle

In [27]:
# Oracle benchmark: the model receives the true sigma_list (not an estimate)
# and only beta is iterated by NR_alg, starting from init_beta.
oracle_model = ORACLE(X1, AY1, A1, K, init_beta, sigma_list)
oracle_nr_kwargs = dict(max_steps=20, lbd=0.001)
oracle_beta = oracle_model.NR_alg(**oracle_nr_kwargs)

[Step 1] beta difference norm:0.00589
[Step 2] beta difference norm:0.00004


- Raw:

In [28]:
# Error norm of the oracle beta against the flattened true beta (first row dropped).
oracle_beta_err = norm(oracle_beta - beta[1:].ravel())
oracle_beta_err

0.004229443672195607

# MLE

In [29]:
# Joint fit of beta and sigma, started from the initial estimates.
# `MLE` is already imported in the imports cell at the top of the notebook,
# so the duplicate mid-notebook import was removed.
mle_model = MLE(X1, AY1, A1, K, init_beta, init_sigma)
mle_model.NR_alg(max_steps=20, lbd=0.001)

# The fitted values are read from the model's attributes after NR_alg has run.
mle_beta = mle_model.beta
mle_sigma = mle_model.sigma

[Step 1] theta difference norm:1.16345
[Step 2] theta difference norm:0.22375


- Raw:

In [30]:
# Error norms of the raw MLE; both beta arrays are flattened before subtracting.
mle_beta_err = norm(mle_beta.ravel() - beta[1:].ravel())
mle_sigma_err = norm(mle_sigma - sigma_list)
mle_beta_err, mle_sigma_err

(0.008513920647739232, 1.405484607678991)

- Rescale:

In [31]:
# Rescale the MLE by the norm of its beta part.
# `mle_beta` / `mle_sigma` were assigned directly from `mle_model.beta` /
# `mle_model.sigma`, so an in-place `/=` would also rescale the fitted model's
# own attributes. Out-of-place division leaves the model and the raw estimate
# untouched and keeps the "Raw" cell above reproducible when re-run.
mle_scale = norm(mle_beta)
resc_mle_beta = mle_beta / mle_scale
resc_mle_sigma = mle_sigma / mle_scale
norm(resc_mle_beta.ravel() - beta[1:].ravel()), norm(resc_mle_sigma - sigma_list)

(0.003953508566164229, 1.4437815206800964)

# Compare

In [32]:
# Set-up for the repeated comparison below.
RMSE_results = list()   # one row of error norms is appended per replication
B = 100                 # number of replications (seeds 0 .. B-1)

In [None]:
# Repeated comparison of the Initial, One-Step (OS), MLE and Oracle estimators.
# For each seed 0..B-1 a fresh dataset is generated, every estimator is fitted, and
# the norm of its estimation error is recorded, raw and after rescaling by the norm
# of the estimated beta. The recorded values are Euclidean norms of the error
# vectors; the "rmse" names are kept from the original code.

# Newton settings shared by MLE and Oracle, same values as in the single-run cells.
NR_MAX_STEPS = 20
NR_LBD = 0.001

# Output file: ~/Desktop/[M=<M>]rmse_data.csv, resolved from the current user's home
# directory instead of a hard-coded user name; the directory is created if missing.
output_dir = os.path.join(os.path.expanduser("~"), "Desktop")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"[M={M}]rmse_data.csv")

RMSE_COLUMNS = ["seed", "n", "alpha", "M",
                "init_beta", "init_sigma",
                "os_beta_raw", "os_sigma_raw",
                "os_beta_resc", "os_sigma_resc",
                "mle_beta_raw", "mle_sigma_raw",
                "mle_beta_resc", "mle_sigma_resc",
                "oracle_beta_raw", "oracle_beta_resc"]

# Start from an empty list so re-running this cell does not append duplicate rows.
RMSE_results = []

for seed in range(B):
    RMSE_list = [seed, n, alpha, M, ]
    np.random.seed(seed)
    beta, sigma_list, theta, X, Y, X1, X2, Y1, Y2, A1, AY1 = generate_data(K, p, N, n, M, alpha, seed=seed)
    true_beta = beta[1:].ravel()   # flattened true beta without its first row

    # Initial Estimator
    init_model = Initial(X1, AY1, A1, K)
    init_model.opt_alg()
    init_beta = init_model.initial_beta[1:]  # (K, p)
    init_sigma = init_model.initial_sigma    # (M,)
    init_beta_rmse = norm(init_beta.ravel() - true_beta)
    init_sigma_rmse = norm(init_sigma - sigma_list)
    RMSE_list += [init_beta_rmse, init_sigma_rmse]

    # OS (One-Step) Estimator
    # [1. Raw]
    os_model = OS(X1, AY1, A1, K, init_beta, init_sigma)
    os_beta, os_sigma = os_model.one_step_update()
    os_beta_rmse = norm(os_beta - true_beta)
    os_sigma_rmse = norm(os_sigma - sigma_list)
    RMSE_list += [os_beta_rmse, os_sigma_rmse]
    # [2. Rescale] (out of place: never overwrite arrays returned by the model)
    os_scale = norm(os_beta)
    os_beta = os_beta / os_scale
    os_sigma = os_sigma / os_scale
    os_beta_rmse = norm(os_beta - true_beta)
    os_sigma_rmse = norm(os_sigma - sigma_list)
    RMSE_list += [os_beta_rmse, os_sigma_rmse]

    # MLE
    # [1. Raw]
    # Same usage as the single-run MLE cell: run NR_alg with the shared step cap
    # (the loop previously passed max_steps=0), then read the fitted values from the
    # model attributes and flatten beta before comparing with the flattened truth.
    mle_model = MLE(X1, AY1, A1, K, init_beta, init_sigma)
    mle_model.NR_alg(max_steps=NR_MAX_STEPS, lbd=NR_LBD)
    mle_beta = np.ravel(mle_model.beta)
    mle_sigma = mle_model.sigma
    mle_beta_rmse = norm(mle_beta - true_beta)
    mle_sigma_rmse = norm(mle_sigma - sigma_list)
    RMSE_list += [mle_beta_rmse, mle_sigma_rmse]
    # [2. Rescale] (out of place: mle_beta / mle_sigma alias the model's attributes)
    mle_scale = norm(mle_beta)
    mle_beta = mle_beta / mle_scale
    mle_sigma = mle_sigma / mle_scale
    mle_beta_rmse = norm(mle_beta - true_beta)
    mle_sigma_rmse = norm(mle_sigma - sigma_list)
    RMSE_list += [mle_beta_rmse, mle_sigma_rmse]

    # Oracle (receives the true sigma_list)
    # [1. Raw]
    oracle_model = ORACLE(X1, AY1, A1, K, init_beta, sigma_list)
    ora_beta = oracle_model.NR_alg(max_steps=NR_MAX_STEPS, lbd=NR_LBD)
    ora_beta_rmse = norm(ora_beta - true_beta)
    RMSE_list.append(ora_beta_rmse)
    # [2. Rescale]
    ora_scale = norm(ora_beta)
    ora_beta = ora_beta / ora_scale
    ora_beta_rmse = norm(ora_beta - true_beta)
    RMSE_list.append(ora_beta_rmse)

    # Record Results
    # The CSV is rewritten after every replication, so the file always holds the
    # replications finished so far.
    RMSE_results.append(RMSE_list)
    a = pd.DataFrame(RMSE_results, columns=RMSE_COLUMNS)
    a.to_csv(output_path)

True Labels 1    39315
0    33292
2    27393
dtype: int64 

