In [1]:
# basic (built-in) Python packages
import numpy as np
from numpy.linalg import norm
import pandas as pd
import matplotlib.pyplot as plt
from copy import copy
import sys
sys.path.append('../')

# advanced (built-in) Python packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize

# my implemented Python functions and classes
from generate_data import *
from model.BaseModel import BaseModel
from model.Initial import Initial
from model.OS import OS
from model.MS import MS
from model.ORACLE_beta import ORACLE_beta
from model.ORACLE_sigma import ORACLE_sigma
from utils import *

# Hyperparameters

In [2]:
seed = 1
np.random.seed(seed=seed)

N = 2000        # the size of the unlabeled dataset
r = 0.5
n = int(N * r)            # pilot sample size
alpha = 0.5 # n**(-0.1)  
print(f"alpha={alpha:.4f}")

p = 20          # feature dimension
K = 1           # (K+1) classes
M = int(0.5 * int(n / np.log(n)))          # the size of the annotator pool #int(n / np.log(n))
print(f"[n*alpha={int(n*alpha)}/N={N}]")
print(f"[M={M}] vs [n*alpha={int(n*alpha)}]")

alpha=0.5000
[n*alpha=500/N=2000]
[M=72] vs [n*alpha=500]


In [3]:
# parameters - beta
beta = np.random.randn(K+1, p)
beta[0] = 0
beta_norm = norm(beta)
beta = beta / beta_norm

# annotator sigma
# sigma = np.arange(0.1, 4.1, 4/M)
sigma = np.ones(M) * 2
sigma[0:int(M/2)] = 0.5

# parameter vector
theta = np.zeros(K * p + M)
theta[:(p*K)] = beta[1:].ravel()
theta[(p*K):] = sigma.reshape(-1)

# Data Generation

In [4]:
X, Y, X1, X2, Y1, Y2, A1, AY1, pilot_ids, rest_ids = generate_data(K, p, N, n, M, alpha, beta, sigma, seed=0)

True Labels 0    1072
1     928
dtype: int64 



In [5]:
sigma

array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
       0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
       0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 2. , 2. , 2. ,
       2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. ,
       2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. , 2. ,
       2. , 2. , 2. , 2. , 2. , 2. , 2. ])

# Initial Estimator

In [6]:
init_model = Initial(X1, AY1, A1, K)
init_beta, init_sigma, init_betams = init_model.init_param()

In [7]:
init_beta_rmse = compute_rmse(init_beta, beta[1:])
init_sigma_rmse = compute_rmse(init_sigma, sigma)
print(f"Init beta:  {init_beta_rmse:.7f}")
print(f"Init sigma: {init_sigma_rmse:.7f}")

Init beta:  0.0077006
Init sigma: 0.1773317


# One Step / Two Step / Multiple Step

- OS:

In [8]:
os_model = OS(X1, AY1, A1, K, alpha, init_beta, init_sigma)
os_beta, os_sigma = os_model.update_alg(max_steps=1, tol=1e-5)

os_beta_rmse = compute_rmse(os_beta, beta[1:])
os_sigma_rmse = compute_rmse(os_sigma, sigma)
print(f"\nOS beta: {os_beta_rmse:.7f}")
print(f"OS sigma: {os_sigma_rmse:.7f}")

######## [Step 1] ########
norm(gradient): 1.0991745

OS beta: 0.0054805
OS sigma: 0.1239398


- TS:

In [9]:
ts_model = MS(X1, AY1, A1, K, alpha, init_beta, init_sigma)
ts_beta, ts_sigma = ts_model.update_alg(max_steps=2, true_beta=beta[1:].ravel())

######## [Step 1] ########
norm(beta_gradient): 1.0926272
norm(sigma_gradient): 0.1197932
RMSE(beta): 0.0245094
######## [Step 2] ########
norm(beta_gradient): 0.1668051
RMSE(beta): 0.0261148


In [10]:
ts_beta_rmse = compute_rmse(ts_beta, beta[1:])
ts_sigma_rmse = compute_rmse(ts_sigma, sigma)
print(f"TS beta: {ts_beta_rmse:.7f}")
print(f"TS sigma: {ts_sigma_rmse:.7f}")

TS beta: 0.0058394
TS sigma: 0.1239398


- MS:

In [11]:
ms_model = MS(X1, AY1, A1, K, alpha, init_beta, init_sigma)
ms_beta, ms_sigma = ms_model.update_alg(max_steps=3, true_beta=beta[1:].ravel(), echo=False)

In [12]:
ms_beta_rmse = compute_rmse(ms_beta, beta[1:])
ms_sigma_rmse = compute_rmse(ms_sigma, sigma)
print(f"MS beta:  {ms_beta_rmse:.7f}")
print(f"MS sigma: {ms_sigma_rmse:.7f}")

MS beta:  0.0058325
MS sigma: 0.1239398


# Oracle_beta

In [13]:
orab_model = ORACLE_beta(X1, AY1, A1, K, init_beta, sigma)
oracle_beta = orab_model.update_alg(max_steps=5, tol=1e-5, true_beta=beta[1:])

oracle_beta_rmse = compute_rmse(oracle_beta, beta[1:])
print(f"Oracle beta: {oracle_beta_rmse:.7f}")

######## [Step 1] ########
norm(gradient): 0.7765415
RMSE(beta): 0.0286215
######## [Step 2] ########
norm(gradient): 0.0149188
RMSE(beta): 0.0289121
######## [Step 3] ########
norm(gradient): 0.0000147
RMSE(beta): 0.0289125
Oracle beta: 0.0056847


In [14]:
# diff_mom, diff_son = orabeta_model.check(beta[1:], sigma)
# plt.boxplot([diff_mom, diff_son.ravel()])
# plt.show()

# Oracle_sigma

In [15]:
oras_model = ORACLE_sigma(X1, AY1, A1, K, init_beta, init_sigma)
oracle_sigma = oras_model.update_alg(max_steps=10, tol=1e-5, true_sigma=sigma)

######## [Step 1] ########
norm(sigma): 0.1197932
RMSE(sigma): 1.0516645
######## [Step 2] ########
norm(sigma): 0.0203683
RMSE(sigma): 1.0862621
######## [Step 3] ########
norm(sigma): 0.0014915
RMSE(sigma): 1.0935520
######## [Step 4] ########
norm(sigma): 0.0000369
RMSE(sigma): 1.0935698
######## [Step 5] ########
norm(sigma): 0.0000000
RMSE(sigma): 1.0935698


In [16]:
oracle_sigma_rmse = compute_rmse(oracle_sigma, sigma)
print(f"Oracle sigma: {oracle_sigma_rmse:.7f}")

Oracle sigma: 0.1288784


# Results

In [17]:
print(f"Init beta: {init_beta_rmse:.7f}")
print(f"OS   beta: {os_beta_rmse:.7f}")
print(f"TS   beta: {ts_beta_rmse:.7f}")
print(f"MS   beta: {ms_beta_rmse:.7f}")
print(f"Ora  beta: {oracle_beta_rmse:.7f}")

Init beta: 0.0077006
OS   beta: 0.0054805
TS   beta: 0.0058394
MS   beta: 0.0058325
Ora  beta: 0.0056847


In [18]:
print(f"Init sigma: {init_sigma_rmse:.7f}")
print(f"OS   sigma: {os_sigma_rmse:.7f}")
print(f"TS   sigma: {ts_sigma_rmse:.7f}")
print(f"MS   sigma: {ms_sigma_rmse:.7f}")
print(f"Ora  sigma: {oracle_sigma_rmse:.7f}")

Init sigma: 0.1773317
OS   sigma: 0.1239398
TS   sigma: 0.1239398
MS   sigma: 0.1239398
Ora  sigma: 0.1288784


# Compare Experiments

In [65]:
from multiprocessing import Pool
import os, time, random

In [66]:
p = 20                           # feature dimension
K = 2                            # (K+1) classes
r = 0.5
B = 1000

In [67]:
alpha_list = [0.25, 0.5, 0.75, 1]
N_list = [2000, 5000, 10000,]

In [68]:
# parameters - beta
np.random.seed(0)
beta = np.random.randn(K+1, p)
beta[0] = 0
beta_norm = norm(beta)
beta = beta / beta_norm

In [73]:
# for N in N_list:
#     n = int(N * r)                 # pilot sample size
#     M = int(0.5 * n / np.log(n))   # the size of the annotator pool
#     # annotator sigma
#     sigma = np.ones(M) * 2
#     sigma[0:int(M/2)] = 0.5
#     # parameter vector
#     theta = np.zeros(K * p + M)
#     theta[:(p*K)] = beta[1:].ravel()
#     theta[(p*K):] = sigma.reshape(-1)
    
#     for alpha in alpha_list:
#         print(f"[n*alpha={int(n*alpha)}/N={N}]")
#         print(f"[M={M}] vs [n*alpha={int(n*alpha)}]")
#         alphas = np.ones(M) * alpha
        
#         for seed in range(B):
#             if (N == 2000) and seed < int(B/2):
#                 continue
#             RMSE_list = [seed, n, alpha, M, K]
#             NAME_list = ["seed", "n", "alpha", "M", "K"]
#             np.random.seed(seed)
#             X, Y, X1, X2, Y1, Y2, A1, AY1, pilot_ids, rest_ids = generate_data(K, p, N, n, M, alpha, beta, sigma, seed=seed)
        
#             # Initial Estimator
#             print(f"\n########## Initial Estimator ##########")
#             init_model = Initial(X1, AY1, A1, K)
#             init_beta, init_sigma, init_betams = init_model.init_param()
#             init_beta_rmse = compute_rmse(init_beta, beta[1:])
#             init_sigma_rmse = compute_rmse(init_sigma, sigma)
#             RMSE_list += [init_beta_rmse, init_sigma_rmse]
#             NAME_list += ["init_beta", "init_sigma"]
            
#             # OS (One-Step) Estimator 
#             print(f"\n########## OS Estimator ##########")
#             os_model = OS(X1, AY1, A1, K, alphas, init_beta, init_sigma)
#             os_beta, os_sigma = os_model.update_alg(max_steps=1, true_beta=beta[1:].ravel())
#             os_beta_rmse = compute_rmse(os_beta, beta[1:])
#             os_sigma_rmse = compute_rmse(os_sigma, sigma)
#             RMSE_list += [os_beta_rmse, os_sigma_rmse]
#             NAME_list += ["os_beta", "os_sigma"]
        
#             # TS (Two-Step) Estimator 
#             print(f"\n########## TS Estimator ##########")
#             ts_model = MS(X1, AY1, A1, K, alphas, init_beta, init_sigma)
#             ts_beta, ts_sigma = ts_model.update_alg(max_steps=2, true_beta=beta[1:].ravel())
#             ts_beta_rmse = compute_rmse(ts_beta, beta[1:])
#             ts_sigma_rmse = compute_rmse(ts_sigma, sigma)
#             RMSE_list += [ts_beta_rmse, ts_sigma_rmse]
#             NAME_list += ["ts_beta", "ts_sigma"]
        
#             # MS (Multiple-Step) Estimator 
#             print(f"\n########## MS Estimator ##########")
#             ms_model = MS(X1, AY1, A1, K, alphas, init_beta, init_sigma)
#             ms_beta, ms_sigma = ms_model.update_alg(max_steps=3, true_beta=beta[1:].ravel())
#             ms_beta_rmse = compute_rmse(ms_beta, beta[1:])
#             ms_sigma_rmse = compute_rmse(ms_sigma, sigma)
#             RMSE_list += [ms_beta_rmse, ms_sigma_rmse]
#             NAME_list += ["ms_beta", "ms_sigma"]
        
#             # Oracle_beta
#             print(f"\n########## ORACLE_beta Estimator ##########")
#             oracle_model = ORACLE_beta(X1, AY1, A1, K, init_beta, sigma)
#             oracle_beta = oracle_model.update_alg(max_steps=10, tol=1e-5, true_beta=beta[1:])
#             oracle_beta_rmse = compute_rmse(oracle_beta, beta[1:])
            
#             # Oracle_sigma
#             print(f"\n########## ORACLE_sigma Estimator ##########")
#             oracle_model = ORACLE_sigma(X1, AY1, A1, K, beta[1:], init_sigma)
#             oracle_sigma = oracle_model.update_alg(max_steps=10, tol=1e-5, true_sigma=sigma)
#             oracle_sigma_rmse = compute_rmse(oracle_sigma, sigma)
#             RMSE_list += [oracle_beta_rmse, oracle_sigma_rmse]
#             NAME_list += ["oracle_beta", "oracle_sigma"]
        
#             # Print Results
#             print()
#             print(f"Init beta: {init_beta_rmse:.7f}")
#             print(f"Ora  beta: {oracle_beta_rmse:.7f}")
#             print(f"OS   beta: {os_beta_rmse:.7f}")
#             print(f"TS   beta: {ts_beta_rmse:.7f}")
#             print(f"MS   beta: {ms_beta_rmse:.7f}")
#             print()
#             print(f"Init sigma: {init_sigma_rmse:.7f}")
#             print(f"Ora  sigma: {oracle_sigma_rmse:.7f}")
#             print(f"OS   sigma: {os_sigma_rmse:.7f}")
#             print(f"TS   sigma: {ts_sigma_rmse:.7f}")
#             print(f"MS   sigma: {ms_sigma_rmse:.7f}")
        
#             # Record Results
#             RMSE_results.append(RMSE_list)
#             a = pd.DataFrame(RMSE_results, columns=NAME_list)
#             a.to_csv(f"./results/[K={K}]rmse_data.csv")
            
#             #########################################################################
#             Avar = ts_model.compute_Avar(ts_model.beta, ts_sigma)
#             beta_hat = np.zeros((K+1, p))
#             beta_hat[1:] = ts_beta.reshape(K, p)
#             sigma_hat = copy(ts_sigma)

#             # Predicted Label and MaxMis
#             Y_hat = np.argmax(X.dot(np.transpose(beta_hat)), axis=1)  # predicted label
#             MaxMis = [compute_MaxMis_i(X[i], beta_hat, Avar, n, M, alpha, K, p) for i in range(N)]
#             MaxMis = np.array(MaxMis)
#             dat = pd.DataFrame({"Y": Y, "Y_hat": Y_hat, "MaxMis": MaxMis,})
#             dat = dat.sort_values(by="MaxMis", ascending=False)
#             dat = dat.reset_index(drop=True)
#             dat["MaxMis_p"] = Phi(dat["MaxMis"])
#             dat["rank"] = np.arange(N)
#             dat["mislabeled"] = (dat["Y"] != dat["Y_hat"]).astype(int)
            
#             # Overall Mislabeling Rate (OMR)
#             OMR = dat["mislabeled"].sum() / N
#             result = [seed, N, n, alpha, M, K, OMR]
#             for w in [0.1, 0.05, 0.01]:
#                 machine = dat[dat["MaxMis_p"] <= w]
#                 human = dat[dat["MaxMis_p"] > w]
#                 CMR = machine.mislabeled.sum() / machine.shape[0]
#                 ELC = human.shape[0] / N
#                 result += [CMR, ELC]

#             results.append(result)
#             a = pd.DataFrame(results, 
#                              columns=["seed", "N", "n", "alpha", "M", "K", "OMR", "CMR(0.1)",
#                                       "ELC(0.1)", "CMR(0.05)", "ELC(0.05)", "CMR(0.01)", "ELC(0.01)", 
#                                      ])
#             a.to_csv(f"./results/[K={K}]MAC_data.csv")

In [74]:
def task(N, alpha, seed):
    n = int(N * r)                 # pilot sample size
    M = int(0.5 * n / np.log(n))   # the size of the annotator pool
    # annotator sigma
    sigma = np.ones(M) * 2
    sigma[0:int(M/2)] = 0.5
    alphas = np.ones(M) * alpha
    # parameter vector
    theta = np.zeros(K * p + M)
    theta[:(p*K)] = beta[1:].ravel()
    theta[(p*K):] = sigma.reshape(-1)

    np.random.seed(seed)
    X, Y, X1, X2, Y1, Y2, A1, AY1, pilot_ids, rest_ids = generate_data(K, p, N, n, M, alpha, beta, sigma, seed=seed)

    # Initial Estimator
    print(f"\n########## Initial Estimator ##########")
    init_model = Initial(X1, AY1, A1, K)
    init_beta, init_sigma, init_betams = init_model.init_param()
    init_beta_rmse = compute_rmse(init_beta, beta[1:])
    init_sigma_rmse = compute_rmse(init_sigma, sigma)

    # OS (One-Step) Estimator 
    print(f"\n########## OS Estimator ##########")
    os_model = OS(X1, AY1, A1, K, alphas, init_beta, init_sigma)
    os_beta, os_sigma = os_model.update_alg(max_steps=1)
    os_beta_rmse = compute_rmse(os_beta, beta[1:])
    os_sigma_rmse = compute_rmse(os_sigma, sigma)

    # TS (Two-Step) Estimator 
    print(f"\n########## TS Estimator ##########")
    ts_model = MS(X1, AY1, A1, K, alphas, init_beta, init_sigma)
    ts_beta, ts_sigma = ts_model.update_alg(max_steps=2)
    ts_beta_rmse = compute_rmse(ts_beta, beta[1:])
    ts_sigma_rmse = compute_rmse(ts_sigma, sigma)

    # MS (Multiple-Step) Estimator 
    print(f"\n########## MS Estimator ##########")
    ms_model = MS(X1, AY1, A1, K, alphas, init_beta, init_sigma)
    ms_beta, ms_sigma = ms_model.update_alg(max_steps=3)
    ms_beta_rmse = compute_rmse(ms_beta, beta[1:])
    ms_sigma_rmse = compute_rmse(ms_sigma, sigma)

    # Oracle_beta
    print(f"\n########## ORACLE_beta Estimator ##########")
    oracle_model = ORACLE_beta(X1, AY1, A1, K, init_beta, sigma)
    oracle_beta = oracle_model.update_alg(max_steps=10, tol=1e-5)
    oracle_beta_rmse = compute_rmse(oracle_beta, beta[1:])

    # Oracle_sigma
    print(f"\n########## ORACLE_sigma Estimator ##########")
    oracle_model = ORACLE_sigma(X1, AY1, A1, K, beta[1:], init_sigma)
    oracle_sigma = oracle_model.update_alg(max_steps=10, tol=1e-5)
    oracle_sigma_rmse = compute_rmse(oracle_sigma, sigma)
    
    RMSE_list = [seed, N, n, alpha, M, K, 
                 init_beta_rmse, init_sigma_rmse, 
                 os_beta_rmse, os_sigma_rmse,
                 ts_beta_rmse, ts_sigma_rmse,
                 ms_beta_rmse, ms_sigma_rmse,
                 oracle_beta_rmse, oracle_sigma_rmse]

    #########################################################################
    Avar = ts_model.compute_Avar(ts_model.beta, ts_sigma)
    beta_hat = np.zeros((K+1, p))
    beta_hat[1:] = ts_beta.reshape(K, p)
    sigma_hat = copy(ts_sigma)

    # Predicted Label and MaxMis
    Y_hat = np.argmax(X.dot(np.transpose(beta_hat)), axis=1)  # predicted label
    MaxMis = [compute_MaxMis_i(X[i], beta_hat, Avar, n, M, alpha, K, p) for i in range(N)]
    MaxMis = np.array(MaxMis)
    dat = pd.DataFrame({"Y": Y, "Y_hat": Y_hat, "MaxMis": MaxMis,})
    dat = dat.sort_values(by="MaxMis", ascending=False)
    dat = dat.reset_index(drop=True)
    dat["MaxMis_p"] = Phi(dat["MaxMis"])
    dat["rank"] = np.arange(N)
    dat["mislabeled"] = (dat["Y"] != dat["Y_hat"]).astype(int)

    # Overall Mislabeling Rate (OMR)
    OMR = dat["mislabeled"].sum() / N
    result = [seed, N, n, alpha, M, K, OMR]
    for w in [0.1, 0.05, 0.01]:
        machine = dat[dat["MaxMis_p"] <= w]
        human = dat[dat["MaxMis_p"] > w]
        CMR = machine.mislabeled.sum() / machine.shape[0]
        ELC = human.shape[0] / N
        result += [CMR, ELC]

    return RMSE_list, result

In [76]:
results = []
RMSE_list = []
MAC_list = []

In [None]:
print('Parent process %s.' % os.getpid())
pool = Pool(12)
for N in N_list:
    for alpha in alpha_list:
        for seed in range(B):
            print()
            print(f"===========")
            print(N, alpha, seed)
            print(f"===========")
            print()
            tmp = pool.apply_async(task, args=(N, alpha, seed,))
            results.append(tmp)

pool.close()
pool.join()

Parent process 30656.
True LabelsTrue LabelsTrue LabelsTrue LabelsTrue Labels True Labels True LabelsTrue Labels True Labels2    771
1    699
0    530
dtype: int64True Labels True Labels2    742
1    735
0    523
dtype: int642    769
1    709
0    522
dtype: int64   True Labels  2    741
1    700
0    559
dtype: int64 2    765
1    679
0    556
dtype: int642    778
1    686
0    536
dtype: int64   
1    742
2    729
0    529
dtype: int64  2    746
1    730
0    524
dtype: int64

2    760
1    694
0    546
dtype: int642    765
1    719
0    516
dtype: int64
 2    743
1    724
0    533
dtype: int64   

 
2    781
1    724
0    495
dtype: int64 

 


  












2000 0.25 0


2000 0.25 1


2000 0.25 2


2000 0.25 3


2000 0.25 4


2000 0.25 5


2000 0.25 6


2000 0.25 7


2000 0.25 8


2000 0.25 9


2000 0.25 10


2000 0.25 11


2000 0.25 12


2000 0.25 13


2000 0.25 14


2000 0.25 15


2000 0.25 16


2000 0.25 17


2000 0.25 18


2000 0.25 19


2000 0.25 20


2000 0.25 21


2000 0.25

In [None]:
for i in range(len(results)):
    a, b = results[i].get()
    RMSE_list.append(a)
    MAC_list.append(b)

In [None]:
RMSE_DF = pd.DataFrame(RMSE_list, columns=["seed", "N", "n", "alpha", "M", "K",
                                           "init_beta", "init_sigma", "os_beta", "os_sigma",
                                           "ts_beta", "ts_sigma", "ms_beta", "ms_sigma",
                                           "oracle_beta", "oracle_sigma"])
RMSE_DF.to_csv(f"./results/[K={K}]RMSE_data.csv")

In [None]:
MAC_DF = pd.DataFrame(MAC_list, columns=["seed", "N", "n", "alpha", "M", "K", "OMR",
                                         "CMR(0.1)", "CMR(0.05)", "CMR(0.01)",
                                         "ELC(0.1)", "ELC(0.05)", "ELC(0.01)",
                                        ])
MAC_DF.to_csv(f"./results/[K={K}]MAC_DF.csv")

In [82]:
RMSE_DF.shape

(12000, 16)

In [84]:
MAC_DF.shape

(12000, 13)