In [None]:
%reload_ext autoreload
%autoreload 2

from sarpu.labeling_mechanisms import label_data
from sarpu.paths_and_names import *
from sarpu.experiments import *

import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display


In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration: every value a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Input / output locations, relative to the notebook.
data_folder = "../Data/"
results_folder = "../Results/"

# Dataset name and labeling-mechanism description.
data_name = "mushroom_extclustering"
# Column indices into the feature matrix (used as x[:, at] when plotting).
propensity_attributes = [111, 112, 113, 114]
# One sign per propensity attribute; forwarded to label_data.
propensity_attributes_signs = [1, 1, 1, 1]
settings = "lr._.lr._.0-111"
labeling_model_type = "simple_0.2_0.8"

# Which labeling and which partition the experiment cells operate on.
labeling = 0
partition = 1

nb_assignments = 5
nb_labelings = 5

# Forwarded as relabel_data= to label_data and rerun_experiments= to
# train_and_evaluate respectively.
relabel_data = False
rerun_experiments = False

# PU-learning methods that are trained, evaluated and compared below.
pu_methods = [
    "supervised",
    "negative",
    "sar-e",
    "scar-c",
    "sar-em",
    "scar-km2",
    "scar-tice",
]


# Generate PU data with labeling mechanism

In [None]:
# Build the labeling model for the configured dataset. The returned object is
# reused by the cells below both to run the experiments and to resolve the
# paths of the labeling / propensity files.
# NOTE(review): presumably relabel_data=False reuses labelings already on
# disk instead of regenerating them -- confirm against sarpu.labeling_mechanisms.
labeling_model = label_data(
    data_folder, 
    data_name, 
    labeling_model_type, 
    propensity_attributes, 
    propensity_attributes_signs, 
    nb_assignments,
    relabel_data=relabel_data
)

# Train and Evaluate models

In [None]:
# Run every configured PU method on the same (labeling, partition, settings)
# combination so that their results are directly comparable.
# NOTE(review): presumably rerun_experiments=False skips methods whose results
# already exist in results_folder -- confirm against sarpu.experiments.
for pu_method in pu_methods:
    train_and_evaluate(
        data_folder,
        results_folder, 
        data_name, 
        labeling_model, 
        labeling, 
        partition, 
        settings, 
        pu_method, 
        rerun_experiments=rerun_experiments
    )
    
# Result folder for this (dataset, labeling model, labeling, partition,
# settings) combination.
experiment_path = experiment_result_folder_path(
    results_folder, 
    data_name, 
    labeling_model, 
    labeling, 
    partition, 
    settings
)

# IPython shell escape: list the contents of the result folder as a quick
# sanity check. Requires a shell that provides `ls`.
!ls $experiment_path/*


# Evaluate setting

## Labeling Mechanism Properties

### Load dataset

In [None]:
# Resolve the on-disk locations of the features (x), class labels (y),
# the selected labeling (s) and the propensity scores (e), then load them.
x_path = data_path(data_folder, data_name)
y_path = classlabels_path(data_folder, data_name)
s_path = propensity_labeling_path(data_folder, data_name, labeling_model, labeling)
e_path = propensity_scores_path(data_folder, data_name, labeling_model)
x, y, s, e = read_data((x_path, y_path, s_path, e_path))

# Classifier stored for the "supervised" method of this experiment.
model_path = experiment_classifier_path(
    results_folder, data_name, labeling_model, labeling, partition, settings, "supervised"
)

# Open the pickle inside a context manager so the file handle is closed as
# soon as the model is loaded (the bare open(...) previously leaked it).
# SECURITY: unpickling executes arbitrary code; only load model files that
# were produced by this project's own experiment runs.
with open(model_path, "rb") as model_file:
    supervised_classifier = pickle.load(model_file)

y_pred = supervised_classifier.predict_proba(x)

In [None]:
def line(x, y, nb_vals=10):
    """Average ``y`` at a small number of positions along ``x``.

    When ``x`` has at most ``nb_vals`` distinct values, every distinct value
    is used as a position and ``y`` is averaged over the exact matches.
    Otherwise the range ``[min(x), max(x)]`` is divided into ``nb_vals``
    equal-width bins and ``y`` is averaged over all samples lying within half
    a bin width of each bin centre. The comparison is inclusive on both
    sides, so a sample sitting exactly on a shared boundary counts towards
    both neighbouring bins.

    Parameters
    ----------
    x, y : array-like
        Assumed to be equal-length 1-D numpy arrays (boolean-mask indexing
        and ``.mean()`` are applied to ``y``) -- TODO confirm with callers.
    nb_vals : int, default 10
        Maximum number of distinct ``x`` values before binning kicks in, and
        the number of bins used when it does.

    Returns
    -------
    tuple
        ``(positions, averages)``: ``positions`` is a sorted list of distinct
        values (unbinned case) or an array of bin centres (binned case);
        ``averages`` is a list with the mean of ``y`` at each position.
    """
    positions = sorted(set(x))
    half_width = 0
    if len(positions) > nb_vals:
        lowest, highest = min(x), max(x)
        half_width = (highest - lowest) / nb_vals / 2
        # Bin centres: left edge of each bin shifted right by half a bin.
        positions = np.arange(nb_vals) / nb_vals * (highest - lowest) + lowest + half_width
    averages = [y[abs(x - position) <= half_width].mean() for position in positions]
    return positions, averages


def plot_scatter_and_line(x, y, c=None):
    """Scatter ``y`` against ``x`` and overlay both conditional-mean curves.

    Draws on the current matplotlib figure:

    * a scatter plot of the raw points, coloured by ``c``;
    * the mean of ``y`` at positions along ``x`` (as computed by ``line``);
    * the mean of ``x`` at positions along ``y``, drawn with the axes swapped
      back so that it lives in the same coordinate system.
    """
    plt.scatter(x, y, c=c)

    # Mean of y as a function of x.
    plt.plot(*line(x, y))

    # Mean of x as a function of y: line() returns (y positions, x means),
    # so the means go on the horizontal axis.
    y_positions, x_means = line(y, x)
    plt.plot(x_means, y_positions)
    
    

# Propensity score against the supervised model's predicted class probability.
plt.figure()
plt.title("Class probability/Propensity Correlation")
plot_scatter_and_line(y_pred, e, y_pred)
plt.xlabel(r"$\Pr(y=1)$")
plt.ylabel(r"$e$")

# For every propensity attribute draw three figures, in this order:
# attribute vs. propensity, attribute vs. predicted probability,
# attribute vs. true class label.
for at in propensity_attributes:
    attribute_column = x[:, at]
    attribute_axis_label = "attribute "+str(at)
    # (vertical-axis data, point colours, figure title, vertical-axis label)
    figure_specs = [
        (e, y_pred, "Attribute "+str(at)+"/Propensity Correlation", r"$e$"),
        (y_pred, y, "Attribute "+str(at)+"/Class Probability", r"$\Pr(y=1)$"),
        (y, y, "Attribute "+str(at)+"/Class Probability", r"$y$"),
    ]
    for vertical_data, point_colours, figure_title, vertical_label in figure_specs:
        plt.figure()
        plot_scatter_and_line(attribute_column, vertical_data, point_colours)
        plt.title(figure_title)
        plt.xlabel(attribute_axis_label)
        plt.ylabel(vertical_label)


## Gather results

In [None]:
# Collect one single-row frame per PU method and stack them, so that the
# final `df` has one row per method and one column per measure.
frames = []
for pu_method in pu_methods:
    results_path = experiment_results_path(
        results_folder, data_name, labeling_model, labeling, partition, settings, pu_method
    )
    # The results file is tab-separated without a header row; its first
    # column becomes the index and the transpose turns it into column names.
    method_results = pd.read_csv(results_path, index_col=0, header=None, sep="\t").T
    method_results.index = [pu_method]
    frames.append(method_results)

df = pd.concat(frames)
df

## Comparison Classifier

In [None]:
# Columns of `df` (one row per PU method) shown in this section.
measures_to_plot = [
    'test_f_roc_auc',
    'test_f_average_precision',
    'test_f_mse',
    'test_f_mae',
]

display(df[measures_to_plot])

# One bar chart per measure; every bar is annotated with its value,
# formatted to three decimals.
for measure_to_plot in measures_to_plot:
    plt.figure()
    measure_values = df[measure_to_plot]
    ax = measure_values.plot(kind="bar")
    plt.title(measure_to_plot)
    for bar_position, bar_value in enumerate(measure_values):
        ax.text(bar_position, bar_value, '{:.3f}'.format(bar_value))
    
    

## Comparison Propensity Scores

In [None]:
# Columns of `df` (one row per PU method) shown in this section.
measures_to_plot = [
    'test_e_mse',
    'test_e_mae',
    'train_e_prior_abs_err',
]

display(df[measures_to_plot])

# One bar chart per measure; every bar is annotated with its value,
# formatted to three decimals.
for measure_to_plot in measures_to_plot:
    plt.figure()
    measure_values = df[measure_to_plot]
    ax = measure_values.plot(kind="bar")
    plt.title(measure_to_plot)
    for bar_position, bar_value in enumerate(measure_values):
        ax.text(bar_position, bar_value, '{:.3f}'.format(bar_value))
    