# Probabilistic label spreading

Here, we set up the experiment, i.e. we specify for which hyperparameters the prob. label spreading should be conducted and run the experiment.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import json
from tqdm import tqdm

from scripts.plot import *
from scripts.baseline_prob_label_spreading import *
from scripts.probabilistic_label_spreading_print_runtime import prob_label_spreading

plot_params = set_plot_layout(path_to_latex = '/home/jklees/texlive/bin/x86_64-linux') # set plot layout (optional)C

In [2]:
# fix parameters
k = 20
alpha = 0.9
n_samples = 1000 # we provide 1000 feedbacks i.e., the linear system needs to be solved 1000 times

# Two Moons: 1000 data points

In [3]:
dataset_name = "TwoMoons" # "EMNIST", "TinyImageNet"
path_to_dataset = "data/prob_data/"+ dataset_name + "/" + dataset_name + ".pkl"

df = pd.read_pickle(path_to_dataset)

classes = list(set(df["label"]))

len(df)

1000

In [4]:
n_data = "all"
data_space = "feature"
prob_label_column = "prob_label" #  

results, processed_data, time_spread = prob_label_spreading(dataset_name, df, data_space, prob_label_column, n_data, k, alpha, n_samples)
print(f"spreading time: {time_spread:.2f} s")

spreading time: 0.82 s


# CIFAR-10-H 10,000 Data points

In [16]:
dataset_name = "CIFAR10-H" 
path_to_dataset = "data/prob_data/"+ dataset_name + "/" + dataset_name + ".pkl"

df = pd.read_pickle(path_to_dataset)

classes = list(set(df["label"]))

len(df)

10000

In [18]:
n_data = "all"
data_space = "CLIP_UMAP_20"
prob_label_column = "prob_label" #  

results, processed_data, time_spread = prob_label_spreading(dataset_name, df, data_space, prob_label_column, n_data, k, alpha, n_samples)
print(f"spreading time: {time_spread:.2f} s")

spreading time: 2.86 s


# Tiny ImageNet 100,000 Data points

In [7]:
dataset_name = "TinyImageNet" 
path_to_dataset = "data/prob_data/"+ dataset_name + "/" + dataset_name + ".pkl"

df = pd.read_pickle(path_to_dataset)

classes = list(set(df["label"]))

len(df)

100000

In [8]:
n_data = "all"
data_space = "CLIP_UMAP_20"
prob_label_column = "prob_label_effnetb0" #  

results, processed_data, time_spread = prob_label_spreading(dataset_name, df, data_space, prob_label_column, n_data, k, alpha, n_samples)
print(f"spreading time: {time_spread:.2f} s")

spreading time: 11.82 s


# EEMNIST (Augmented EMNIST-digits) 1,000,000 Data points

In [10]:
dataset_name = "EEMNIST" 
# path_to_dataset = "data/prob_data/"+ dataset_name + "/" + dataset_name + ".pkl"
path_to_dataset = "/home/jklees/backup_prob_data/"+ dataset_name + "/" + dataset_name + ".pkl"


df = pd.read_pickle(path_to_dataset)

classes = list(set(df["label"]))

len(df)

1000000

In [14]:
# keep only the relevant parts of the data for this experiment to reduce memory consumption
data_spaces = [data_space]
prob_label_columns = [prob_label_column]

if all(col in df.columns for col in data_spaces):
    columns = data_spaces + ["label"] + prob_label_columns
    dff = df[columns] 

In [15]:
n_data = "all"
data_space = "CLIP_UMAP_20"
prob_label_column = "prob_label_effnetb0" #  

results, processed_data, time_spread = prob_label_spreading(dataset_name, dff, data_space, prob_label_column, n_data, k, alpha, n_samples)
print(f"spreading time: {time_spread:.2f} s")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = [x.argmax() for x in spreaded_info]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['trials'] = trials
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["p_hat"] = [ (spreaded_info[i] + ((1e-4)/len(classes))) /(trials[i] + 1e-4) for i in range(len(trials))]


spreading time: 260.26 s
