# biased_sgd: experiments reproduction

Tested on MacOS and Linux

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import time
import sys
import os
import argparse
from numpy.linalg import norm
import itertools
import pandas as pd
from matplotlib import pyplot as plt
import math
import datetime
from IPython import display

from contextlib import redirect_stdout
import shutil
import subprocess
import matplotlib.ticker as tck

import matplotlib.colors as mcolors
import matplotlib.markers as mmarkers

def intrepr(x):
    """Return *x* as an ``int`` when it is integer-valued, otherwise rounded to 8 decimals.

    *x* must provide ``is_integer()`` (e.g. a ``float``).
    """
    if x.is_integer():
        return int(x)
    return round(x, 8)


def myrepr(x):
    """Return ``repr(x)``; floats are rounded to 8 decimals and use ',' instead of '.'.

    The comma form is what the experiment/log names built later in this
    notebook embed (e.g. ``0.01`` becomes ``'0,01'``).
    """
    if isinstance(x, float):
        return repr(round(x, 8)).replace('.', ',')
    return repr(x)

## Datasets preprocessing

The next cell creates a bash script that runs the dataset preprocessing required before launching the experiments.

In [None]:
%%writefile preprocess_datasets.sh
#!/bin/bash
# Runs b_sgd_data_preprocessing.py once for every dataset listed in `datasets`,
# passing the same loss function and `--la` value to each run.

# Datasets to preprocess (one invocation of the Python script per entry).
datasets=("splice" "a9a" "w8a")
# Value forwarded as --loss_func.
loss_func="log-reg"
# Value forwarded as --la.
la="1"

for dataset in "${datasets[@]}"; do
    echo "Running script for dataset: ${dataset} with --la: ${la}"
    python3 b_sgd_data_preprocessing.py --dataset "${dataset}" --loss_func "${loss_func}" --la "${la}"
done



In [None]:
!cat preprocess_datasets.sh

In [None]:
!bash preprocess_datasets.sh

## Run experiments

The next cell creates a bash script that launches the experiments.

In [None]:
%%writefile experiments_run.sh
#!/bin/bash
# Launches sgd-ind.py sequentially for every combination of
# dataset x la value x prb value defined below.

# Grid of values; each combination produces one sgd-ind.py run.
datasets=("splice" "a9a" "w8a")
la_values=("1.0")
prb_values=("0.01" "0.1" "0.5")


# Settings shared by all runs; each is forwarded to sgd-ind.py under the flag of the same name.
setting_type="biased"
prb_type="uniform"
factor=1
importance_normed_probs=0

# Forwarded as --max_comms, --max_epochs and --tol
# (presumably stopping criteria of sgd-ind.py -- confirm in that script).
max_comms=5000
max_epochs=5000
tol=1e-10

# Report how many runs the nested loops below will start.
datasets_length=${#datasets[@]}
la_length=${#la_values[@]}
prb_length=${#prb_values[@]}
product_length=$((datasets_length * la_length * prb_length))
echo "Number of scripts to launch: $product_length"

for dataset in "${datasets[@]}"; do
  for la in "${la_values[@]}"; do
    for prb in "${prb_values[@]}"; do
      python3 sgd-ind.py --factor $factor --tol $tol --dataset $dataset --prb $prb --prb_type $prb_type --setting_type $setting_type --max_epochs $max_epochs --max_comms $max_comms --la "${la}" --importance_normed_probs $importance_normed_probs
    done
  done
done


In [None]:
!cat experiments_run.sh

In [None]:
!bash experiments_run.sh

## Draw plots

The next cell draws the plots and saves the output as a vector image in the folder "plot_all-datasets".

In [None]:
def iter_to_bits(num_iters, k_ar,  size_value):
    """Return ``(num_iters, bits)`` with ``bits = int(size_value * num_iters * max(k_ar))``.

    The product is truncated towards zero by ``int``.
    """
    largest_k = np.max(k_ar)
    total_bits = int(size_value * num_iters * largest_k)
    return (num_iters, total_bits)

def bits_to_iter(num_bits, k_ar,  size_value):
    """Return ``(iters, num_bits)`` with ``iters = int(num_bits / (size_value * min(k_ar)))``.

    The quotient is truncated towards zero by ``int``.
    """
    smallest_k = np.min(k_ar)
    per_iter = size_value * smallest_k
    return (int(num_bits / per_iter), num_bits)

def fix_shape(np_array):
    """Flatten a ``(1, n)`` array to 1-D; pass arrays of any other rank through.

    Raises:
        ValueError: if the array is 2-D with more than one row (or zero rows).
    """
    if len(np_array.shape) != 2:
        # Only 2-D inputs are touched.
        return np_array
    if np_array.shape[0] != 1:
        raise ValueError("wrong shape")
    return np_array.flatten()
def load_np_array (path_to_file, pickle=True):
    """Load an array with ``np.load``; on ``IOError`` print a notice and return ``np.array([-1])``.

    Args:
        path_to_file: path handed to ``np.load``.
        pickle: forwarded as ``allow_pickle``.
            NOTE(review): ``allow_pickle=True`` can execute code stored in the
            file -- only use it on logs you produced yourself.
    """
    try:
        return np.load(path_to_file, allow_pickle=pickle)
    except IOError:
        print (path_to_file+": its failed to be loaded: IOError")
    # Sentinel marking a failed load.
    return np.array([-1])
def get_plot_path(save_separately, project_path, dataset):
    """Return the output folder for plots, appended to *project_path*.

    A truthy *save_separately* selects a per-dataset folder; otherwise the
    shared folder for all datasets is used.
    """
    folder = "plot_{0}/".format(dataset) if save_separately else "plot_all-datasets/"
    return project_path + folder
def get_ub_x(cut_axis, x_axis, s, else_value):
    """Return the x-axis upper bound: a table lookup when *cut_axis* is truthy, else *else_value*.

    Marked by the authors as "do not use; not ready". The table only knows
    the ``'iteration_bits_od'`` axis; any other *x_axis* or *s* raises ``KeyError``.
    """
    #do not use; not ready
    if not cut_axis:
        return else_value
    bounds_by_axis = {
        'iteration_bits_od': {
            0.0: 100_000,
            0.2: 100_000,
            0.8: 100_000,
            1.6: 100_000,
            6.4: 100_000,
            12.8: 100_000,
            25.6: 100_000,
        },
    }
    return bounds_by_axis[x_axis][s]

def load_all_logs (experiment_ar, dataset, project_path, ub_x, x_axis, y_axis):
    """Load the x/y log arrays of every experiment and keep the part with x below *ub_x*.

    For each experiment name the files
    ``<project_path>logs/logs_<dataset>_<experiment>/<x_axis>_<experiment>.npy`` and
    ``.../<y_axis>_<experiment>.npy`` are read. When the x file does not exist,
    both series are replaced by the sentinel ``np.array([-1])`` and a notice is printed.

    Reads the module-level globals ``print_each_exp`` and ``label_ar``.

    Returns:
        ``(its_ar, grad_norms_ar, its_last_value, grad_norms_last_value, is_ind_uploaded)``:
        two lists of arrays, the last element of each kept series, and a 0/1
        array telling which experiments were found on disk.

    NOTE(review): an experiment whose x log has no value below *ub_x* yields an
    empty series, and taking its last element raises ``IndexError``.
    """
    n_experiments = len(experiment_ar)
    its_ar, grad_norms_ar = [], []
    its_last_value = np.zeros(n_experiments)
    grad_norms_last_value = np.zeros(n_experiments)
    is_ind_uploaded = np.zeros(n_experiments, dtype=int)

    for idx, experiment in enumerate(experiment_ar):
        logs_path = project_path + "logs/logs_{0}_{1}/".format(dataset, experiment)
        logs_file_its = logs_path + x_axis + "_" + experiment + ".npy"
        logs_file_norms = logs_path + y_axis + '_' + experiment+'.npy'

        if not os.path.isfile(logs_file_its):
            # Missing log: store sentinels so positions stay aligned with experiment_ar.
            is_ind_uploaded[idx] = 0
            its_ar.append(np.array([-1]))
            grad_norms_ar.append(np.array([-1]))
            print (logs_file_its + " is not computed")
        else:
            is_ind_uploaded[idx] = 1
            its = fix_shape(load_np_array(logs_file_its))
            # Number of x values strictly below the bound; both series keep that many leading entries.
            number_its = len(its[its < ub_x])
            its_ar.append(its[:number_its])
            norms = fix_shape(load_np_array(logs_file_norms))
            grad_norms_ar.append(norms[:number_its])

        last_its, last_norms = its_ar[-1], grad_norms_ar[-1]
        grad_norms_last_value[idx] = last_norms[-1]
        its_last_value[idx] = last_its[-1]
        if print_each_exp:
            print ("%d: %34s iter: %8d; %6s: %9.2e; %6s: %8d; grad_norms_shape: %8d; shapes_equality: %1d"%(idx, label_ar[idx], last_its.shape[0], y_axis, last_norms[-1], x_axis, last_its[-1], last_norms.shape[0], last_norms.shape[0]==last_its.shape[0]) )
    return its_ar, grad_norms_ar, its_last_value, grad_norms_last_value, is_ind_uploaded

def get_min_params (grad_norms_last_value, its_last_value, is_ind_uploaded, print_min, dict_type_output, label_ar, its_ar, grad_norms_ar, tol, x_axis, y_axis, minimize_over=None):
    """Print the best experiment among those that were loaded and ended with a finite value <= *tol*.

    Nothing happens unless *print_min* is truthy. An experiment is excluded when
    its log was not loaded (``is_ind_uploaded == 0``) or its final y value is
    NaN, infinite, or above *tol*. Among the rest, the one with the smallest
    final y value (``minimize_over == "grad_norms"``) or the smallest final
    x value (``minimize_over == "its"``) is reported.

    Args:
        grad_norms_last_value: final y value per experiment.
        its_last_value: final x value per experiment.
        is_ind_uploaded: 0/1 flag per experiment (0 = log not loaded).
        print_min: master switch; falsy means "do nothing".
        dict_type_output: truthy prints only the label, falsy prints a summary line.
        label_ar: label per experiment.
        its_ar, grad_norms_ar: full series per experiment (used in the summary line).
        tol: largest accepted final y value.
        x_axis, y_axis: axis names shown in the summary line.
        minimize_over: ``"grad_norms"`` or ``"its"``. When ``None`` the
            module-level variable ``minimize_over`` is used, which keeps
            existing call sites working unchanged.

    Raises:
        ValueError: if the selection criterion is neither ``"grad_norms"`` nor ``"its"``.
    """
    if not print_min:
        return

    if minimize_over is None:
        # Backward compatibility: the original implementation read this global.
        minimize_over = globals()["minimize_over"]
    if minimize_over not in ("grad_norms", "its"):
        raise ValueError("wrong axis name")

    final_norms = np.asarray(grad_norms_last_value, dtype=float)
    final_its = np.asarray(its_last_value, dtype=float)

    # Build the exclusion mask on the FULL array so that positions stay aligned
    # with the experiments (the previous code indexed a NaN-filtered copy, which
    # shifted the "above tol" indices whenever a NaN was present).
    with np.errstate(invalid="ignore"):
        above_tol = final_norms > tol  # NaN compares False; NaN is caught by isfinite below
    excluded = (np.asarray(is_ind_uploaded) == 0) | above_tol | ~np.isfinite(final_norms)

    if excluded.all():
        # argmin over an all-inf array would silently report experiment 0.
        print ("no experiment satisfies the selection criteria")
        return

    candidates = (final_norms if minimize_over == "grad_norms" else final_its).copy()
    candidates[excluded] = np.inf
    it_min = int(np.argmin(candidates))

    if dict_type_output:
        print (label_ar[it_min])
    else:
        print ("\n %34s iter: %8d; %6s: %9.2e  %6s: %8d \n"%(label_ar[it_min], its_ar[it_min].shape[0], y_axis, grad_norms_ar[it_min][-1], x_axis, its_last_value[it_min]))
            
def get_nan_dataframes(experiment_ar, grad_norms_last_value,its_ar,grad_norms_ar, nan_investigate, df_generate):
    """Collect the experiments whose final y value is NaN, optionally as DataFrames.

    Args:
        experiment_ar: experiment names; used as DataFrame column names.
        grad_norms_last_value: final y value per experiment.
        its_ar, grad_norms_ar: full x / y series per experiment.
        nan_investigate: falsy disables the search entirely.
        df_generate: truthy builds one column per NaN experiment.

    Returns:
        ``(df_its, df_norms, nan_inds)``. The DataFrames are empty when
        *nan_investigate* or *df_generate* is falsy, or when no experiment
        ended in NaN. ``nan_inds`` holds the positions of the NaN experiments
        (an empty array when *nan_investigate* is falsy).
    """
    if not nan_investigate:
        return pd.DataFrame(),pd.DataFrame(), np.array([])

    nan_inds = np.argwhere(np.isnan(grad_norms_last_value)).flatten()

    # pd.concat raises ValueError on an empty list, so return empty frames
    # when there is nothing to concatenate.
    if not df_generate or nan_inds.size == 0:
        return pd.DataFrame(),pd.DataFrame(),nan_inds

    df_its_list = [pd.DataFrame({experiment_ar[i]: its_ar[i]}) for i in nan_inds]
    df_grad_norms_list = [pd.DataFrame({experiment_ar[i]: grad_norms_ar[i]}) for i in nan_inds]
    df_its = pd.concat(df_its_list, ignore_index=False, axis=1)
    df_norms = pd.concat(df_grad_norms_list, ignore_index=False, axis=1)
    return df_its, df_norms, nan_inds



def non_local_methods_params(setting_type_ar, prb_type_ar, prb_ar, factor_ar, la_ar=(1.0,)):
    """Return every ``(setting_type, prb_type, prb, factor, la)`` combination as a list of tuples.

    Args:
        setting_type_ar, prb_type_ar, prb_ar, factor_ar: iterables of candidate values.
        la_ar: iterable of ``la`` values. Defaults to ``(1.0,)`` so that the
            four-argument calls in this notebook keep working and still yield
            the 5-tuples that the experiment-name builder unpacks.

    The order is that of ``itertools.product``: the last argument varies fastest.
    """
    return list(itertools.product(setting_type_ar, prb_type_ar, prb_ar, factor_ar, la_ar))

def cut_logs_freq (its_ar, grad_norms_ar, freq):
    """Keep every *freq*-th point (starting at index 0) of each x/y series pair.

    The kept indices are derived from the length of the x series and applied
    to both series of the pair. Returns two new lists of arrays.
    """
    cutted_its_ar, cutted_grad_norms_ar = [], []
    for idx, its in enumerate(its_ar):
        keep = np.arange(start=0, stop=its.shape[0], step=freq)
        cutted_its_ar.append(its[keep])
        cutted_grad_norms_ar.append(grad_norms_ar[idx][keep])
    return cutted_its_ar, cutted_grad_norms_ar

def draw_merged_plots(fig_ax_ar, plot_path, filename, x_label, plots_titles, y_label, save, legend_location, ymin=None, ymax=None, xlim=None):
    """Decorate the already-plotted axes, show the figure and optionally save it.

    Args:
        fig_ax_ar: ``(fig, axs)`` pair; ``axs[j]`` is decorated for every ``j``
            in ``range(len(plots_titles))``.
        plot_path: output folder (expected to end with a path separator).
        filename: file name; the figure is written to ``plot_path + "_0_" + filename``.
        x_label, y_label: axis labels applied to every subplot.
        plots_titles: one title per subplot.
        save: truthy writes the figure to disk after showing it.
        legend_location: forwarded to ``Axes.legend(loc=...)``.
        ymin, ymax, xlim: accepted but currently NOT used -- no axis limits are set.
    """
    size = 40

    # Global matplotlib styling (affects everything drawn from now on).
    plt.rcParams.update({
        'lines.linewidth': 4,
        'xtick.labelsize': size,
        'ytick.labelsize': size,
        'legend.fontsize': 30,
        'axes.titlesize': size,
        'axes.labelsize': size,
        "figure.figsize": [50,12],
    })

    fig, axs = fig_ax_ar

    for j, title in enumerate(plots_titles):
        ax = axs[j]
        ax.set_xlabel(x_label, fontsize=size)
        ax.set_ylabel(y_label, fontsize=size)
        ax.set_title(f"{title}", fontsize=30)
        ax.set_yscale('log')

        ax.legend(loc=legend_location, framealpha=0.5)
        ax.grid()

        # At most 4 major tick intervals on x; label minor ticks on the log y axis.
        ax.locator_params(axis='x', nbins=4)
        ax.yaxis.set_minor_formatter(tck.LogFormatter())

    where = plot_path + "_0_" + filename
    plt.show()
    if save:
        print(f"saving to: {where}...")
        fig.savefig(where, bbox_inches='tight')
        print("saving is finished!")


# ---------------------------------------------------------------------------
# Plot configuration. Statement order matters here: several names are assigned
# more than once and the LAST uncommented assignment is the one in effect.
# ---------------------------------------------------------------------------

# Root folder; logs are read from and plots written to sub-folders of it.
project_path = os.getcwd() + "/"

loss_func = "log-reg"
# Per-dataset constants (presumably feature dimension and number of samples -- confirm).
# dataset_size_dict is multiplied by the sampling probability in the plot titles.
dim_dict = {"mushrooms":112, "w8a":300, "a9a":123, "realsim":20958,"splice":60}
dataset_size_dict = {"mushrooms": 8124, "w8a": 49749, "a9a": 32561, "realsim": 72309, "splice":1000}

biased_tuning_factors_dict = {"mushrooms":[65536.0, 131072.0, 262144.0, 524288.0, 1048576.0],
                              "w8a":[262144.0, 524288.0, 1048576.0],
                              "a9a":[32768.0, 65536.0, 131072.0, 262144.0, 524288.0, 1048576.0]}

# The second assignment overrides the first. Note there is no "splice" entry.
optimal_biased_factors = {"mushrooms":[131072.0*2, 131072.0*4], "w8a":[1048576.0,1048576.0*2], "a9a":[131072.0*2,131072.0*4]}
optimal_biased_factors = {"mushrooms":[131072.0*2], "w8a":[1048576.0], "a9a":[131072.0*4]} 


# Curve colours: the hand-picked list first, then every other CSS4 colour name
# (appended from a set, so the order of that tail is arbitrary).
color_ar_1 = ['blue', 'orange','red', 'aqua', 'violet','cornflowerblue', 'darkgreen', 'coral', 'lime', 'darkgreen', 'goldenrod', 'maroon','black', 'brown', 'yellowgreen',
              "purple", "violet", "magenta", "green","chocolate","crimson"]
all_colors = mcolors.CSS4_COLORS
remaining_colors = list(set(all_colors.keys()) - set(color_ar_1))
color_ar_1.extend(remaining_colors)
# Curve markers: same scheme as the colours (hand-picked first, remaining ones after).
marker_ar = ["o", "P", "v", "*", "<", ">", "s", "p", "^", "h", "H", "+", "x", "X", "D", "d", "|", "_",1,2,3,4,5,6,7,8,9]
all_markers = list(mmarkers.MarkerStyle.markers.keys())
remaining_markers = list(set(all_markers) - set(marker_ar))
marker_ar.extend(remaining_markers)

marker_size = 30
# Names of the logged series to plot; they are part of the log file names.
y_axis = 'grad_norms'
x_axis = 'comms'
#x_axis = 'epochs'

# Preset selector: the last uncommented line wins (currently "b_uniform").
preset = "unb_tuning"
#preset = "b_tuning"
preset = "biased_unbiased_comparison"
preset = "b_uniform"
#preset = "b_importance"
#preset = "uniform_vs_importance"
    
x_label = {'epochs':'Data passes','comms':'iterations' }[x_axis]
# NOTE(review): "\||" looks like it renders a double bar followed by a single bar;
# "\|" alone was probably intended -- confirm against the rendered figure.
y_label = {'grad_norms':r"$\||\nabla f(x^k)\||^2$"}[y_axis]

# One subplot is drawn per dataset, in this order.
dataset_ar = ["splice", "w8a", "a9a"]

exps = ["sgd-ind"]

#prb_ar = [0.01]
#prb = prb_ar[0]

factor_ar = np.array([1], dtype=float)

prb_ar = [0.01, 0.1, 0.5]
la_ar = [1.0]

# Used (together with preset) to build the output file name.
main_title = preset

#freq = int(1/(prb*10))
# Keep every freq-th logged point when plotting (1 = keep all).
freq = 1

# ---------------------------------------------------------------------------
# Load the logs of every dataset and draw one subplot per dataset.
#
# The switches below are module-level names on purpose: load_all_logs reads
# print_each_exp / label_ar and get_min_params reads minimize_over as globals,
# and save / save_separately are used again after the loop.
# ---------------------------------------------------------------------------
ub_x = 5_000   # only points whose x value is below this bound are kept
tol = 1e+10    # final y values above this are ignored by get_min_params

########
draw = 1
save = 1
save_separately = 0
print_each_exp = 0
cut_axis = 0
dict_type_output = 0
nan_investigate = 0
df_generate = 0
print_min = 0
minimize_over = "grad_norms"
#minimize_over = "its"
#######

fig, axs = plt.subplots(1, len(dataset_ar))
# plt.subplots returns a bare Axes (not an array) for a single subplot;
# normalise so that axs[j] works for any number of datasets.
axs = np.atleast_1d(axs)

for (j,dataset) in enumerate(dataset_ar):
    print (f"dataset: {dataset}")
    list_param_tuples_dict = {}
    exp_ar_dict = {"sgd-ind":[]}
    label_ar_dict = exp_ar_dict.copy()

    if "sgd-ind" not in exps:
        raise ValueError("Wrong experiment list: 'sgd-ind' is required")

    # Every preset yields 5-tuples (setting, prb_type, prb, factor, la),
    # which is what the name builder below unpacks.
    if preset == "unb_tuning":
        list_param_tuples_dict["sgd-ind"] = non_local_methods_params(['unbiased'],['uniform'],[0.01],[0.25,0.5,1.0,2.0,4.0,8.0 ], la_ar)
    elif preset == "b_tuning":
        factors = [32768.0, 65536.0, 131072.0, 262144.0, 524288.0, 1048576.0]
        list_param_tuples_dict["sgd-ind"] = non_local_methods_params(['biased'],['uniform'],[0.01], factors, la_ar)
    elif preset == "biased_unbiased_comparison":
        # NOTE: optimal_biased_factors has no "splice" entry, so this preset
        # raises KeyError for that dataset.
        factors = optimal_biased_factors[dataset]
        list_param_tuples_dict["sgd-ind"] = [('unbiased', 'uniform', 0.01, 1.0, la_ar[0]), ('biased', 'uniform', 0.01, 1.0, la_ar[0]), ('biased', 'uniform', 0.01, factors[0], la_ar[0])]
    elif preset == "b_uniform":
        list_param_tuples_dict["sgd-ind"] = [('biased', 'uniform', 0.01, 1.0, 1.0), ('biased', 'uniform', 0.1, 1.0, 1.0), ('biased', 'uniform', 0.5, 1.0, 1.0)]
    elif preset == "b_importance":
        list_param_tuples_dict["sgd-ind"] = [('biased', 'importance', 0.01, 1.0, 1.0), ('biased', 'importance', 0.1, 1.0, 1.0), ('biased', 'importance', 0.5, 1.0, 1.0)]
    elif preset == "uniform_vs_importance":
        # Previously a silent `pass`, which only failed later with KeyError: 'sgd-ind'.
        raise NotImplementedError("preset 'uniform_vs_importance' is not implemented")
    else:
        raise ValueError("wrong preset")

    # Experiment names double as log folder / file names; labels go to the legend.
    exp_ar_dict["sgd-ind"] = ["sgd-ind_{0}_{1}_{2}_{4}_{3}x".format(setting, prb_type, myrepr(prb), myrepr(factor), myrepr(la)) for (setting, prb_type, prb, factor, la) in list_param_tuples_dict["sgd-ind"]]
    label_ar_dict["sgd-ind"] = ["p={0}".format(prb) for (setting, prb_type, prb, factor, la) in list_param_tuples_dict["sgd-ind"]]

    exp_keys, label_keys = list(exp_ar_dict.keys()),list(label_ar_dict.keys())
    assert(set(exp_keys) == set(label_keys))
    experiment_ar,label_ar = [],[]
    for key in exp_keys:
        experiment_ar += exp_ar_dict[key]
        label_ar += label_ar_dict[key]

    its_ar, grad_norms_ar, its_last_value, grad_norms_last_value, is_ind_uploaded = load_all_logs (experiment_ar, dataset, project_path, ub_x, x_axis, y_axis)
    its_ar, grad_norms_ar = cut_logs_freq(its_ar, grad_norms_ar, freq)
    get_min_params(grad_norms_last_value, its_last_value, is_ind_uploaded, print_min, dict_type_output, label_ar, its_ar, grad_norms_ar, tol, x_axis, y_axis)
    df_its, df_norms, nan_inds = get_nan_dataframes(experiment_ar, grad_norms_last_value,its_ar,grad_norms_ar, nan_investigate, df_generate)
    non_nan_inds = np.setdiff1d (np.arange(len(experiment_ar)), nan_inds)
    non_nan_its_ar = [its_ar[i] for i in non_nan_inds]
    non_nan_grad_norms_ar = [grad_norms_ar[i] for i in non_nan_inds]
    non_nan_label_ar = [label_ar[i] for i in non_nan_inds]

    for i in range (len(its_ar)):
        # Trim both series to a common length so they can be plotted against each other.
        x_shape = its_ar[i].shape[0]
        y_shape = grad_norms_ar[i].shape[0]
        if x_shape != y_shape:
            min_shape =  min(x_shape, y_shape)
            its_ar[i] = its_ar[i][:min_shape]
            grad_norms_ar[i] = grad_norms_ar[i][:min_shape]

        # About ten markers per curve; the spacing depends on i so that markers
        # of different curves do not sit on top of each other. The step is
        # clamped to 1 so that short series do not trigger a modulo by zero
        # (every point is marked in that case, as before).
        inds = np.arange (its_ar[i].shape[0])
        marker_step = max(1, int(len(inds[:-(1 + 2 * i)]) / 10))
        markers_on = inds[inds % marker_step == 0].astype(int)

        # The former 'r' format string was overridden by color= anyway; it is
        # dropped to avoid specifying the colour twice.
        axs[j].plot(its_ar[i], grad_norms_ar[i], label=label_ar[i], color=color_ar_1[i], marker=marker_ar[i], markevery=list(markers_on), markersize=marker_size, markerfacecolor=color_ar_1[i], markeredgecolor = 'black')

# ---------------------------------------------------------------------------
# Finalise the figure: pick the output location and per-subplot titles for the
# active preset, then decorate / show / save through draw_merged_plots.
# ---------------------------------------------------------------------------
filename = "{0}_{1}.pdf".format(main_title, preset)
# `dataset` is the last value of the plotting loop; it only matters when
# save_separately is truthy.
plot_path = get_plot_path(save_separately, project_path, dataset)
if not os.path.exists(plot_path):
    os.makedirs(plot_path)

dataset_titles = dataset_ar
# The three tuning/comparison presets share one title format.
sampled_titles = [name+ ";" + r" ${E}[|{S}|]=$"+f"{dataset_size_dict[name]*prb_ar[0]}"+ "; " + f"uniform-{prb_ar[0]}" for name in dataset_titles]
titles_by_preset = {
    "unb_tuning": sampled_titles,
    "b_tuning": sampled_titles,
    "biased_unbiased_comparison": sampled_titles,
    "b_uniform": ["BiasedSGD-ind; " + f"dataset: {name}; sampling: uniform" for name in dataset_titles],
    "b_importance": ["BiasedSGD-ind; " + f"dataset: {name};" for name in dataset_titles],
    "uniform_vs_importance": None,
}
plots_titles = titles_by_preset[preset]

legend_location ="best"
fig_ax_ar = (fig, axs)
# Passed as ymin; draw_merged_plots accepts it but does not apply it.
bottom = 1e-5

draw_merged_plots(fig_ax_ar, plot_path, filename, x_label, plots_titles, y_label, save, legend_location, ymin=bottom, ymax=None, xlim=None)
