In [181]:
import tensorflow as tf
tf.keras.backend.set_floatx('float64')
from tensorflow import keras
import numpy.typing as npt
from importlib import reload
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
from pprint import pprint
import os

print("Numpy Version:", np.__version__)
print("Tensorflow Version:", tf.__version__)

from tqdm import tqdm
from typing import Dict, Generator, List, Tuple

from src.harness import architecture as arch
from src.harness import dataset as ds
from src.harness import meta
from src.harness import history as hist

from src.metrics.features import *
from src.metrics.synflow import compute_synflow_per_weight

Numpy Version: 1.26.4
Tensorflow Version: 2.17.0


In [189]:
def normalize(x: pd.Series) -> pd.Series:
    return (x - x.min()) / (x.max() - x.min())


def get_train_one_step() -> Callable:
    @tf.function
    def train_one_step(
        model: tf.keras.Model,
        masks: List[tf.Tensor],
        inputs: tf.Tensor,
        labels: tf.Tensor,
        optimizer: tf.keras.optimizers.Optimizer,
        loss_fn: tf.keras.metrics.Metric,
    ) -> float:
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            loss = loss_fn(labels, predictions)

        gradients = tape.gradient(loss, model.trainable_weights)
        grad_mask_mul = []
        for grad_layer, mask in zip(gradients, masks):
            grad_mask_mul.append(tf.math.multiply(grad_layer, mask))
        optimizer.apply_gradients(
            zip(grad_mask_mul, model.trainable_weights))
        return loss

    return train_one_step

def build_weight_df_with_training(
    layer_df: pd.DataFrame,
    architecture: arch.Architecture,
    weights: List[npt.NDArray[np.float32]],
    masks: List[npt.NDArray[np.float32]],
    n: int = 1, 
    batch_size: int = 32,
    optimizer: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss_fn: tf.keras.losses.Loss = tf.keras.losses.CategoricalCrossentropy(),
) -> pd.DataFrame:
    # Training
    model = architecture.get_model_constructor()()
    model.set_weights([w * m for w, m in zip(weights, masks)])
    X_train, _, Y_train, _ = architecture.load_data()
    masks = list(map(lambda x: tf.convert_to_tensor(x, dtype=tf.float64), masks))
    X_train = tf.random.shuffle(X_train, seed=0)
    Y_train = tf.random.shuffle(Y_train, seed=0)
    
    @tf.function
    def do_train_steps() -> List[float]:
        losses = []
        for i in tqdm(range(n)):
            start = i * batch_size
            stop = start + batch_size
            inputs, labels = X_train[start:stop], Y_train[start:stop]
            train_one_step = get_train_one_step()
            losses.append(train_one_step(model, masks, inputs, labels, optimizer, loss_fn))
        return losses
    
    print("Training...")
    losses = do_train_steps()
    trained = model.get_weights()
    
    # Compute features
    weight_features_list = []
    start_idx = 0
    print("Making weight features with training")
    num_weights = sum(map(np.size, model.get_weights()))
    shape = [1] + list(model.input_shape[1:])
    t_synflow = compute_synflow_per_weight(model)
    
    # Duplicate features for initial vs. final weights
    for layer, (tw, m) in tqdm(enumerate(zip(trained, masks))):
        print(f"Layer {layer}")
        mask = m.numpy().astype(bool).ravel()
        num_params = len(mask)
        num_nonzero = np.count_nonzero(mask)
        num_zero = num_params - num_nonzero
        # Precompute values for this layer
        layer_num = np.full(num_params, layer, dtype=np.int8)
        weight_nums = np.arange(num_params, dtype=np.int32)

        t_flat = tw.flatten()
        t_sorted = np.sort(t_flat)
        t_sign = np.sign(t_flat)
        t_mag = np.abs(t_flat, dtype=np.float32)
        t_perc = np.array([np.argmax(v < t_sorted) - num_zero for v in t_flat]) / num_nonzero
        # Use std from initial weights assuming we did small amounts of training
        t_norm_std = (t_flat - layer_df["li_mean"].iloc[layer]) / layer_df["li_std"].iloc[layer]
         
        # Create a dictionary for weight features for this layer
        layer_weight_features = {
            "l_num": layer_num,
            "w_num": weight_nums,
            f"wt{n}_sign": t_sign,
            f"wt{n}_val": t_flat,
            f"wt{n}_mag": t_mag,
            f"wt{n}_perc": t_perc.astype(np.float32),
            f"wt{n}_std": t_norm_std.astype(np.float32),
            f"wt{n}_synflow": t_synflow[layer].numpy().flatten(),
            "w_mask": mask,
        }
        
        weight_features_list.append(pd.DataFrame(layer_weight_features))

    weight_df = pd.concat(weight_features_list, axis=0, ignore_index=True)
    
    keys = ["e_num", "t_num", "l_num"]
    if (key := "norm_wi_mag") not in merged_df.columns:
        weight_df[key] = weight_df.groupby(keys)["wi_mag"].transform(normalize)
    if (key := "norm_synflow") not in merged_df.columns:
        weight_df[key] = weight_df.groupby(keys)["wi_synflow"].transform(normalize)
    
    return weight_df

In [191]:
trained_wdf.head()

Unnamed: 0,l_num,w_num,wt100_sign,wt100_val,wt100_mag,wt100_perc,wt100_std,wt100_synflow,w_mask
0,0,0,-1.0,-0.063685,0.063685,0.064677,-1.484341,4.948735999999999e-19,True
1,0,1,1.0,0.001072,0.001072,0.497181,0.022486,4.089137e-22,True
2,0,2,-1.0,-0.036945,0.036945,0.241696,-0.862135,1.321771e-18,True
3,0,3,-1.0,-0.020769,0.020769,0.349188,-0.485728,0.0,True
4,0,4,-1.0,-0.019048,0.019048,0.36094,-0.445678,6.917019999999999e-19,True


In [192]:
wdf.head()

Unnamed: 0,l_num,w_num,wf_sign,wi_sign,wf_val,wi_val,wf_mag,wi_mag,wf_perc,wi_perc,wf_std,wi_std,w_mask,wf_synflow,wi_synflow,e_num,t_num
0,0,0,-1.0,-1.0,-0.03586,-0.063685,0.03586,0.063685,0.368172,0.072266,-0.27551,-0.836884,True,1.144758e-18,0.0,0,0
1,0,1,1.0,1.0,0.053931,0.001072,0.053931,0.001072,0.750361,0.506867,0.625495,1.252448,True,0.0,5.1111950000000005e-22,0,0
2,0,2,1.0,-1.0,0.065232,-0.036945,0.065232,0.036945,0.792908,0.251063,0.738902,1.515425,True,0.0,6.013991e-20,0,0
3,0,3,-1.0,-1.0,-0.023325,-0.020769,0.023325,0.020769,0.422258,0.359009,-0.149734,-0.545222,True,0.0,0.0,0,0
4,0,4,-1.0,-1.0,-0.069716,-0.019048,0.069716,0.019048,0.232657,0.370429,-0.615245,-1.62469,True,0.0,0.0,0,0


In [None]:
epath = "/users/j/b/jbourde2/lottery-tickets/experiments/11-04-2024/lenet_mnist_0_seed_5_experiments_1_batches_0.025_default_sparsity_lm_pruning_20241102-111614"
experiments = list(hist.get_experiments(epath))
e0 = experiments[0]
t0 = next(e0)
t0.seed_weights = lambda x: x

tdf, ldf, wdf = build_trial_dfs(t0, 0, 0)

In [190]:
trained_wdf = build_weight_df_with_training(
    ldf, 
    arch.Architecture(t0.architecture, t0.dataset), 
    t0.initial_weights,
    t0.masks,
    n=100,
    batch_size=64
)

Training...


100%|██████████| 100/100 [00:57<00:00,  1.75it/s]


Losses:
[2.230298603006082,
 2.3053508395794537,
 2.275178110098286,
 2.208902956778287,
 2.186161806067825,
 2.0944166448669366,
 2.0050227526189026,
 2.075310553995859,
 1.9836495647897308,
 1.9801935397815325,
 1.972452241723449,
 1.9442885349934083,
 1.887469338344589,
 1.816022684273078,
 1.8113223037462602,
 1.8031308834765007,
 1.7182051447774602,
 1.6096276986199278,
 1.6579361597120879,
 1.6602321224784737,
 1.479272697219519,
 1.6248499472272693,
 1.4124958408002937,
 1.3347886576121908,
 1.477865186036193,
 1.304608082980669,
 1.314242112637305,
 1.5095975022683503,
 1.2804871309528327,
 1.288843145380243,
 1.194197481541245,
 1.1868096026456576,
 1.260358833448425,
 1.0716644589277218,
 1.2438025900059848,
 1.0820792051675792,
 1.031371358947578,
 1.0747625418955977,
 0.957824997117374,
 0.8359280157597959,
 0.936098094692311,
 1.0919245467021486,
 0.8434738705767453,
 0.8631648917820764,
 0.9318111802309972,
 0.9907606580469555,
 0.7983439205969569,
 0.7071748766598337,
 1

0it [00:00, ?it/s]

Layer 0


  t_norm_std = (t_flat - layer_df["li_mean"].iloc[layer]) / layer_df["li_std"].iloc[layer]


Layer 1
Layer 2


  t_norm_std = (t_flat - layer_df["li_mean"].iloc[layer]) / layer_df["li_std"].iloc[layer]
6it [00:32,  5.36s/it]

Layer 3
Layer 4
Layer 5





In [None]:
df_path = "mnist_weightabase.pkl"
merged_df = pd.read_pickle(df_path)
# corrected_wdf = correct_class_imbalance(wdf)
# merged_df = merge_dfs(tdf, ldf, corrected_wdf)
# merged_df.to_pickle(df_path)

In [None]:
def normalize(x: pd.Series) -> pd.Series:
    return (x - x.min()) / (x.max() - x.min())

keys = ["e_num", "t_num", "l_num"]
if (key := "norm_wi_mag") not in merged_df.columns:
    merged_df[key] = merged_df.groupby(keys)["wi_mag"].transform(normalize)
if (key := "norm_synflow") not in merged_df.columns:
    merged_df[key] = merged_df.groupby(keys)["wi_synflow"].transform(normalize)
# if (key := "const_synflow") not in merged_df.columns:
#     merged_df[key] = merged_df["wi_synflow"].map(lambda x: 1 if x > 0 else 0)

merged_df.fillna(0, inplace=True)

Feature importance ovservations:

- Mask & sign directly tell the model what the outcome is (sign == 0 rather than +/- 1) which gets it perfectly
- The final measures for weights all get >93% accuracy, magnitude gets 98.14%
- Measures of current sparsity get pretty high (layer and overall sparsity both ~78%)
- The initial percentile a weight falls in gets 75% (wi_std gets much worse even though they are the same measure- perhaps the scale being between 0 and 1 makes it easier to train on?)
- All the OHE and initial weight magnitude measures get 57.85% accuracy
    - What is special about this number?
    - Why are the initial metrics (including magnitude) so uninformative?
        - Could a normalization scheme help this?
    - Why does "wi_std" do worse than random chance?

In [None]:
from copy import copy as shallowcopy
import numpy as np
import numpy.typing as npt
import tensorflow as tf
from tensorflow import keras
from typing import Callable, List, Tuple

def create_meta(shape: Tuple[int, ...]) -> keras.Model:
    model = keras.Sequential([
        keras.Input(shape=shape),
        keras.layers.Dense(8, "relu"),
        keras.layers.Dense(8, "relu"),
        keras.layers.Dense(8, "relu"),
        keras.layers.Dense(8, "relu"),
        keras.layers.Dense(8, "relu"),
        keras.layers.Dense(8, "relu"),
        keras.layers.Dense(8, "relu"),
        keras.layers.Dense(8, "relu"),
        
        keras.layers.Dense(1, "sigmoid"),
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
             loss=tf.keras.losses.BinaryCrossentropy(),
             metrics=["accuracy"])
    return model

def make_meta_mask(
    meta: keras.Model,
    make_x: Callable[[str, str, keras.Model, List[npt.NDArray]], npt.NDArray],
    architecture: str,
    dataset: str,
    steps: int,
) -> Tuple[List[npt.NDArray], List[float]]:
    a = arch.Architecture(architecture, dataset)
    _, val_X, _, val_Y = a.load_data()
    model = a.get_model_constructor()()
    original_weights = copy.deepcopy(model.get_weights())
    model.compile(optimizer="Adam", loss=tf.keras.losses.CategoricalCrossentropy(), metrics=["accuracy"])
    masks = [np.ones_like(w) for w in model.get_weights()]
    
    def update_masks(mask_pred: npt.NDArray) -> List[npt.NDArray]:
        start = 0
        end = 0
        new_masks = []
        nonlocal masks
        for m in masks:
            end += m.size
            new_m = np.reshape(mask_pred[start:end], m.shape)
            new_masks.append(new_m)
            start = end
        return masks
            
    accuracies = []
    for step in range(steps):
        # Get validation accuracy
        _, accuracy = model.evaluate(val_X, val_Y)
        accuracies.append(accuracy)
        print(f"Step {step} accuracy: {accuracy:.2%}")
        # Extract features
        X = make_x(architecture, model, masks)
        # Predict and replace existing mask
        mask_pred = meta.predict(X, batch_size=2**20)
        masks = update_masks(mask_pred)
        model.set_weights([w * m for w, m in zip(original_weights, masks)])
        
    return masks, accuracies


def make_x(
    architecture: str,
    model: keras.Model,
    masks: List[npt.NDArray],
    train_batches: int = 0,
    batch_size: int = 32,
) -> npt.NDArray:
    # Layer features:
    # i_features = ["l_sparsity", "l_rel_size", "li_prop_positive", "wi_std", "wi_perc", "wi_synflow", "wi_sign", "dense", "bias", "conv", "output"]
    nparams = sum(map(np.size, masks))
    nfeatures = 11
    features = np.zeros((nparams, nfeatures))
    
    # Helper functions to add the unrolled weight values and
    # scalar layer values to the feature matrix
    n = 0
    def add_layer_features(layer_values: List[float]):
        nonlocal n
        start = 0
        end = 0
        for v, size in zip(layer_values, map(np.size, masks)):
            end += size
            features[start:end, n] = v
            start = end
        n += 1
        
    def add_weight_features(weight_features: List[npt.NDArray]):
        nonlocal n
        start = 0
        end = 0
        for v in weight_features:
            end += v.size
            features[start:end, n] = np.ravel(v)
            start = end
        n += 1
    
    # Make a separate copy to compute synflow for
    masked_weights = [w * m for w, m in zip(model.get_weights(), masks)]
    masked_model = shallowcopy(model)
    masked_model.set_weights(masked_weights)
    synflow_scores = [np.reshape(scores, -1) for scores in compute_synflow_per_weight(masked_model)]
    
    # Mask features
    sparsities = [np.count_nonzero(m) / np.size(m) for m in masks]
    rel_size = [np.size(m) / nparams for m in masks]
    prop_pos = [np.count_nonzero(w >= 0) for w in masks]
    
    # Layer type
    layer_ohe = arch.Architecture.ohe_layer_types(architecture)
    for values in [sparsities, rel_size, prop_pos]:
        add_layer_features(values)
    
    # Weight features
    l_std = [np.std(w) for w in masked_weights]
    l_mean = [np.mean(w) for w in masked_weights]
    l_sorted = [np.sort(np.ravel(w)) for w in masked_weights]
    
    w_std = [(w - l_mean) / l_std for w, l_mean, l_std in zip(l_std, l_mean, masked_weights)]
    w_sign = [np.sign(w) for w in masked_weights]
    num_nonzero = sum(map(np.count_nonzero, masks))
    num_zero = nparams - num_nonzero
    w_perc = np.array([
        np.argmax(np.ravel(v) < v_sorted) - num_zero 
        for v, v_sorted in zip(masked_weights, l_sorted)]
    ) / num_nonzero
    
    flat_masks = [np.ravel(m) for m in masks]
    for values in [w_std, w_perc, synflow_scores, w_sign]:
        add_weight_features(values)
    
    for values in [layer_ohe[:, i] for i in range(layer_ohe.shape[1])]:
        add_layer_features(values)
        
    return features


In [None]:
def summarize(model: keras.Model):
    summary_str = []
    model.summary(print_fn=lambda x: summary_str.append(x))
    return summary_str[0]

def best_subset(subsets: Dict) -> Tuple[Tuple[str], float]:
    best_key = None
    best_accuracy = None
    for key, value in subsets.items():
        if best_key is None or max(value["accuracy"]) > best_accuracy:
            best_key = key
            best_accuracy = max(value["accuracy"])
    return best_key, best_accuracy

In [None]:
# relu_1layer_8wide = relu_1layer
# relu_1layer_12wide = {}
# relu_1layer_16wide = {}
# relu_1layer_128wide = {}
# relu_8layer_8wide = {}
# relu_8layer_128wide = {}
variants = [
       ("1 layer, 8 wide", relu_1layer_8wide,),
       ("1 layer, 12 wide", relu_1layer_12wide,),
       ("1 layer, 16 wide", relu_1layer_16wide,),
       ("1 layer, 128 wide", relu_1layer_128wide,),
       ("8 layer, 8 wide", relu_8layer_8wide,),
       ("8 layer, 128 wide", relu_8layer_128wide,),
]
for variant, data in variants:
    print(variant)
    print(best_subset(data))
    print()

In [None]:
import matplotlib.ticker as mtick

# Takeaways (no training steps):
#     - Large diminishing marginal returns beyond a 1 layer 16 neuron ReLU network
def plot_feature_arch_subsets(variants: List[Tuple[str, Dict[Tuple[str], Dict]]]):
    plt.figure(figsize=(10, 8))
    plt.title("Architecture Sweep over Feature Subsets")
    plt.xlabel("Feature Subset")
    plt.ylabel("Max Accuracy (%)")
    plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))
    for variant, data_dict in variants:
        categories = []
        accuracies = []
        for features, data in data_dict.items():
            categories.append(",\n".join(features))
            accuracies.append(max(data["accuracy"]))
        plt.scatter(categories, accuracies, label=variant)
    plt.legend()
    plt.grid()
    plt.ylim(bottom=0.75)
    plt.savefig("arch_feature_subsets_no_training.png")
    plt.show()

plot_feature_arch_subsets(variants)

In [None]:
# for variant, feature_subsets in variants:
#     filename = "_".join([v.strip(",") for v in variant.split()]) + ".json"
#     with open(filename, "w") as outfile:
#         to_save = {", ".join(key): value for key, value in feature_subsets.items()}
#         json.dump(to_save, outfile)

In [None]:
# Feature subsets
import json

# Feature scale is very important - normalized synaptic scores alone aren't better than random chance 
# but when added with a more informative feature (e.g., wi_perc) they add ~2% accuracy.
# Straight up synaptic flow scores are too low to make a meaningful difference
for variant, feature_subsets in variants:
    print("VARIANT:", variant)
    if feature_subsets:
        continue
    for features in relu_1layer_8wide:
        print(features)
        # features = ["norm_synflow", "wi_perc", "sparsity", "l_sparsity"]
        X, Y = featurize_db(merged_df, list(features))

        model = create_meta(X[0].shape)
        summary = summarize(model)
        epochs = 3
        batch_size = 256
        history = model.fit(X, Y, epochs=epochs, batch_size=batch_size, validation_split=0.2, shuffle=True)
        feature_subsets[tuple(features)] = {"epochs": epochs, "batch_size": batch_size, "accuracy": history.history["accuracy"], "summary": summary}

        filename = "_".join([v.strip(",") for v in variant.split()]) + ".json"
        with open(filename, "w") as outfile:
            to_save = {", ".join(key): value for key, value in feature_subsets.items()}
            json.dump(to_save, outfile)


In [None]:
# Accuracy of 57.57% is what can be achieved using random noise as a feature
features = ["norm_synflow"]
X, Y = featurize_db(merged_df, features)


rng = np.random.default_rng()
X = rng.standard_normal(size=X.shape)
model = create_meta(X[0].shape)
model.fit(X, Y, epochs=1, batch_size=256, validation_split=0.2, shuffle=True)