### AUTHOR: Dimitri Kachler

# Global Parameters

In [1]:
#User-Dependent Variables
# NOTE(review): layerByLayer is not read anywhere else in the visible notebook — confirm it is still needed
layerByLayer = False
# Rebound later from dataLoaders.datasetChoice in the "Data Collection" cell, so this value only holds until then
datasetChoice = "MNIST"

# -------------- INACTIVE
#useNeptune = True

# Imports

In [2]:
# Neural Networks
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.optim.optimizer import Optimizer, required
import torch

# Arrays & Mathematics
import math
import numpy as np

#Plotting
import matplotlib.pyplot as plt
import pandas as pd

#System / IO
import abc
import itertools
import importlib

#Data Visualization
#import seaborn as sns

#External Utilities
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR



In [3]:
# Backend check: report the installed PyTorch build, then pick the first
# available backend in the order CUDA -> Apple MPS -> CPU
print(torch.__version__)
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

2.3.0+cu121
Using cpu device


In [4]:
# NOTE: you can still run this and it should still work and send the data to my Neptune.ai project,
# unfortunately you won't be able to see the graph without my account
# Capture makes it so that the cell doesn't output text
%%capture
#
try:
    import neptune
except ImportError as e:
    %pip install -U neptune
#import neptune
from getpass import getpass

project="dimitri-kachler-workspace/sanity-MNIST"
api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJlNWQxNDllOS04OGY1LTRjM2EtYTczZi0xNWI0NTRmZTA1OTEifQ=="
#project = neptune.init_project(api_token=api_token, project=project)

# Github Imports

In [5]:
# Fetch the project sources into ./IHT_AGD; when the directory already exists git
# reports a fatal error (see the recorded output) and the existing checkout is kept
!git clone https://github.com/NanoNero1/IHT_AGD

fatal: destination path 'IHT_AGD' already exists and is not an empty directory.


In [6]:
# Switch into the checkout and fast-forward it to the latest remote commit
# NOTE(review): the absolute /content path presumably targets Google Colab — confirm before running elsewhere
%cd /content/IHT_AGD/
!git pull

/content/IHT_AGD
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (1/1), done.[K
remote: Total 6 (delta 5), reused 6 (delta 5), pack-reused 0[K
Unpacking objects: 100% (6/6), 481 bytes | 160.00 KiB/s, done.
From https://github.com/NanoNero1/IHT_AGD
   428d660..d1b413f  main       -> origin/main
Updating 428d660..d1b413f
Fast-forward
 optimizers/ihtSGD.py     | 2 [32m+[m[31m-[m
 optimizers/vanillaAGD.py | 2 [32m+[m[31m-[m
 optimizers/vanillaSGD.py | 2 [32m+[m[31m-[m
 3 files changed, 3 insertions(+), 3 deletions(-)


In [7]:
# Data Collection
# The loaders and the dataset name are module-level objects of dataLoaders;
# datasetChoice from the "Global Parameters" cell is overwritten here
import IHT_AGD.data_loaders.dataLoaders as dataLoaders
datasetChoice = dataLoaders.datasetChoice
train_loader = dataLoaders.train_loader
test_loader = dataLoaders.test_loader

# Just for debugging
#import IHT_AGD.architectures.architect
#IHT_AGD.architectures.architect.seeVariable

# Neural Network Architecture
from IHT_AGD.architectures.convNets import MNIST_convNet

# Training and Testing Functions
from IHT_AGD.modelTrainTest.trainingMetrics import getTestAccuracy,getTestLoss
from IHT_AGD.modelTrainTest.trainLoop import train

# Optimizers (base, SGD, AGD, IHT, etc.)
from IHT_AGD.optimizers.baseOptimizer import myOptimizer
from IHT_AGD.optimizers.vanillaSGD import vanillaSGD
from IHT_AGD.optimizers.ihtSGD import ihtSGD
from IHT_AGD.optimizers.vanillaAGD import vanillaAGD
from IHT_AGD.optimizers.ihtAGD import ihtAGD
from IHT_AGD.optimizers.nativePytorchSGD import dimitriPytorchSGD

# Visualization Functions
from IHT_AGD.visualizationGraphs.plotting import plotMetric

# Experiment Functions
from IHT_AGD.experimentScaffolding.chooseOptimizer import chooseOptimizer
from IHT_AGD.experimentScaffolding.chooseOptimizer import fixedChooseOptimizer
from IHT_AGD.experimentScaffolding.experimentFuncs import runOneExperiment
from IHT_AGD.experimentScaffolding.experimentFuncs import runMainExperiment
from IHT_AGD.experimentScaffolding.experimentFuncs import runPipeline

<Figure size 3000x1400 with 0 Axes>

In [8]:
#To know the sizes
# Pull one batch from the training loader and print the shape of its input tensor
firstInput, firstTarget = next(iter(train_loader))
print(firstInput.size())

torch.Size([1000, 1, 28, 28])


# Tracking

In [9]:
# Names handed to runPipeline as `variablesToTrack` in the main cell
# (presumably the optimizer attributes that get logged — confirm in the tracking code)
variablesToTrack = ['sparsity','sparsityBias','lr','iteration','trackSparsity','trackSparsityBias','trackSparsityLinear','testAccuracy','beta']
# Names handed to runPipeline as `functionsToHelpTrack`; 'getTestAccuracy' is currently disabled
functionsToHelpTrack = ['trackingSparsity']#,'getTestAccuracy']

# Setups

In [10]:
# NOTE: the setups are kept in the notebook for now because the settings change often.
# Each setup is a plain dict: "scheme" holds the optimizer name, the remaining
# entries are the hyperparameters for that run.
setup_ihtAGD = dict(scheme="ihtAGD", sparsity=0.99, kappa=5.0, beta=50.0)

setup_vanillaAGD = dict(scheme="vanillaAGD", sparsity=0.950, kappa=5.0, beta=50.0)

setup_ihtSGD = dict(scheme="ihtSGD", sparsity=0.950, beta=50.0)

setup_vanillaSGD = dict(scheme="vanillaSGD", sparsity=0.9, beta=50.0)

setup_pytorchSGD = dict(scheme="pytorchSGD")

# Running the Experiment

In [11]:
# Sanity check: show the dataset name that was bound from the data-loader module
print(datasetChoice)

MNIST


In [12]:
""" MAIN CELL """
setups = [setup_ihtAGD]#,setup_vanillaSGD]#,setup_ihtAGD]
#setups = [setup_pytorchSGD]


# Open a Neptune run for this experiment; it is handed to runPipeline as `run`
run = neptune.init_run(api_token=api_token, project=project)
try:
    # datasetChoice is the value bound from dataLoaders, i.e. the same module that
    # provides train_loader/test_loader, so the name and the loaders cannot drift apart
    runPipeline(setups,
                datasetChoice=datasetChoice,
                epochs=6,trials=1,
                functionsToHelpTrack=functionsToHelpTrack,
                variablesToTrack=variablesToTrack,
                device=device,
                run=run,
                test_loader=test_loader,
                train_loader=train_loader)
finally:
    # Close the run even when training fails or is interrupted from the keyboard;
    # otherwise the Neptune run is left open
    run.stop()



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/dimitri-kachler-workspace/sanity-MNIST/e/SAN-488
{'functionsToHelpTrack': ['trackingSparsity'], 'variablesToTrack': ['sparsity', 'sparsityBias', 'lr', 'iteration', 'trackSparsity', 'trackSparsityBias', 'trackSparsityLinear', 'testAccuracy', 'beta'], 'device': 'cpu', 'run': <neptune.metadata_containers.run.Run object at 0x77fb828cfc10>, 'test_loader': <torch.utils.data.dataloader.DataLoader object at 0x77fb82a6b580>, 'train_loader': <torch.utils.data.dataloader.DataLoader object at 0x77fb82a6a800>}
[{'epochs': 6, 'scheme': 'ihtAGD', 'sparsity': 0.99, 'kappa': 5.0, 'beta': 50.0}]
0
test fixed chooose
{'model': MNIST_convNet(
  (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=4608, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
), 'scheme': 'ihtAGD', 'sparsity': 0.99, 'kappa': 5.0, 'beta': 50.0, 'functionsToHelpTrack': ['trackingS

        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.ai/help/value_of_unsupported_type


tensor(2.2929, grad_fn=<NllLossBackward0>)
speed iteration 1
HowFarAlong: 31 / 40
Iteration: 1
warmup
FIXED IHT-AGD
tensor(2.2561, grad_fn=<NllLossBackward0>)
speed iteration 2
HowFarAlong: 32 / 40
Iteration: 2
warmup
FIXED IHT-AGD
tensor(2.1872, grad_fn=<NllLossBackward0>)
speed iteration 3
HowFarAlong: 33 / 40
Iteration: 3
warmup
FIXED IHT-AGD
tensor(2.1102, grad_fn=<NllLossBackward0>)
speed iteration 4
HowFarAlong: 34 / 40
Iteration: 4
warmup
FIXED IHT-AGD
tensor(2.0122, grad_fn=<NllLossBackward0>)
speed iteration 5
HowFarAlong: 35 / 40
Iteration: 5
warmup
FIXED IHT-AGD
tensor(1.9048, grad_fn=<NllLossBackward0>)
speed iteration 6
HowFarAlong: 36 / 40
Iteration: 6
warmup
FIXED IHT-AGD
tensor(1.7893, grad_fn=<NllLossBackward0>)
speed iteration 7
HowFarAlong: 37 / 40
Iteration: 7
warmup
FIXED IHT-AGD
tensor(1.6517, grad_fn=<NllLossBackward0>)
speed iteration 8
HowFarAlong: 38 / 40
Iteration: 8
warmup
FIXED IHT-AGD
tensor(1.5238, grad_fn=<NllLossBackward0>)
speed iteration 9
HowFarAlong

KeyboardInterrupt: 

In [None]:
# The notebook only uses `import ... as` / `from ... import` forms, so the bare name
# `IHT_AGD` is not bound; import the submodule explicitly before reloading it.
import IHT_AGD.experimentScaffolding.chooseOptimizer
importlib.reload(IHT_AGD.experimentScaffolding.chooseOptimizer)
# Re-bind the notebook-level names, which would otherwise keep pointing at the
# pre-reload function objects.
# NOTE(review): modules that imported these functions themselves (e.g. experimentFuncs)
# presumably still hold the old objects and need their own reload — confirm.
from IHT_AGD.experimentScaffolding.chooseOptimizer import chooseOptimizer, fixedChooseOptimizer

# Plotting

In [None]:
# Changing the theme to be more pleasing
# Selects matplotlib's "fivethirtyeight" style sheet for the figures drawn afterwards
plt.style.use("fivethirtyeight")

#project="dimitri-kachler-workspace/sanity-MNIST"

In [None]:
# Plot the "loss" metric of run SAN-441 for the iht_AGD method (single trial)
# NOTE(review): the run ID is hard-coded to an earlier run, not the one started in the main cell — confirm this is intended
plotMetric(runID="SAN-441",metricName="loss",methodNames=["iht_AGD"],trials=1)

# -----------------------------------------------------------------------
# END OF THE BASELINE FRAMEWORK, NEXT SECTION DEDICATED TO EXTENSIONS


## Bias Left Untouched

In [None]:
class untouchedIhtAGD(ihtAGD):
  """ Desc: ihtAGD variant whose sparsification step leaves bias parameters untouched;
        only parameters with at least two dimensions are considered for pruning

  params [iterable]: parameters to optimize, forwarded to the ihtAGD constructor
  sparsity [float]: fraction of the (non-bias) weights to set to zero in sparsify
  kappa [float]: stored on the instance and used to derive alpha = beta / kappa
  beta [float]: stored on the instance
  """
  def __init__(self,params,sparsity=0.9,kappa=5.0,beta=50.0):
    super().__init__(params)
    self.methodName = "untouched_iht_AGD"
    # The base constructor only receives `params`, so the requested sparsity has to be
    # stored here; otherwise the argument is silently ignored by sparsify()
    self.sparsity = sparsity
    self.alpha = beta / kappa
    self.beta = beta
    self.kappa = kappa

  def sparsify(self):
    """ Desc: zeroes every non-bias weight whose magnitude is at or below a global cutoff,
          the cutoff being the smallest of the top (1 - sparsity) fraction of magnitudes
    """
    # Skip Bias Layers: only tensors with two or more dimensions take part
    weightTensors = [p
                     for group in self.param_groups
                     for p in group['params']
                     if len(p.data.shape) >= 2]

    # Nothing to prune (e.g. the optimizer only holds bias vectors)
    if not weightTensors:
      return

    # One concatenation on the parameters' own device; no dummy leading zero and no
    # dependence on the notebook-level `device` variable
    concatWeights = torch.cat([torch.flatten(p.data) for p in weightTensors],0)

    topK = int(len(concatWeights)*(1-self.sparsity))

    # With no weight left to keep there is no cutoff to read; prune everything instead
    # of failing on an empty top-k result
    if topK < 1:
      for p in weightTensors:
        p.data.zero_()
      return

    vals, _ = torch.topk(torch.abs(concatWeights),topK,dim=0)
    cutoff = vals[-1]

    # NOTE(review): `<=` also zeroes the weight that sits exactly on the cutoff, so one
    # fewer than topK weights survive — kept as in the original thresholding; confirm
    for p in weightTensors:
      p.data[abs(p.data) <= cutoff] = 0.0




In [None]:
# Compare the bias-untouched variant against the regular ihtAGD setup
setup_untouched_ihtAGD = {
    "scheme":"untouchedIhtAGD",
    "lr":0.1,
    "sparsity":0.90,
    "kappa":10.0,
    "beta":100.0}
setups = [setup_untouched_ihtAGD, setup_ihtAGD]

run = neptune.init_run(api_token=api_token, project=project)
try:
  all_models,all_training_losses,all_testing_losses,all_accuracies = runMainExperiment(setups)
finally:
  # Close the Neptune run even if the experiment raises or is interrupted
  run.stop()

# Grid Search

In [None]:
from os import setgroups

def gridSearch(default,variables,values,metric,epochs=1):
  """ Desc: searches in a grid for the best combination of values of arbitrary dimension,
        we can check for more than 2 variables at a time, but this can be very costly

  default [dictionary]: a dictionary for all the default settings, this is also how one can set the type of algorithm
  variables [array[string]]: the settings to change
  values [2Darray]: what values to take on, values[i] lists the candidates for variables[i]
  metric [string]: what metric to use for the best value
        (currently not used: the selection is always the final accuracy)
  epochs [int]: forwarded to runMainExperiment

  returns [dictionary]: the setup whose last recorded accuracy is the highest
  raises ValueError: when there are more value lists than variables
  """

  # A value list without a matching variable name cannot be assigned to any setting
  if len(values) > len(variables):
    raise ValueError("gridSearch received more value lists than variables")

  # We will not know how to traverse this list easily however
  # TO-DO: find a way to organize, or traverse this list
  setups = []

  # Every possible combination of the settings, one new setup per combination
  for permutation in itertools.product(*values):
    newSetup = default.copy()
    newSetup.update(zip(variables,permutation))
    setups.append(newSetup)

  print(setups)


  all_models,all_training_losses,all_testing_losses,all_accuracies = runMainExperiment(setups,epochs=epochs)

  # NEXT: Interchange with a different metric
  # TO-DO: try "highest loss" over entire dataset using model

  # Right now we use the accuracy after the last epoch
  # NOTE(review): assumes the last epoch is stored at index -1 — still to be checked
  final_accuracies = [accuracies[-1] for accuracies in all_accuracies]

  # The best setup is the one with the HIGHEST final accuracy
  bestSetupIndex = final_accuracies.index(max(final_accuracies))

  return setups[bestSetupIndex]

In [None]:
# Baseline settings for the search; "kappa" and "beta" are the entries the grid replaces
default = {
    "scheme":"vanillaAGD",
    "lr":0.1,
    "sparsity":0.90,
    "kappa":15.0,
    "beta":10000.0}
# We set a big value to see if we overwrite it in the Grid Search

# 3 x 3 grid over kappa and beta, 5 epochs per setup
# NOTE(review): gridSearch does not read its `metric` argument, so "loss" has no effect here
gridSearch(default,["kappa","beta"],[[2.0,10.0,100.0],[10.0,100.0,300.0]],"loss",5)

In [None]:
#This works! It recognizes it as a class name
# eval resolves the string against the notebook namespace, where ihtAGD was imported
type(eval("ihtAGD"))

# **Appendix**

# Saving and Loading Model

SOURCE: https://pytorch.org/tutorials/beginner/saving_loading_models.html

In [None]:
def saveModel(model,pathdir):
  """ Desc: stores the model's parameters (its state_dict, not the module object) at pathdir

  model [nn.Module]: the network whose state_dict is written
  pathdir [string]: destination file path handed to torch.save
  """
  weights = model.state_dict()
  torch.save(weights, pathdir)

def loadModel(pathdir,modeltype):
  match modeltype:
    case "basicNeuralNet": model = basicNeuralNet(784,10).to(device)
    case "convNet": model = convNet().to(device)

  model.load_state_dict(torch.load(pathdir))
  model.eval()
  return model

In [None]:
# Write the first model returned by runMainExperiment (see the "Bias Left Untouched"
# experiment cell, which binds all_models) to the file "testModel"
saveModel(all_models[0],"testModel")

In [None]:
# Restore the file "testModel" into a new "convNet" instance
tryModel = loadModel("testModel","convNet")

# Notes

Sparsify Interval
Base case
Fine-Tuning Phase (Freeze weights) , < Sparsify interval
Real-time visualization - add training loss per batch and test loss, and test accuracy
Weights and Biases


AC/DC proof 8.1.4,

Make proof on board work for large numbers, i.e.! T:(S* times Kappa^2 * some constant factor)
Want the damage to be 1 + epsilon

make sure you can collect useful information - e.g. things like sparsity




# Empirically Testing the Model

In [None]:
def testModel(model):
  randomExampleInt = np.random.randint(1000)
  exampleX = dataset2.data[randomExampleInt].reshape(28, 28)
  plt.imshow(exampleX)
  print(exampleX.shape)
  exampleX = torch.reshape(exampleX, (1, 1,28,28))
  predicted = model(torch.tensor(exampleX,dtype=torch.float32).to(device))
  print(torch.argmax(predicted))

# Spot-check the model that was reloaded from disk on one random example
testModel(tryModel)

# TO DO

- check the sparsity of bias persists if we increase sparsity (95% sparsity and 99%)

- compare with and without first phase I added

- try to see if same spike appears with inserting SGD on decompression

- Visualization?
-  Save Model? - MAYBE USEFUL
-  Checkpoints? - YES DO THIS!