<a href="https://colab.research.google.com/github/NevadaM/DCAI_ODI0524/blob/main/DCAI_project_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a methodology for the Ablation Study project
Concept Note: [link](https://docs.google.com/document/d/1Zbz_6QnlUU9AOZcssJGPLFTX5IcSQT5Vn2wxKpTzkew/edit?usp=sharing)

**Note:** before running, change the Colab runtime type so that a GPU is enabled.

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
# Load the CIFAR-10 train/test splits that ship with Keras.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.cifar10.load_data()
)

# Rescale pixel values into the range [0, 1].
train_images = train_images / 255.0
test_images = test_images / 255.0

# Human-readable class names; a label value is an index into this list.
class_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [3]:
# Sanity check: confirm the shape of the loaded training images.
train_images.shape

(50000, 32, 32, 3)

In [4]:
# Split both dataset splits into the class to unlearn ("frogs") and everything
# else ("safe"). Labels index into class_names, so 'frog' is label 6 -- label 5
# is 'dog'. Looking the label up by name keeps the code and intent in sync.
target_class = class_names.index('frog')

# Keras returns CIFAR-10 labels with shape (N, 1); flatten them to get one
# boolean per image. Indexing with that mask keeps the (k, 1) label shape.
train_is_target = train_labels.reshape(-1) == target_class
test_is_target = test_labels.reshape(-1) == target_class

frogs_train_images = train_images[train_is_target]
frogs_train_labels = train_labels[train_is_target]
frogs_test_images = test_images[test_is_target]
frogs_test_labels = test_labels[test_is_target]

safe_train_images = train_images[~train_is_target]
safe_train_labels = train_labels[~train_is_target]
safe_test_images = test_images[~test_is_target]
safe_test_labels = test_labels[~test_is_target]

# Every image must land in exactly one of the two subsets.
assert len(frogs_train_images) + len(safe_train_images) == len(train_images)
assert len(frogs_test_images) + len(safe_test_images) == len(test_images)


In [5]:
# Check how many training images remain once the target class is removed.
safe_train_images.shape

(45000, 32, 32, 3)

In [6]:
## Pretrained backbone
# CIFAR-10 images are 32x32 RGB.
image_shape = (32, 32, 3)

# ConvNeXt-Base with ImageNet weights and without its classifier head
# (include_top=False); a new head is attached in a later cell.
base_model = tf.keras.applications.ConvNeXtBase(
    include_top=False,
    weights='imagenet',
    input_shape=image_shape,
)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/convnext/convnext_base_notop.h5


In [7]:
# Inspect the backbone's layers and their output shapes.
base_model.summary()

Model: "convnext_base"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 32, 32, 3)]          0         []                            
                                                                                                  
 convnext_base_prestem_norm  (None, 32, 32, 3)            0         ['input_1[0][0]']             
 alization (Normalization)                                                                        
                                                                                                  
 convnext_base_stem (Sequen  (None, 8, 8, 128)            6528      ['convnext_base_prestem_normal
 tial)                                                              ization[0][0]']               
                                                                                      

In [8]:
# Freeze the pretrained backbone for now, so the first training pass only
# updates layers added on top of it.
base_model.trainable = False
# Displayed as a check: this list should be empty once the backbone is frozen.
base_model.trainable_variables

[]

In [9]:
# Collapse the backbone's spatial feature map into one feature vector per image.
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

# Classification head: one raw score (logit) per CIFAR-10 class, no activation.
prediction_layer = tf.keras.layers.Dense(10)

# Full classifier: backbone -> pooling -> linear head.
model = tf.keras.Sequential(
    [base_model, global_average_layer, prediction_layer]
)

In [10]:
# Confirm the assembled model and its trainable / non-trainable parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 convnext_base (Functional)  (None, 1, 1, 1024)        87566464  
                                                                 
 global_average_pooling2d (  (None, 1024)              0         
 GlobalAveragePooling2D)                                         
                                                                 
 dense (Dense)               (None, 10)                10250     
                                                                 
Total params: 87576714 (334.08 MB)
Trainable params: 10250 (40.04 KB)
Non-trainable params: 87566464 (334.04 MB)
_________________________________________________________________


In [11]:
# Training configuration.
base_learning_rate = 0.0005  # small rate so the model isn't changed too much

model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=base_learning_rate),
    # The Dense head applies no activation, so the loss must treat the
    # model outputs as logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [12]:
# Baseline: loss and accuracy on the target-class test subset before fine-tuning.
loss0, accuracy0 = model.evaluate(frogs_test_images, frogs_test_labels)



In [14]:
# Fine-tune on the full training set for one epoch. With the backbone frozen
# above, only the final Dense layer is updated here.
history = model.fit(train_images, train_labels,
                    epochs=1)
                    # TODO: pass validation_data once an evaluation set (eval_batches) exists

# TODO: report the resulting accuracy, e.g. print(acc)

  13/1563 [..............................] - ETA: 56:56 - loss: 2.2788 - accuracy: 0.1490

KeyboardInterrupt: 

In [None]:
# Evaluating after fine-tuning: loss and accuracy on the target-class test subset.
loss1, accuracy1 = model.evaluate(frogs_test_images, frogs_test_labels)

In [None]:
# Shared configuration and objects for the unlearning loop.
placeholder_num = 100.0
unlearning_rate = 0.05


def _in_batches_of_four(array):
  """Wrap `array` in a tf.data pipeline that yields consecutive batches of 4."""
  return tf.data.Dataset.from_tensor_slices(array).batch(batch_size=4)


# Features and labels are held in separate datasets built in the same order,
# so iterating them together keeps each image aligned with its label.
# NOTE(review): these are built from the *test* split, which is also what the
# unlearning loop evaluates on -- confirm this is intended.
target_dataset_features = _in_batches_of_four(frogs_test_images)
target_dataset_labels = _in_batches_of_four(frogs_test_labels)
safe_dataset_features = _in_batches_of_four(safe_test_images)
safe_dataset_labels = _in_batches_of_four(safe_test_labels)

# Optimizer that applies the unlearning updates.
optimizer = tf.keras.optimizers.RMSprop(learning_rate=unlearning_rate)

# Classification loss on raw logits, used for the target-class gradient.
loss_calc = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def get_grad(batch_features,  batch_labels, model):
  """Return the gradients of the classification loss for one batch.

  Runs `model` on `batch_features`, scores the outputs against `batch_labels`
  with the module-level `loss_calc`, and differentiates that loss with
  respect to `model.trainable_variables`.

  Returns:
    The result of `tape.gradient`: one gradient per trainable variable, in
    the same order as `model.trainable_variables`.
  """
  with tf.GradientTape() as recorder:
    logits = model(batch_features)
    batch_loss = loss_calc(batch_labels, logits)

  return recorder.gradient(batch_loss, model.trainable_variables)


# def get_kl_grad(base_model, current_model, normal_batch):
#   with tf.GradientTape() as tape:
#     base_preds = base_model(normal_batch)
#     current_preds = current_model(normal_batch)

#     ## turning into probability distros
#     prob_p = tf.keras.activations.softmax(base_preds, axis=-1)
#     prob_q = tf.keras.activations.softmax(current_preds, axis=-1)

#     kl = tf.keras.losses.KLDivergence()(base_preds, current_preds)


#   grads = tape.gradient(kl, model.trainable_variables)

#   return kl


######################

def unlearn(model, num_steps=20):
  """Run the unlearning loop on `model` and return it.

  Each step combines two gradients over `model.trainable_variables`:

  1. Target loss: the cross-entropy gradient on a target-class batch (from
     `get_grad`), negated so the optimizer *increases* that loss.
  2. KL distance to normal: the gradient of KL(benchmark || current) on a
     safe batch, which pulls predictions on the retained classes back
     towards the benchmark.

  The random-mismatch term and per-term weights are not implemented yet.

  Reads the module-level `target_dataset_features`, `target_dataset_labels`,
  `safe_dataset_features`, `safe_dataset_labels`, `optimizer`,
  `frogs_test_images` and `frogs_test_labels`.

  Args:
    model: Keras model to update in place. It must be compiled, because
      `model.evaluate` is called every 5 steps.
    num_steps: maximum number of optimizer steps to take.

  Returns:
    The same `model` object, after the updates.
  """
  num_steps = max(num_steps, 0)  # a negative count would make take() return everything

  # Benchmark for the KL term: the model's own predictions on the safe batches,
  # captured BEFORE any update. Comparing the live model with itself inside the
  # tape gives a KL that is identically zero with zero gradient. At most
  # `num_steps` batches are visited per pass and the datasets are not shuffled,
  # so these line up one-to-one with the batches used below.
  benchmark_probs = [
      tf.nn.softmax(model(normal_batch_features), axis=-1)
      for normal_batch_features in safe_dataset_features.take(num_steps)
  ]
  # KLDivergence expects probability distributions, hence the softmax calls.
  kl_divergence = tf.keras.losses.KLDivergence()

  index = 0

  while index < num_steps:
    index_at_pass_start = index
    batches = zip(target_dataset_features, target_dataset_labels,
                  safe_dataset_features, safe_dataset_labels,
                  benchmark_probs)
    # Batches of 4, following Yao et al.
    for (bad_batch_features, bad_batch_labels,
         normal_batch_features, normal_batch_labels,
         normal_benchmark_probs) in batches:
      # Enforce the step budget inside the pass too; otherwise a whole pass
      # over the data runs before `num_steps` is checked again.
      if index >= num_steps:
        break

      #### 1. TARGET LOSS ####
      # get_grad returns a list, so negate element-wise (list * -1 is []).
      bad_loss_grads = [-grad for grad in
                        get_grad(bad_batch_features, bad_batch_labels, model)]

      #### 2. RANDOM MISMATCH ####
      # Not implemented yet.

      #### 3. KL DISTANCE TO NORMAL ####
      with tf.GradientTape() as tape:
        current_probs = tf.nn.softmax(model(normal_batch_features), axis=-1)
        kl = kl_divergence(normal_benchmark_probs, current_probs)

      normal_loss_grads = tape.gradient(kl, model.trainable_variables)

      #### CALCULATE FINAL LOSS ####
      # Unweighted sum of the two terms, per variable. Using `+` on the two
      # lists would concatenate them instead of adding the gradients.
      grads = [bad_grad + normal_grad
               for bad_grad, normal_grad in zip(bad_loss_grads, normal_loss_grads)]

      #### BACK PROP ####
      optimizer.apply_gradients(zip(grads, model.trainable_variables))

      #### PRINT STATS AND DO CHECKPOINTING ####
      index += 1
      print(index)
      if index % 5 == 0:
        model.evaluate(frogs_test_images, frogs_test_labels)

    # No progress in a whole pass means the datasets are empty; stop rather
    # than loop forever.
    if index == index_at_pass_start:
      break

  ### SAVE MODEL (not implemented yet)

  return model





MODEL PARAMS SHOULD BE UNFROZEN AT THIS POINT

In [None]:
# Unfreeze the backbone so its weights appear in model.trainable_variables,
# which is what the custom loop in unlearn() differentiates and updates.
# NOTE(review): Keras documents that a change to `trainable` only takes effect
# for compiled training (model.fit) after compile() is called again -- confirm
# whether that matters for this notebook.
base_model.trainable = True
# base_model.trainable_variables

In [None]:
# Run the unlearning loop. unlearn() returns the model object it was given,
# so `lobotomised` and `model` refer to the same updated model.
lobotomised = unlearn(model)

## LLM TESTING

testing model: [OPT-6.7B](https://huggingface.co/facebook/opt-6.7b)

In [None]:
# !pip install Accelerate

In [None]:
# from google.colab import userdata
# import accelerate

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

In [None]:
# model = AutoModelForCausalLM.from_pretrained("facebook/opt-6.7b", torch_dtype=torch.float16, token=userdata.get('hf_key'), low_cpu_mem_usage=True, load_in_4bit=True).cuda()

In [None]:
# tokeniser = AutoTokenizer.from_pretrained("facebook/opt-6.7b", use_fast=False, token=userdata.get('hf_key'), low_cpu_mem_usage=True, padding_side='left')

In [None]:
# prompt = 'Tell then we don\'t want - " Harry could see Uncle Vernon\'s shiny black shoes pacing up and down the kitchen. "No, we\'ll ignore it. If they don\'t get an answer. ... Yes, that\'s b'

In [None]:
# input_ids = tokeniser(prompt, return_tensors="pt").input_ids.cuda()
# generated_ids = model.generate(input_ids, max_new_tokens=100, do_sample=True)
# print(tokeniser.batch_decode(generated_ids, skip_special_tokens=True)[0])

In [None]:
# # prompt = '''“And that’s where
# # ...” Mr. Ollivander
# # touched the
# # lightning scar on
# # Harry’s forehead
# # with a long, white
# # finger. “I’m sorry
# # to say I sold the
# # wand that did it,”
# # he said softly.
# # “Thirteen-and-a-half inc'''

# prompt = 'Among other public buildings in a certain town, which for many reasons it will be prudent to refrain from mentioning, and to which I will assign no fictitious name, there is one anciently common to most towns, great or small: to wit, a workhouse;'

In [None]:
# input_ids = tokeniser(prompt, return_tensors="pt").input_ids.cuda()
# generated_ids = model.generate(input_ids, max_new_tokens=100, do_sample=True)
# print(tokeniser.batch_decode(generated_ids, skip_special_tokens=True)[0])