In [1]:
import os
import sys
import random
import argparse
import pandas as pd

import csv 
import json

import torch
import numpy as np
import torch.nn as nn 

from argparse import Namespace
from utils.dataset import getData
from utils.evaluation import evaluate
from transformers import AutoTokenizer, LlamaForCausalLM
import torch.nn.init as init
from utils.bag_of_words.projection_community import *

def set_random_seed(seed):
    """Apply one seed to every random number generator used in this notebook.

    The seed is handed, in this order, to ``random.seed``, ``np.random.seed``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.

    Args:
        seed: Seed value passed unchanged to each of the four seeding calls.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def l_0_norm(vector):
    """Count the non-zero entries of a two-level nested iterable (its L0 "norm").

    Args:
        vector: Iterable of iterables (for example a list of rows) whose
            elements can be compared with ``!= 0``.

    Returns:
        int: Number of inner elements for which ``element != 0`` is truthy;
        ``0`` for an empty input.
    """
    return sum(1 for row in vector for entry in row if entry != 0)
def take_average(dict):
    """Average the distributions stored under the iteration keys "0".."4".

    For the styles "block" and "channel", every leaf
    ``dict[iteration][style][ratio][dataset][norm]`` of iterations "1".."4" is
    added onto the matching leaf of iteration "0"; when iteration "4" is
    processed the accumulated value is divided by the number of iterations (5).

    The accumulation happens in place: ``dict["0"]`` is modified and returned,
    and its averaged leaves become NumPy arrays.

    Args:
        dict: Nested mapping ``iteration -> style -> ratio -> dataset -> norm ->
            values``. The parameter name shadows the builtin but is kept so
            existing callers are unaffected.

    Returns:
        tuple: ``(data, shape_model)`` where ``data`` is ``dict["0"]`` after
        averaging and ``shape_model`` is the shape of the last visited value
        whose array is not one-dimensional, or ``None`` if there was none.
    """
    runs = dict
    data = runs["0"]
    iteration_keys = ["0", "1", "2", "3", "4"]
    final_key = iteration_keys[-1]
    # Initialised up front: when every value is 1-D the original code reached
    # the return statement with this name unbound and raised UnboundLocalError.
    shape_model = None
    for style in ("block", "channel"):
        # Iteration "0" is the accumulator itself, so start from "1".
        for iteration in iteration_keys[1:]:
            for ratio in runs[iteration][style]:
                for dataset in runs[iteration][style][ratio]:
                    for norm in runs[iteration][style][ratio][dataset]:
                        value = np.array(runs[iteration][style][ratio][dataset][norm])
                        if len(value.shape) != 1:
                            shape_model = value.shape
                        leaf = data[style][ratio][dataset]
                        leaf[norm] = np.array(leaf[norm]) + value
                        if iteration == final_key:
                            # Sum over all iterations is complete: turn it into a mean.
                            leaf[norm] = leaf[norm] / len(iteration_keys)
    return data, shape_model
def strip(name):
    """Reduce a path-like identifier to its leading tag.

    Keeps the text after the last "/" and, of that, only the part before the
    first "_".

    Args:
        name: String such as ``"dir/sub/tag_rest"``.

    Returns:
        str: The leading tag (``"tag"`` for the example above).
    """
    basename = name.rsplit("/", 1)[-1]
    tag, _, _ = basename.partition("_")
    return tag

def loop_over(dict):
    """Print the key structure of a nested mapping, depth first.

    A list is treated as a leaf and reported as ``"end"``; anything else has
    its ``keys()`` printed and each of its values visited recursively.

    Args:
        dict: Nested mapping whose leaves are lists.
    """
    if isinstance(dict, list):
        print("end")
        return
    print(dict.keys())
    for key in dict:
        loop_over(dict[key])
        
def get_dataset_list(dataset_list):
    """Expand a dataset-info mapping into a flat list of dataset names.

    An entry without a ``"subset"`` key contributes its own name; an entry
    with one contributes every element of its ``"subset"`` value instead.

    Args:
        dataset_list: Mapping from dataset name to an info mapping.

    Returns:
        list: Names in mapping order, with subsets expanded in place.
    """
    names = []
    for parent, info in dataset_list.items():
        if "subset" in info.keys():
            names.extend(info["subset"])
        else:
            names.append(parent)
    return names

def find_layers(module, layers=[nn.Linear], name=''):
    """
    Collect, by recursive descent, every sub-module whose exact type is listed.

    Args:
        module (nn.Module): Root module to search.
        layers (list): Layer types to collect; matching uses ``type(module)``,
            so subclasses of a listed type are not matched.
        name (str): Dotted name of ``module``, used as the key prefix.

    Returns:
        dict: Maps the dotted name of each matching module to the module. A
        matching module is returned as-is and its children are not searched.
    """
    if type(module) in layers:
        return {name: module}
    prefix = name + '.' if name != '' else ''
    found = {}
    for child_name, child in module.named_children():
        found.update(find_layers(child, layers=layers, name=prefix + child_name))
    return found

def create_distribution_llm_pruner(model):
    """Collect the infinity norm of every linear weight matrix, layer by layer.

    The previous version also accumulated a zero-weight count and a parameter
    total that were never used; that dead work (a full scan of every weight
    tensor plus an ``.item()`` call per module) has been removed.

    Args:
        model: Object exposing ``model.model.layers``, an iterable of modules
            (presumably a LLaMA-style causal LM; confirm with callers).

    Returns:
        np.ndarray: One row per entry of ``model.model.layers``; each row holds
        ``torch.linalg.matrix_norm(W, ord=inf)`` for the weight of each
        ``nn.Linear`` returned by ``find_layers``, in that order.
    """
    distribution = []
    for layer in model.model.layers:
        linear_modules = find_layers(layer)
        distribution.append([
            torch.linalg.matrix_norm(linear.weight.data, ord=float("Inf")).item()  # |W|_inf norm
            for linear in linear_modules.values()
        ])
    return np.array(distribution)


def get_model(model_name):
    """Load the causal-LM weights and tokenizer for a supported model alias.

    Args:
        model_name: One of ``"llama"``, ``"llama_chat"`` or ``"vicuna"``.

    Returns:
        tuple: ``(model, tokenizer)`` as produced by
        ``LlamaForCausalLM.from_pretrained`` and
        ``AutoTokenizer.from_pretrained`` for the matching checkpoint.

    Raises:
        ValueError: If ``model_name`` is not a known alias. Previously this
            surfaced as an UnboundLocalError on ``base_model``.
    """
    checkpoints = {
        "llama": "meta-llama/Llama-2-7b-hf",
        "llama_chat": "meta-llama/Llama-2-7b-chat-hf",
        "vicuna": "lmsys/vicuna-7b-v1.5",
    }
    if model_name not in checkpoints:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(checkpoints)}"
        )
    base_model = checkpoints[model_name]
    # Release cached CUDA memory before loading a new checkpoint.
    torch.cuda.empty_cache()
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        low_cpu_mem_usage=True,
    )
    return model, tokenizer
def create_distribution_llm_pruner(model):
    """Collect the infinity norm of every linear weight matrix, layer by layer.

    Note: a function with this same name is already defined earlier in this
    cell; this later definition is the one that stays bound. Consider keeping
    only one of them.

    The previous version also accumulated a zero-weight count and a parameter
    total that were never used; that dead work (a full scan of every weight
    tensor plus an ``.item()`` call per module) has been removed.

    Args:
        model: Object exposing ``model.model.layers``, an iterable of modules
            (presumably a LLaMA-style causal LM; confirm with callers).

    Returns:
        np.ndarray: One row per entry of ``model.model.layers``; each row holds
        ``torch.linalg.matrix_norm(W, ord=inf)`` for the weight of each
        ``nn.Linear`` returned by ``find_layers``, in that order.
    """
    distribution = []
    for layer in model.model.layers:
        linear_modules = find_layers(layer)
        distribution.append([
            torch.linalg.matrix_norm(linear.weight.data, ord=float("Inf")).item()  # |W|_inf norm
            for linear in linear_modules.values()
        ])
    return np.array(distribution)

def randomize_model(model, modules_list, alpha_scale=0.01):
    """Add Gaussian noise, in place, to selected projection modules of a model.

    Each entry of ``modules_list`` is split on "_" into a layer index and a
    module tag. Inside ``model.model.layers[index]`` every grandchild whose
    dotted name ``"<child>.<grandchild>"`` equals ``"<tag>_proj"`` or
    ``"self_<tag>_proj"`` is selected (so a tag like ``"attn.q"`` would select
    ``self_attn.q_proj`` if the layer has such a module).

    Args:
        model: Object exposing ``model.model.layers``.
        modules_list: Iterable of ``"<layer index>_<module tag>"`` strings.
        alpha_scale: Multiplier applied to the standard-normal noise.

    Returns:
        The same ``model`` object, with the selected trainable parameters
        perturbed by ``torch.randn_like(param) * alpha_scale``.
    """
    # Parse every entry first so a malformed one fails before any weight changes.
    targets = []
    for entry in modules_list:
        pieces = entry.split("_")
        targets.append((int(pieces[0]), pieces[1]))

    decoder_layers = model.model.layers
    for layer_index, module_tag in targets:
        wanted = (f"{module_tag}_proj", f"self_{module_tag}_proj")
        for block_name, block in decoder_layers[layer_index].named_children():
            for proj_name, proj in block.named_children():
                if f"{block_name}.{proj_name}" not in wanted:
                    continue
                for param in proj.parameters():
                    # Only trainable parameters are perturbed.
                    if not param.requires_grad:
                        continue
                    param.data += torch.randn_like(param) * alpha_scale
    return model

def flatten_comprehension(matrix):
    """Concatenate the rows of a nested iterable into one flat list.

    Args:
        matrix: Iterable of iterables.

    Returns:
        list: All inner items, in row order.
    """
    flattened = []
    for row in matrix:
        flattened.extend(row)
    return flattened


def get_all_dataset_list(dataset_info_list, dataset_list):
    """Resolve dataset names into the form used by the dataset-info mapping.

    For every requested name, each info entry is checked: an entry without a
    ``"subset"`` key matches when its own name equals the requested one and
    contributes that name; an entry with ``"subset"`` matches when the
    requested name is one of its subsets and contributes ``[parent, name]``.
    Names that match nothing contribute nothing, so the result can be shorter
    than ``dataset_list``.

    Args:
        dataset_info_list: Mapping from dataset name to an info mapping.
        dataset_list: Iterable of dataset or subset names to resolve.

    Returns:
        list: Resolved entries in request order (strings or two-item lists).
    """
    resolved = []
    for wanted in dataset_list:
        for parent in dataset_info_list:
            info = dataset_info_list[parent]
            if "subset" in info.keys():
                if wanted in info["subset"]:
                    resolved.append([parent, wanted])
            elif parent == wanted:
                resolved.append(parent)
    return resolved

def get_high_low_datasets(community, top_skill=50):
    """Return the first and last ``top_skill`` entries of a community's dataset list.

    Args:
        community: Mapping with the list at ``community["dataset"]["all"]``
            (presumably ordered by rank; confirm with the producer).
        top_skill: Number of entries to take from each end.

    Returns:
        tuple: ``(head, tail)`` slices of the list. When the list is shorter
        than ``top_skill`` both slices are the whole list.
    """
    ranked = community["dataset"]["all"]
    head = ranked[:top_skill]
    # ``ranked[-0:]`` is the whole list, not an empty tail, so zero needs its
    # own branch to stay symmetric with the (empty) head slice.
    tail = ranked[-top_skill:] if top_skill != 0 else ranked[:0]
    return head, tail

def pick_largest_community(community_data_lists):
    """Split communities into the one with the most datasets and all the others.

    Size is ``len(community["dataset"])``; on a tie the community encountered
    first wins because the comparison is strict.

    Args:
        community_data_lists: Mapping from community name to a mapping that
            has a ``"dataset"`` entry.

    Returns:
        tuple: ``((name, size), datasets, other_datasets)`` where ``datasets``
        is the winning community's ``"dataset"`` value and ``other_datasets``
        is the flattened concatenation of every other community's
        ``"dataset"`` value.
    """
    best_name, best_size = None, -1
    for name, info in community_data_lists.items():
        n_datasets = len(info["dataset"])
        if n_datasets > best_size:
            best_name, best_size = name, n_datasets

    remaining = [
        community_data_lists[name]["dataset"]
        for name in community_data_lists
        if name != best_name
    ]
    largest_datasets = community_data_lists[best_name]["dataset"]
    return (best_name, best_size), largest_datasets, flatten_comprehension(remaining)

    



def get_modulesCommunityDataset(sparsity_ratio):
    """Load the saved distributions and build the module/community projection.

    Reads the dataset-info file, the original and pruned distribution files of
    the three models (llama, llama-chat, vicuna) and the dataset-category
    file, averages the pruned distributions with ``take_average`` and passes
    everything to ``create_projection_network``.

    Args:
        sparsity_ratio: Forwarded unchanged as ``sparsity_ratio`` to
            ``create_projection_network``.

    Returns:
        tuple: ``(modules_community_dataset, dataset_info_list, dataset_list)``:
        the value returned by ``create_projection_network``, the parsed
        ``dataset_info.json``, and the flat name list from ``get_dataset_list``.
    """
    def _read_json(path):
        # Parse one JSON file, closing the handle before returning.
        with open(path, 'r') as handle:
            return json.load(handle)

    dataset_info_list = _read_json("./dataset_info.json")
    dataset_list = get_dataset_list(dataset_info_list)

    # Original distributions
    vicuna_original = _read_json("result/original_distribution_vicuna_7b.json")
    llama_original = _read_json("result/original_distribution_llama_7b.json")
    llama_chat_original = _read_json("result/original_distribution_llama_7b-chat.json")

    # Pruned distributions (one entry per iteration; averaged below)
    llama_runs = _read_json("result/distribution_llama_7b.json")
    vicuna_runs = _read_json("result/distribution_vicuna_7b.json")
    llama_chat_runs = _read_json("result/distribution_llama_7b-chat.json")

    dataCategory = _read_json("result/dataMultidisciplinaryCognitiveSkillsFrameworkRestrict.json")

    llama_distribution, _ = take_average(llama_runs)
    vicuna_distribution, _ = take_average(vicuna_runs)
    llama_chat_distribution, _ = take_average(llama_chat_runs)

    # Both lists use the model order llama, llama_chat, vicuna.
    modules_community_dataset = create_projection_network(
        dataCategory,
        dataset_list,
        [llama_distribution, llama_chat_distribution, vicuna_distribution],
        [llama_original, llama_chat_original, vicuna_original],
        sparsity_ratio=sparsity_ratio,
        random_seed=True,
    )
    return modules_community_dataset, dataset_info_list, dataset_list
    

In [2]:
def get_original_model(model_name):
    """Load the saved original-distribution JSON for a supported model alias.

    Args:
        model_name: One of ``"llama"``, ``"llama_chat"`` or ``"vicuna"``.

    Returns:
        The parsed content of the matching
        ``result/original_distribution_*.json`` file.

    Raises:
        ValueError: If ``model_name`` is not a known alias. Previously this
            surfaced as an UnboundLocalError on ``org``.
    """
    distribution_files = {
        "llama": "result/original_distribution_llama_7b.json",
        "llama_chat": "result/original_distribution_llama_7b-chat.json",
        "vicuna": "result/original_distribution_vicuna_7b.json",
    }
    if model_name not in distribution_files:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(distribution_files)}"
        )
    with open(distribution_files[model_name], 'r') as openfile:
        return json.load(openfile)


# Model

In [None]:
# Rebuild the results table for run `i` from an already saved CSV. No model is
# evaluated in this cell: for every (model, alpha_scale, community, dataset)
# combination the accuracy is looked up in `saved_file` and copied into `data`.
i = 0
saved_file = pd.read_csv(f"./result/randomize_accuracy/randomize_data_new_{str(i)}.csv", index_col=0)
set_random_seed(int(i))
sparsity_ratio = "20"
# Column-oriented accumulator: one list per output column. All lists are
# appended to together so that pd.DataFrame(data) gets equal-length columns.
data = {"iteration":[],"model":[],"pruning_style":[],"community":[],"pruning_ratio":[],"dataset":[],"accuracy":[],"rank":[],"modules":[],"alpha_scale":[]}
modules_community_dataset,dataset_info_list, dataset_list = get_modulesCommunityDataset(sparsity_ratio)
#"pruner_style","model","sparsity_ratio","community"
# Counts the rows appended to `data` (printed at the end of the cell).
iteration = 0 
for idx, model_name in enumerate(modules_community_dataset["model"]):
    for alpha_scale in [0.1,0.075,0.05,0.025,0.01]:
        print(idx, model_name,alpha_scale, modules_community_dataset["pruner_style"][idx])
        community_data_lists = modules_community_dataset["community"][idx]
        # "-1" is a sentinel entry standing for the unmodified model (see the
        # "Original Model" branch below).
        # NOTE(review): this writes into the dict held by
        # modules_community_dataset itself, not into a copy.
        community_data_lists["-1"] = None
        # 50 dataset names are drawn from the global `random` generator; a new
        # draw happens for every (model, alpha_scale) pair.
        module_dataset = random.sample(dataset_list, 50)
        module_dataset_info_format = get_all_dataset_list(dataset_info_list, module_dataset)
        print("Random Dataset",module_dataset, flush=True)
        for comm_name, community in community_data_lists.items():
            print("Community Name:",comm_name)
            if comm_name == "-1":
                module_list = []
                print("Community Rank: Original Model", flush=True)
            else:
                #if not KL then no need for "all"
                print("Community Rank",modules_community_dataset["community"][idx][comm_name]["dataset"], flush=True)
                module_list =  community["modules"]
            #tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
            # NOTE(review): module_accuracy is never appended to, so the
            # "Module Accuracy" print below always shows an empty list.
            module_accuracy = []
            # NOTE(review): `dataset_name` (the info-format entry) is not used in
            # the loop body; zip also truncates silently if
            # get_all_dataset_list returned fewer entries than were sampled.
            for dataset_name_label,dataset_name in zip(module_dataset, module_dataset_info_format):
                if comm_name == "-1":
                    rank = None
                else:
                    #if not KL then no need for "all"
                    # Position of the dataset inside this community's dataset
                    # list; list.index raises ValueError if it is absent.
                    rank = modules_community_dataset["community"][idx][comm_name]["dataset"].index(dataset_name_label)
                # Accuracy values previously saved for exactly this
                # (pruning_style, model, community, dataset, alpha_scale) row.
                row_from_saved_file = saved_file[(saved_file["pruning_style"]==modules_community_dataset["pruner_style"][idx]) & (saved_file["model"]==model_name) & (saved_file["community"]==comm_name)& (saved_file["dataset"]==dataset_name_label)& (saved_file["alpha_scale"]==alpha_scale)]["accuracy"].tolist()
                # A row is kept only when exactly one saved value matches and the
                # community is not the "-1" sentinel. Otherwise the remaining
                # datasets of this community are skipped entirely, so the "-1"
                # community never contributes rows.
                if len(row_from_saved_file) == 1 and comm_name != "-1":
                    data["accuracy"].append(row_from_saved_file[0])
                else:
                    break
                data["iteration"].append(int(i))
                data["model"].append(model_name)
                data["pruning_style"].append(modules_community_dataset["pruner_style"][idx])
                data["community"].append(comm_name)
                data["pruning_ratio"].append(sparsity_ratio)
                data["dataset"].append(dataset_name_label)
                data["rank"].append(rank)
                data["modules"].append(module_list)
                data["alpha_scale"].append(alpha_scale)
                
                #print(data["accuracy"])
                iteration += 1
            print(module_list)
            # NOTE(review): `rank` here is whatever the last executed dataset
            # iteration assigned; if the loop above ran zero iterations it is
            # left over from an earlier community (or undefined).
            print("Module Accuracy",comm_name,rank,module_accuracy, flush=True)
        print("++"*100)
        # `data` keeps growing across alpha values and models, so each preview
        # below covers everything collected so far.
        df = pd.DataFrame(data)
        print(df.head())
        print("++"*100)
        #df.to_csv(f'./result/randomize_accuracy/randomize_data_new_{i}.csv') 
#df = pd.DataFrame(data)
#df.to_csv(f'./result/randomize_accuracy/randomize_data_new_{i}.csv') 
print(iteration)

In [None]:
# Position of each (model, pruning_style) combination inside
# modules_community_dataset["community"]["kl"].
# NOTE(review): the ordering (block: llama, llama_chat, vicuna; then channel in
# the same model order) is carried over from the previous if/elif chain;
# confirm it matches what create_projection_network returns.
MODEL_STYLE_TO_IDX = {
    ("llama", "block"): 0,
    ("llama_chat", "block"): 1,
    ("vicuna", "block"): 2,
    ("llama", "channel"): 3,
    ("llama_chat", "channel"): 4,
    ("vicuna", "channel"): 5,
}

# For each of the five saved runs: copy the saved rows, rename the stored
# "rank" column to "rank_network" and add "rank_kl", the position of the
# dataset in the KL community's "all" list.
# NOTE(review): the result is written back over the input file and contains
# only the columns in `key_list`; any other input column is dropped.
for i in range(5):
    set_random_seed(int(i))
    sparsity_ratio = "20"
    saved_file = pd.read_csv(f"./result/randomize_accuracy/randomize_data_new_{str(i)}.csv", index_col=0)
    key_list = ['iteration', 'model', 'pruning_style', 'community', 'pruning_ratio', 'dataset', 'accuracy', 'rank_kl', "rank_network", 'modules']
    key_list_wo_rank = ['iteration', 'model', 'pruning_style', 'community', 'pruning_ratio', 'dataset', 'accuracy', 'modules']

    # Re-run guard: once a file has been rewritten it no longer has a "rank"
    # column, and processing it again used to fail with KeyError on
    # row["rank"]. Files that are already in the output format are skipped.
    already_converted = (
        "rank" not in saved_file.columns
        and {"rank_kl", "rank_network"} <= set(saved_file.columns)
    )
    if already_converted:
        print(f"randomize_data_new_{i}.csv already has rank_kl/rank_network columns; skipping")
        continue

    data = {key: [] for key in key_list}
    modules_community_dataset, dataset_info_list, dataset_list = get_modulesCommunityDataset(sparsity_ratio)
    for index, row in saved_file.iterrows():
        # Resolve the community index first so an unexpected combination fails
        # loudly instead of silently reusing the previous row's index.
        model_name = row["model"]
        combination = (model_name, row["pruning_style"])
        if combination not in MODEL_STYLE_TO_IDX:
            raise ValueError(
                f"Unexpected (model, pruning_style) {combination!r} in row {index!r} "
                f"of randomize_data_new_{i}.csv"
            )
        idx = MODEL_STYLE_TO_IDX[combination]

        for key in key_list_wo_rank:
            data[key].append(row[key])

        dataset_name_label = row["dataset"]
        comm_name = row["community"]
        # NOTE(review): -1 is compared as an integer here; this relies on
        # pandas having parsed the "community" column as numeric. Confirm.
        if comm_name == -1:
            data["rank_kl"].append(None)
            data["rank_network"].append(None)
        else:
            data["rank_network"].append(row["rank"])
            data["rank_kl"].append(
                modules_community_dataset["community"]["kl"][idx][comm_name]["dataset"]["all"].index(dataset_name_label)
            )
    df = pd.DataFrame(data)
    print(df.head())
    df.to_csv(f'./result/randomize_accuracy/randomize_data_new_{str(i)}.csv')


In [8]:
import pandas as pd

# Merge the KL results of run 0 with their random-baseline counterpart, sort the
# combined table and save it under ./result/randomize_accuracy/.
# NOTE(review): the name `all` shadows Python's builtin all() for every cell run
# after this one, and both inputs are read from an absolute, user-specific
# path while the output goes to a path relative to the working directory.
all = pd.read_csv("/home/bhandk/MLNeuron/result/randomize_accuracy/randomize_data_new_kl_0.csv", index_col=0)
rand = pd.read_csv("/home/bhandk/MLNeuron/result/randomize_accuracy/randomize_data_new_kl_random_0.csv", index_col=0)

# Missing "finetune" entries become the literal string "None" in both tables.
for frame in (all, rand):
    frame["finetune"] = frame["finetune"].fillna("None")

new_rand = (
    pd.concat([all, rand], axis=0)
    .sort_values(["model", "pruning_style", "community", "rank"])
)

new_rand.to_csv('./result/randomize_accuracy/randomize_data_new_kl_0.csv', index=False)

In [None]:
# Inspect the projection built with seed 2: print the whole structure, then the
# module list of every community under ["community"]["kl"][0].
i = 2
set_random_seed(int(i))
sparsity_ratio = "20"
key_list = [
    'iteration', 'model', 'pruning_style', 'community', 'pruning_ratio',
    'dataset', 'accuracy', 'rank_kl', "rank_network", 'modules',
]
# Same columns as key_list without the two rank columns.
key_list_wo_rank = [key for key in key_list if key not in ("rank_kl", "rank_network")]
data = {key: [] for key in key_list}
modules_community_dataset, dataset_info_list, dataset_list = get_modulesCommunityDataset(sparsity_ratio)
print(modules_community_dataset)

kl_first = modules_community_dataset["community"]["kl"][0]
for comm in kl_first:
    print(comm, ":")
    print("\t", kl_first[comm]["modules"])