In [5]:

import copy
import os
import re
import sys

sys.path.append(os.path.dirname(os.getcwd()))

import torch

from utils.model_utils.load_model import load_model
from utils.helper import ModelConfig
from utils.dataset_utils.load_dataset import load_data

In [6]:
# Experiment configuration consumed by the cells below.
# `architectures` and `num_labels` are not read anywhere in the visible notebook.
dataset_name, num_labels = "Go", 6
model_name = "Salesforce/codet5-base-multi-sum"
task_type, architectures = "seq2seq", "T5"

In [7]:
# Pick the compute device. The second GPU (index 1) is preferred when it
# exists; on a single-GPU machine fall back to GPU 0 instead of failing later
# with an invalid device ordinal, and use the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

In [8]:
# Bundle the experiment settings into the project's ModelConfig object.
# `checkpoint` is passed as None here; it is rebound by the load_model cell.
checkpoint = None
model_config = ModelConfig(
    model_name=model_name,
    task_type=task_type,
    dataset_name=dataset_name,
    checkpoint=checkpoint,
    device=device,
)

In [9]:
# Load the model and tokenizer for `model_config`; the third return value
# rebinds `checkpoint` (presumably the resolved checkpoint — verify in load_model).
model, tokenizer, checkpoint = load_model(model_config)

Directory /home/Minwoo/LESN/Decompose/DecomposeTransformer/Models/Configs/seq2seq/Salesforce/codet5-base-multi-sum exists.
Loading the model.
The model Salesforce/codet5-base-multi-sum is loaded.


In [11]:
# Build the train/validation/test dataloaders for the configured dataset,
# requesting batches of 32 examples.
train_dataloader, valid_dataloader, test_dataloader = load_data(
        model_config, batch_size=32
)

Loading the dataset Go
Load cached dataset.
The dataset Go is loaded


In [28]:
# Sanity check on the first training batch only: print the first source
# snippet, its reference summary, and the summary the model generates for it.
for idx, batch in enumerate(train_dataloader):
    input_ids, labels = (batch[key].to(device) for key in ("input_ids", "labels"))
    print(tokenizer.decode(input_ids[0], skip_special_tokens=True))
    print(tokenizer.decode(labels[0], skip_special_tokens=True))

    # Slice instead of index+unsqueeze: keeps the leading batch dimension of 1.
    outputs = model.generate(input_ids[:1])
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(decoded_output)
    break

func (s *CreateKeysAndCertificateInput) SetSetAsActive(v bool) *CreateKeysAndCertificateInput {
	s.SetAsActive = &v
	return s
}
// SetSetAsActive sets the SetAsActive field's value.
SetSetAsActive sets the SetAsActive field s value.


In [29]:
# Display the module tree of the loaded model.
model

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
import torch
from scipy.stats import norm


class WeightRemover:
    """Zero out near-mean weights of every ``torch.nn.Linear`` layer of a model.

    Usage is two-step: ``process`` runs one forward pass while forward hooks
    record which Linear layers were executed, then ``apply_removal`` prunes
    each recorded layer in place and prints per-layer statistics.

    Args:
        model: Module whose forward accepts ``input_ids`` and
            ``decoder_input_ids`` keyword arguments. It is moved to ``device``.
        device: Device the model and the inputs are placed on.
        p: A layer is pruned only while its fraction of non-zero weights is
            strictly greater than ``p``.
    """

    def __init__(self, model, device="cuda:0", p=0.8):
        self.model = model.to(device)
        self.device = device
        self.p = p
        self.results = {"layer": [], "input": [], "output": []}

    def hook(self, layer, input, output):
        """Forward hook: record the layer and CPU copies of its input/output."""
        self.results["layer"].append(layer)
        # `input` is the tuple of positional arguments, so input[0] is the
        # whole input tensor. `output` of a Linear layer is itself a tensor,
        # so output[0] is its first slice along dim 0.
        # NOTE(review): confirm that keeping only that first slice is intended.
        # detach() keeps the stored copies from retaining an autograd graph
        # when the caller forgets torch.no_grad().
        self.results["input"].append(input[0].detach().to('cpu'))
        self.results["output"].append(output[0].detach().to('cpu'))

    def register_hooks(self):
        """Attach ``hook`` to every Linear submodule and return the handles."""
        return [
            layer.register_forward_hook(self.hook)
            for layer in self.model.modules()
            if isinstance(layer, torch.nn.Linear)
        ]

    def remove_hooks(self, handle_list):
        """Detach every hook in ``handle_list``."""
        for handle in handle_list:
            handle.remove()

    def remove_weights(self, layer):
        """Zero the weights of ``layer`` whose row-wise z-score is close to 0.

        A weight is removed when its z-score (computed per output row) lies in
        ``[norm.ppf(0.45), norm.ppf(0.55))``. The bias of an output row is
        zeroed only when every weight of that row is zero after pruning.
        """
        with torch.no_grad():
            current_weight = layer.weight.detach().clone()
            current_bias = (
                layer.bias.detach().clone() if layer.bias is not None else None
            )

            mean = torch.mean(current_weight, dim=1, keepdim=True)
            std = torch.std(current_weight, dim=1, keepdim=True)
            # A constant row has std == 0, giving NaN z-scores; NaN compares
            # False below, so such rows are left untouched.
            z_scores = (current_weight - mean) / std

            lower_z, upper_z = float(norm.ppf(0.45)), float(norm.ppf(0.55))
            mask = torch.logical_and(z_scores >= lower_z, z_scores < upper_z)
            current_weight[mask] = 0

            if current_bias is not None:
                # Previously `~mask.any(dim=1)` was used, which wiped the bias
                # of every row in which *nothing* had been pruned. A bias is
                # only dead once its whole weight row is zero.
                all_zeros = (current_weight == 0).all(dim=1)
                current_bias[all_zeros] = 0

        self.set_parameters(layer, current_weight, current_bias)

    def set_parameters(self, layer, weight, bias):
        """Overwrite the layer's weight (and bias, when given) in place."""
        layer.weight.data = weight
        if bias is not None:
            layer.bias.data = bias

    def process(self, input_tensor, decoder_input_ids):
        """Run one hooked forward pass and return the model output.

        The recorded results are reset first. Hooks are always removed, even
        when the forward pass raises, so a failed call cannot leave stale
        hooks attached to the model.
        """
        self.results = {"layer": [], "input": [], "output": []}
        handle_list = self.register_hooks()
        try:
            output = self.model(
                input_ids=input_tensor.to(self.device),
                decoder_input_ids=decoder_input_ids.to(self.device),
            )
        finally:
            self.remove_hooks(handle_list)
        return output

    def apply_removal(self):
        """Prune every layer recorded by the last ``process`` call and print stats."""
        total_original_weights = 0
        total_remaining_weights = 0

        for idx, layer in enumerate(self.results["layer"]):
            current_weight = layer.weight
            original_non_zero_weights = torch.sum(current_weight != 0).item()
            total_original_weights += original_non_zero_weights

            # Skip layers that are already sparser than the threshold.
            if original_non_zero_weights > torch.numel(current_weight) * self.p:
                self.remove_weights(layer)

            remaining_non_zero_weights = torch.sum(layer.weight != 0).item()
            total_remaining_weights += remaining_non_zero_weights

            print(f"Layer {idx} - Original non-zero weights: {original_non_zero_weights}, Remaining non-zero weights: {remaining_non_zero_weights}, Reduction: {original_non_zero_weights - remaining_non_zero_weights}")

        print(f"Total original non-zero weights: {total_original_weights}")
        print(f"Total remaining non-zero weights: {total_remaining_weights}")
        print(f"Total reduction: {total_original_weights - total_remaining_weights}")


In [None]:
class ConcernIdentification:
    """Prune the Linear layers of ``model`` by comparing it with ``ref_model``.

    ``process`` runs the same inputs through both models while forward hooks
    record every executed ``torch.nn.Linear`` layer and its output.
    ``apply_prune`` then walks the recorded layers pairwise (by position) and
    zeroes selected weights of ``model`` in place.

    Args:
        ref_model: Reference model; it is only read, never modified.
        model: Model that is pruned in place.
        device: Device both models and the inputs are placed on.
        p: A layer is pruned only while its fraction of non-zero weights is
            strictly greater than ``p``.
    """

    def __init__(self, ref_model, model, device='cuda:0', p=0.7):
        self.ref_model = ref_model.to(device)
        self.model = model.to(device)
        self.device = device
        self.p = p
        self.original_results = {"layer": [], "input": [], "output": []}
        self.current_results = {"layer": [], "input": [], "output": []}

    @staticmethod
    def _record(results, layer, input, output):
        """Append the layer and detached CPU copies of its input/output."""
        results["layer"].append(layer)
        # input[0] is the whole input tensor (input is the args tuple), while
        # output[0] is the first slice along dim 0 of the Linear output tensor.
        # NOTE(review): confirm that keeping only that first slice is intended.
        results["input"].append(input[0].detach().to('cpu'))
        results["output"].append(output[0].detach().to('cpu'))

    def original_hook(self, layer, input, output):
        """Forward hook used on ``ref_model``."""
        self._record(self.original_results, layer, input, output)

    def current_hook(self, layer, input, output):
        """Forward hook used on ``model``."""
        self._record(self.current_results, layer, input, output)

    def register_hooks(self, model, hook):
        """Attach ``hook`` to every Linear submodule of ``model``; return the handles."""
        return [
            layer.register_forward_hook(hook)
            for layer in model.modules()
            if isinstance(layer, torch.nn.Linear)
        ]

    def remove_hooks(self, handle_list):
        """Detach every hook in ``handle_list``."""
        for handle in handle_list:
            handle.remove()

    def prune(self, ref_model, model, original_output, output):
        """Zero selected weights of one Linear layer.

        Despite their names, ``ref_model`` and ``model`` are single layers
        (the reference layer and the layer being pruned); ``original_output``
        and ``output`` are their recorded outputs on the layer's device.

        Output features whose value is greater than the reference at every
        recorded position have weights removed from their negative side; all
        other features have weights removed from their positive side. Which
        z-score band is removed depends on whether the mean of that side lies
        below its median. The bias of a row is zeroed only when every weight
        of that row is zero after pruning.
        """
        with torch.no_grad():
            current_weight = model.weight.detach().clone()
            current_bias = (
                model.bias.detach().clone() if model.bias is not None else None
            )
            original_weight = ref_model.weight.detach()
            shape = current_weight.shape

            output_loss = output - original_output
            if len(output_loss.shape) > len(shape):
                # 3-D outputs: keep only index 0 along the second dimension.
                output_loss = output_loss[:, 0, :]

            positive_loss_mask = (
                torch.all(output_loss > 0, dim=0).unsqueeze(1).expand(-1, shape[1])
            )

            original_weight_std = safe_std(original_weight, dim=1, keepdim=True)
            current_weight_std = safe_std(
                current_weight,
                epsilon=original_weight_std,
                unbiased=True,
                dim=1,
                keepdim=True,
            )

            # Replace the other sign with NaN so the nan-aware reductions only
            # see positive (resp. negative) weights.
            nan_fill = current_weight.new_tensor(float("nan"))
            padded_positive = torch.where(current_weight > 0, current_weight, nan_fill)
            padded_negative = torch.where(current_weight < 0, current_weight, nan_fill)
            positive_mean = torch.nanmean(padded_positive, dim=1, keepdim=True)
            negative_mean = torch.nanmean(padded_negative, dim=1, keepdim=True)

            # NOTE(review): both spreads are computed from the full
            # `current_weight`, not from padded_positive / padded_negative, so
            # they are identical. Confirm whether per-sign spreads were meant.
            positive_std = safe_std(
                current_weight,
                epsilon=current_weight_std,
                unbiased=True,
                dim=1,
                keepdim=True,
            )
            negative_std = safe_std(
                current_weight,
                epsilon=current_weight_std,
                unbiased=True,
                dim=1,
                keepdim=True,
            )

            # NaN entries stay NaN here and compare False in every mask below.
            positive_scores = (padded_positive - positive_mean) / positive_std
            negative_scores = (padded_negative - negative_mean) / negative_std

            positive_median = torch.nanmedian(padded_positive, dim=1, keepdim=True)
            negative_median = torch.nanmedian(padded_negative, dim=1, keepdim=True)
            lower_z, upper_z = float(norm.ppf(0.1)), float(norm.ppf(0.3))

            positive_remove_mask = torch.where(
                positive_mean < positive_median.values,
                positive_scores <= lower_z,
                torch.logical_and(positive_scores >= lower_z, positive_scores < upper_z),
            )

            negative_remove_mask = torch.where(
                negative_mean < negative_median.values,
                torch.logical_and(negative_scores < -lower_z, negative_scores >= -upper_z),
                negative_scores >= -upper_z,
            )

            remove_mask = torch.where(
                ~positive_loss_mask, positive_remove_mask, negative_remove_mask
            )

            current_weight[remove_mask] = 0

            if current_bias is not None:
                # Previously `~remove_mask.any(dim=1)` was used, which wiped
                # the bias of every row in which *nothing* had been pruned.
                all_zeros = (current_weight == 0).all(dim=1)
                current_bias[all_zeros] = 0

        self.set_parameters(model, current_weight, current_bias)

    def set_parameters(self, layer, weight, bias):
        """Overwrite the layer's weight (and bias, when given) in place."""
        layer.weight.data = weight
        if bias is not None:
            layer.bias.data = bias

    def _hooked_forward(self, model, hook, input_tensor, decoder_input_ids):
        """Run one forward pass with ``hook`` attached; always detach the hooks."""
        handle_list = self.register_hooks(model, hook)
        try:
            model(
                input_ids=input_tensor.to(self.device),
                decoder_input_ids=decoder_input_ids.to(self.device),
            )
        finally:
            self.remove_hooks(handle_list)

    def process(self, input_tensor, decoder_input_ids):
        """Record Linear-layer activations of both models for one input.

        Resets the recorded results, then runs ``model`` followed by
        ``ref_model``. Returns ``None``.
        """
        self.original_results = {"layer": [], "input": [], "output": []}
        self.current_results = {"layer": [], "input": [], "output": []}

        self._hooked_forward(self.model, self.current_hook, input_tensor, decoder_input_ids)
        self._hooked_forward(self.ref_model, self.original_hook, input_tensor, decoder_input_ids)

    def apply_prune(self):
        """Prune every layer recorded by the last ``process`` call and print stats."""
        total_original_weights = 0
        total_remaining_weights = 0

        for idx, layer in enumerate(self.current_results["layer"]):
            current_weight = layer.weight
            original_non_zero_weights = torch.sum(current_weight != 0).item()
            total_original_weights += original_non_zero_weights

            # Skip layers that are already sparser than the threshold.
            if original_non_zero_weights > torch.numel(current_weight) * self.p:
                # Recorded outputs live on the CPU; move copies to the device
                # only for the duration of the pruning step.
                self.prune(
                    self.original_results["layer"][idx],
                    layer,
                    self.original_results["output"][idx].to(self.device),
                    self.current_results["output"][idx].to(self.device),
                )

            remaining_non_zero_weights = torch.sum(layer.weight != 0).item()
            total_remaining_weights += remaining_non_zero_weights

            print(f"Layer {idx} - Original non-zero weights: {original_non_zero_weights}, Remaining non-zero weights: {remaining_non_zero_weights}, Reduction: {original_non_zero_weights - remaining_non_zero_weights}")

        print(f"Total original non-zero weights: {total_original_weights}")
        print(f"Total remaining non-zero weights: {total_remaining_weights}")
        print(f"Total reduction: {total_original_weights - total_remaining_weights}")


def safe_std(tensor, epsilon=None, unbiased=False, dim=None, keepdim=True):
    """NaN-aware standard deviation with a fallback for empty tensors.

    Args:
        tensor: Input tensor.
        epsilon: Value returned (as a tensor) when ``tensor`` has no elements.
            May be a Python number or a tensor.
        unbiased, dim, keepdim: Forwarded to ``nanstd``.

    Returns:
        ``nanstd(tensor, ...)`` for a non-empty tensor; otherwise a detached
        copy of ``epsilon`` with the dtype and device of ``tensor``.

    Raises:
        ValueError: If ``tensor`` is empty and no ``epsilon`` was supplied.
    """
    if tensor.numel():
        return nanstd(tensor, dim=dim, unbiased=unbiased, keepdim=keepdim)
    if epsilon is None:
        raise ValueError("safe_std needs `epsilon` when `tensor` is empty")
    # as_tensor accepts both numbers and tensors (torch.tensor() warns on
    # tensor input); clone so the caller's epsilon is never aliased.
    fallback = torch.as_tensor(epsilon, dtype=tensor.dtype, device=tensor.device)
    return fallback.detach().clone()


def nanstd(tensor, unbiased=False, dim=None, keepdim=True):
    """Standard deviation that ignores NaN entries.

    Args:
        tensor: Input tensor, possibly containing NaN.
        unbiased: When True divide by ``n - 1`` (falling back to ``n`` for
            groups with a single observation); when False divide by ``n``.
        dim: Dimension or dimensions to reduce; ``None`` reduces all of them.
        keepdim: Keep the reduced dimensions with size 1.

    Returns:
        Tensor of standard deviations. A group without any non-NaN value
        yields NaN.
    """
    if dim is None:
        reduce_dims = tuple(range(tensor.dim()))
    elif isinstance(dim, int):
        reduce_dims = (dim,)
    else:
        reduce_dims = tuple(dim)

    mask = torch.isnan(tensor)
    # Reduce with keepdim=True internally so the mean always broadcasts back
    # against `tensor`; the requested keepdim is applied once at the end.
    n_obs = mask.logical_not().sum(dim=reduce_dims, keepdim=True)
    mean = torch.nanmean(tensor, dim=reduce_dims, keepdim=True)

    centered = (tensor - mean).masked_fill(mask, 0)
    sum_sq = torch.sum(centered ** 2, dim=reduce_dims, keepdim=True)

    if unbiased:
        denominator = torch.where(n_obs > 1, n_obs - 1, n_obs)
    else:
        denominator = n_obs
    std = torch.sqrt(sum_sq / denominator)

    if not keepdim:
        # Squeeze from the last reduced dim backwards so indices stay valid.
        for axis in sorted((d % tensor.dim() for d in reduce_dims), reverse=True):
            std = std.squeeze(axis)
    return std

In [None]:
# WeightRemover prunes `model` in place, and later cells compare against
# `ref_model`, which no visible cell defines. Snapshot the untouched model
# here, before any pruning happens.
# NOTE(review): confirm a deep copy is the intended source of `ref_model`.
ref_model = copy.deepcopy(model)
weight_remover = WeightRemover(model, device, 0.8)

In [None]:
# Drive the pruner with random token ids: five rounds, each doing one hooked
# forward pass followed by in-place pruning of the recorded Linear layers.
batch_size = 4
sequence_length = 10

for idx in range(5):
    print(f"-------------{idx} of the ids-------------")
    # torch.randint (upper bound exclusive, like np.random.randint) avoids
    # depending on numpy, which this notebook never imports.
    random_input_ids = torch.randint(
        0, tokenizer.vocab_size, (batch_size, sequence_length), dtype=torch.long
    )
    random_decoder_input_ids = torch.randint(
        0, tokenizer.vocab_size, (batch_size, sequence_length), dtype=torch.long
    )
    with torch.no_grad():
        y_ = weight_remover.process(random_input_ids, random_decoder_input_ids)
    weight_remover.apply_removal()

In [None]:
# NOTE(review): `test` is not defined in any visible cell, and `text` is only
# assigned later (as the loop variable of the concern-identification loop),
# so this cell fails on a fresh kernel run from top to bottom.
test(ref_model, text)

In [None]:
# NOTE(review): `test` is not defined in any visible cell, and `text` is only
# assigned later (as the loop variable of the concern-identification loop).
test(model, text)

In [None]:
import matplotlib.pyplot as plt

def parse_weight_data(data):
    """Extract the per-round totals from captured pruning log text.

    Every occurrence of the three consecutive "Total ..." summary lines
    printed by ``apply_removal`` / ``apply_prune`` counts as one round.

    Args:
        data: Log text to scan.

    Returns:
        Four parallel lists ``(epochs, original_weights, remaining_weights,
        reductions)``; ``epochs`` is the 1-based index of each match. All four
        are empty when nothing matches.
    """
    pattern = re.compile(r'Total original non-zero weights: (\d+)\s+Total remaining non-zero weights: (\d+)\s+Total reduction: (\d+)')

    epochs = []
    original_weights = []
    remaining_weights = []
    reductions = []

    for epoch, (original, remaining, reduction) in enumerate(pattern.findall(data), start=1):
        epochs.append(epoch)
        original_weights.append(int(original))
        remaining_weights.append(int(remaining))
        reductions.append(int(reduction))

    return epochs, original_weights, remaining_weights, reductions

def plot_weight_changes(data):
    """Plot original, remaining and removed weight counts per pruning round.

    ``data`` is the captured log text; it is parsed with
    ``parse_weight_data`` and drawn as three marked lines on one figure.
    """
    epochs, original_weights, remaining_weights, reductions = parse_weight_data(data)
    labelled_series = (
        ('Original Weights', original_weights),
        ('Remaining Weights', remaining_weights),
        ('Reduction', reductions),
    )

    plt.figure(figsize=(10, 6))
    for label, counts in labelled_series:
        plt.plot(epochs, counts, label=label, marker='o')

    plt.title('Weight Changes per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Number of Weights')
    plt.grid(True)
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Relative locations of the train / validation / test JSONL splits.
train_file, valid_file, test_file = (
    '../Datasets/Codes/python/train.jsonl',
    '../Datasets/Codes/python/valid.jsonl',
    '../Datasets/Codes/python/test.jsonl',
)

In [None]:
import json

def load_code_snippets(file_path):
    """Read a JSON-lines file and collect two fields from every record.

    Args:
        file_path: Path of a UTF-8 JSONL file whose records contain the keys
            ``code_tokens`` and ``docstring_tokens``.

    Returns:
        ``(code_snippets, tokens)``: the ``code_tokens`` and
        ``docstring_tokens`` values of each record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the two keys.
    """
    code_snippets = []
    tokens = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at EOF) are not records.
            if not line:
                continue
            data = json.loads(line)
            code_snippets.append(data['code_tokens'])
            tokens.append(data['docstring_tokens'])
    return code_snippets, tokens

In [None]:
# Load all three splits, in train -> valid -> test order.
(
    (train_snippets, train_tokens),
    (valid_snippets, valid_tokens),
    (test_snippets, test_tokens),
) = (load_code_snippets(path) for path in (train_file, valid_file, test_file))

In [None]:
# Number of records loaded from the training split.
len(train_snippets)

In [None]:
# Inspect the `code_tokens` value of the first training record.
train_snippets[0]

In [None]:
# Use the notebook-wide `device` rather than a hard-coded 'cuda:0', so both
# models stay on the device chosen at the top and the cell also runs on
# CPU-only hosts.
ci = ConcernIdentification(ref_model, model, device=device, p=0.7)

In [None]:
# Run concern identification on the first few training snippets: record the
# Linear-layer activations of both models, then prune `model` in place.
for idx, (text, tokens) in enumerate(zip(train_snippets, train_tokens)):
    print(idx)
    # Passing a list of tokens straight to the tokenizer would encode it as a
    # ragged batch, so join it into a single source string first.
    source = text if isinstance(text, str) else " ".join(text)
    # Place the ids on the device the ConcernIdentification models live on.
    input_ids = tokenizer(source, return_tensors="pt", truncation=True).input_ids.to(ci.device)
    print(f"-------------{idx} of the ids-------------")
    with torch.no_grad():
        # `generated_ids` was never defined; derive the decoder inputs from
        # the reference model's own generation.
        # NOTE(review): confirm this, rather than the reference docstring in
        # `tokens`, is the intended decoder input.
        generated_ids = ref_model.generate(input_ids)
        # process() needs tensors; the raw `text` / `tokens` lists used before
        # have no `.to()` method.
        ci.process(input_ids, generated_ids)
    ci.apply_prune()
    if idx > 5:
        break

In [None]:
# NOTE(review): `test` is not defined in any visible cell; `text` here is the
# last value bound by the loop in the previous cell.
test(model, text)

In [None]:
# NOTE(review): `test` is not defined in any visible cell; `text` here is the
# last value bound by the concern-identification loop.
test(ref_model, text)