# Init

In [1]:
import os
# Enumerate GPUs by PCI bus id so the device indices used below match the physical ordering.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Restrict this kernel to physical GPUs 0 and 2.
# NOTE(review): the device list is hard-coded here and is independent of the --gpu
# argument parsed further down — confirm that is intended.
os.environ["CUDA_VISIBLE_DEVICES"]="0,2"



In [2]:
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_int8_training,
)
from peft import PeftModel

In [3]:
import os
import argparse

# Run configuration, expressed as command-line style options so the same cell can be
# lifted into a script. Inside the notebook the argument list is passed explicitly.
parser = argparse.ArgumentParser()

# Adding optional argument
parser.add_argument("-c", "--CombinationIdx", type=int, help="Set idx of c to use")
parser.add_argument("-q", "--quantization", action="store_true")
parser.add_argument("--lora_r", type=int, default=8, help="Set LoRA r value")
parser.add_argument(
    "--model_size",
    type=int,
    default=7,
    choices=[7, 13, 70],  # reject sizes for which no checkpoint directory name can be built
    help="Llama 2 size: 7, 13, or 70",
)
parser.add_argument(
    "--toPredict",
    default="Target",
    choices=["Target", "Source"],  # fail at parse time instead of deep inside the data-loading cell
    help="Target vs Source",
)
parser.add_argument(
    "--gpu",
    type=str,
    default="0",
    help="On which GPU to run",
)
parser.add_argument(
    "--nTest",
    type=int,
    default=200,
    help="Number of testing samples",
)
args = parser.parse_args(args=["--model_size=7", "-c=1355", '--lora_r=8', "--toPredict=Target", "--gpu=2", "--nTest=500","--quantization"])

In [4]:
args

Namespace(CombinationIdx=1355, quantization=True, lora_r=8, model_size=7, toPredict='Target', gpu='2', nTest=500)

In [5]:

import sys
import itertools
from tqdm.auto import tqdm
import pathlib
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score

import datasets
from contextlib import nullcontext
import torch
from torch import nn
from transformers import (
    Trainer,
    TrainingArguments,
    LlamaTokenizer,
    LlamaForSequenceClassification,
    TrainerCallback,
    default_data_collator,
)


In [6]:
class train_config:
    """Mutable container for run settings.

    Only ``quantization`` is declared here; the notebook attaches further
    attributes to the instance after construction.
    """

    def __init__(self) -> None:
        # Overwritten from the parsed arguments right after instantiation.
        self.quantization: bool = False

        


# Shared run configuration, filled in from the parsed arguments.
globalconfig = train_config()
globalconfig.quantization = args.quantization
# Directory of the base Llama 2 checkpoint for the requested model size.
globalconfig.model_id = f"/bime-munin/llama2_hf/llama-2-{args.model_size}b_hf/"

# Load Model

In [7]:
# Locations of the two trained LoRA adapters: one trained on the outcome label
# ("target") and one trained on the site/domain label ("source").
# The names are derived from `args` with the same scheme the training output
# directories use, so they cannot drift from the parsed configuration.
_adapter_root = f"/bime-munin/xiruod/llama2_SHAC/n{args.nTest}"
_adapter_run = (
    f"set-{args.CombinationIdx}"
    f"-{'quantization' if args.quantization else 'NOquantization'}"
    f"-epoch3-llama-2-{args.model_size}B-loraR-{args.lora_r}"
)
target_model_id = f"{_adapter_root}/{_adapter_run}"
source_model_id = f"{_adapter_root}/Source-{_adapter_run}"



In [8]:
##### Tokenizer
# The tokenizer is always read from the 7B checkpoint directory, whatever --model_size is.
tokenizer = LlamaTokenizer.from_pretrained("/bime-munin/llama2_hf/llama-2-7b_hf/")

# Register a dedicated padding token; the call returns the number of tokens added.
tokenizer.add_special_tokens({"pad_token": "<pad>"})

1

In [9]:
# Base sequence-classification model; 8-bit loading follows the --quantization flag.
model = LlamaForSequenceClassification.from_pretrained(
    globalconfig.model_id,
    device_map="auto",
    load_in_8bit=args.quantization,
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /bime-munin/llama2_hf/llama-2-7b_hf/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Propagate the tokenizer's padding id to the model configuration.
model.config.pad_token_id = tokenizer.pad_token_id

# Grow the embedding matrix to cover the added pad token; the row count is
# rounded up to a multiple of 128.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=128)


Embedding(32128, 4096)

# Load Adapter and Manipulation

In [11]:
# Wrap the base model with the outcome-trained adapter, registered under the name "target".
model = PeftModel.from_pretrained(model, target_model_id, adapter_name='target')

In [None]:

# Add the second adapter to the same wrapped model under the name "source".
model.load_adapter(source_model_id, adapter_name="source")

## Play with Adapter Manipulation

In [13]:
import math

In [14]:
import peft

In [15]:
from peft.utils import (
    # TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
    # ModulesToSaveWrapper,
    _freeze_adapter,
    _get_submodules,
    # get_quantization_config,
)

In [16]:
from peft.tuners.lora import LoraLayer

In [17]:
from dataclasses import asdict, replace
from functools import reduce
import operator

In [18]:
# Name of the adapter to create and the two trained adapters it is derived from.
adapter_name = 'delta'
adapters = ['target', 'source']

# The factor matrices are subtracted element-wise further down, so the adapters
# must share one rank; fail early with a clear message otherwise.
adapters_ranks = [model.peft_config[adapter].r for adapter in adapters]
if len(set(adapters_ranks)) != 1:
    raise ValueError(f"Adapters {adapters} must all have the same rank, got {adapters_ranks}")
new_rank = adapters_ranks[0]

# Union of the modules targeted by the source adapters. A regex string is OR-ed
# together; a collection of names is merged as a set. Mixed or unknown types are
# rejected instead of silently leaving `new_target_modules` undefined.
target_module_types = [type(model.peft_config[adapter].target_modules) for adapter in adapters]
if len(set(target_module_types)) != 1:
    raise ValueError(
        f"Adapters {adapters} declare target_modules with different types: {target_module_types}"
    )
if target_module_types[0] is str:
    new_target_modules = "|".join(f"({model.peft_config[adapter].target_modules})" for adapter in adapters)
elif target_module_types[0] in (set, list, tuple):
    new_target_modules = reduce(
        operator.or_, (set(model.peft_config[adapter].target_modules) for adapter in adapters)
    )
else:
    raise TypeError(f"Unsupported target_modules type: {target_module_types[0]!r}")

In [19]:
new_target_modules

{'q_proj', 'v_proj'}

In [20]:
# Register a config for the new adapter by cloning the first source adapter's
# config and overriding rank, alpha and target modules.
# NOTE(review): lora_alpha is set equal to r here, which presumably gives the new
# adapter a scaling of 1.0, while the loaded adapters report a scaling of 4.0 in the
# `.scaling` dump further down. The weights copied in the following cells are not
# rescaled to compensate — confirm this is intended.
model.peft_config[adapter_name] = replace(
            model.peft_config[adapters[0]],
            r=new_rank,
            lora_alpha=new_rank,
            target_modules=new_target_modules,
        )
# Create the (still untrained) LoRA modules for the new adapter inside the wrapped model.
model.inject_adapter(model.model, adapter_name)

# Do we really need that?
_freeze_adapter(model.model, adapter_name)

In [21]:
# Fill the freshly injected "delta" adapter with the element-wise difference
# (target - source) of the two trained adapters' LoRA factors.
key_list = [key for key, _ in model.model.named_modules() if model.prefix not in key]
with torch.no_grad():  # pure weight surgery: do not record an autograd graph
    for key in key_list:
        _, target, _ = _get_submodules(model.model, key)
        if not isinstance(target, LoraLayer):
            continue

        # Linear-style layers hold their factors in lora_A/lora_B modules, embedding
        # layers hold raw parameters in lora_embedding_A/B. All three adapters are read
        # from the SAME container, so the embedding case no longer indexes
        # target.lora_A[...] on a layer that stores nothing there.
        if adapter_name in target.lora_A:
            factors_A = {name: layer.weight for name, layer in target.lora_A.items()}
            factors_B = {name: layer.weight for name, layer in target.lora_B.items()}
        elif adapter_name in target.lora_embedding_A:
            factors_A = target.lora_embedding_A
            factors_B = target.lora_embedding_B
        else:
            continue

        # The previous zero-fill was dead code: both tensors are overwritten here.
        factors_A[adapter_name].data = factors_A['target'] - factors_A['source']
        factors_B[adapter_name].data = factors_B['target'] - factors_B['source']

In [22]:
from torch.linalg import vector_norm

In [23]:
# Normalized Vector version
# Normalized to the Target size
#
# Same (target - source) difference as the plain version, but every vector of the
# difference is rescaled to the norm of the matching vector in the "target" adapter:
# per column for the A factor (dim=0), per row for the B factor (dim=1).

key_list = [key for key, _ in model.model.named_modules() if model.prefix not in key]
with torch.no_grad():  # pure weight surgery: do not record an autograd graph
    for key in key_list:
        _, target, _ = _get_submodules(model.model, key)
        if not isinstance(target, LoraLayer):
            continue

        # Read all three adapters from the same container so that embedding layers
        # (lora_embedding_A/B) are not looked up in lora_A/lora_B by mistake.
        if adapter_name in target.lora_A:
            factors_A = {name: layer.weight for name, layer in target.lora_A.items()}
            factors_B = {name: layer.weight for name, layer in target.lora_B.items()}
        elif adapter_name in target.lora_embedding_A:
            factors_A = target.lora_embedding_A
            factors_B = target.lora_embedding_B
        else:
            continue

        delta_A = factors_A['target'] - factors_A['source']
        delta_B = factors_B['target'] - factors_B['source']

        # keepdim=True lets the norms broadcast directly, replacing the transpose round-trip.
        norm_delta_A = vector_norm(delta_A, dim=0, keepdim=True)
        norm_target_A = vector_norm(factors_A['target'], dim=0, keepdim=True)
        norm_delta_B = vector_norm(delta_B, dim=1, keepdim=True)
        norm_target_B = vector_norm(factors_B['target'], dim=1, keepdim=True)

        # A zero-length difference vector has no direction to rescale; keep it at zero
        # instead of letting 0 / 0 write NaN into the adapter.
        factors_A[adapter_name].data = torch.where(
            norm_delta_A > 0,
            delta_A / norm_delta_A * norm_target_A,
            torch.zeros_like(delta_A),
        )
        factors_B[adapter_name].data = torch.where(
            norm_delta_B > 0,
            delta_B / norm_delta_B * norm_target_B,
            torch.zeros_like(delta_B),
        )


In [35]:
target.lora_B['target'].weight.shape

torch.Size([4096, 8])

In [36]:
vector_norm(target.lora_B['target'].weight, dim=1)

tensor([0.0022, 0.0010, 0.0026,  ..., 0.0020, 0.0021, 0.0019], device='cuda:0',
       grad_fn=<LinalgVectorNormBackward0>)

In [25]:
target.lora_A['target'].weight.shape

torch.Size([8, 4096])

In [37]:
vector_norm(target.lora_A['target'].weight, dim=0)

tensor([0.0241, 0.0349, 0.0288,  ..., 0.0311, 0.0282, 0.0290], device='cuda:0',
       grad_fn=<LinalgVectorNormBackward0>)

In [67]:
vector_norm(target.lora_B['target'].weight, dim=1)

tensor([0.0022, 0.0010, 0.0026,  ..., 0.0020, 0.0021, 0.0019], device='cuda:0',
       grad_fn=<LinalgVectorNormBackward0>)

In [62]:
# Scratch re-computation of the row-wise rescaling for one layer's B factor:
# difference of the two adapters, its per-row norms, and the target's per-row norms.
tmp = target.lora_B['target'].weight - target.lora_B['source'].weight
vt_oB = vector_norm(tmp, dim=1)
vt_tB = vector_norm(target.lora_B['target'].weight, dim=1)

# Transpose so the per-row norms broadcast along the last axis, then transpose back.
tmp = (tmp.T / vt_oB * vt_tB).T

In [65]:
vector_norm(tmp,dim=1)

tensor([0.0022, 0.0010, 0.0026,  ..., 0.0020, 0.0021, 0.0019], device='cuda:0',
       grad_fn=<LinalgVectorNormBackward0>)

In [63]:
target.lora_B['target'].weight

Parameter containing:
tensor([[-1.0705e-04,  1.0838e-03, -6.7676e-04,  ...,  1.0237e-03,
          4.4704e-04, -9.1145e-04],
        [ 2.0554e-04,  6.5568e-04, -6.2078e-04,  ...,  7.2634e-05,
         -3.2010e-04, -1.6794e-04],
        [ 3.5568e-04, -6.8224e-04,  2.7409e-04,  ..., -2.6225e-05,
         -1.1100e-04,  1.3575e-03],
        ...,
        [ 8.8527e-04,  6.5093e-04, -3.5329e-04,  ...,  4.9425e-04,
         -3.9447e-04,  8.5558e-04],
        [ 1.6238e-04, -1.4190e-03,  1.0853e-03,  ..., -2.4651e-04,
         -2.8751e-05, -5.8329e-04],
        [ 9.7082e-04,  7.3074e-04, -4.2692e-05,  ...,  4.3419e-04,
         -1.0898e-04,  7.9320e-04]], device='cuda:0', requires_grad=True)

In [68]:
target.lora_B['source'].weight

Parameter containing:
tensor([[ 7.6248e-04, -1.9786e-03,  4.8263e-04,  ...,  1.9153e-03,
          8.7487e-04,  4.3365e-04],
        [-4.5555e-04,  5.8037e-04, -9.3451e-04,  ...,  7.5229e-04,
         -1.3122e-03, -1.0978e-03],
        [ 8.2923e-05,  6.3360e-04,  1.1578e-03,  ..., -1.0200e-03,
          1.1309e-03,  7.5861e-04],
        ...,
        [-3.8336e-04,  1.0643e-03, -2.4170e-04,  ..., -6.6110e-04,
         -9.0255e-04, -1.1341e-03],
        [ 3.7612e-04, -4.1074e-04,  5.2035e-04,  ...,  8.8991e-04,
          3.0590e-04,  1.0890e-03],
        [-4.1011e-05,  6.3360e-04, -4.8721e-05,  ..., -9.4151e-04,
         -8.5006e-04, -9.3489e-04]], device='cuda:0')

In [66]:
tmp

tensor([[-4.8880e-04,  1.7215e-03, -6.5175e-04,  ..., -5.0122e-04,
         -2.4051e-04, -7.5614e-04],
        [ 3.1059e-04,  3.5381e-05,  1.4740e-04,  ..., -3.1931e-04,
          4.6608e-04,  4.3688e-04],
        [ 2.7968e-04, -1.3492e-03, -9.0610e-04,  ...,  1.0190e-03,
         -1.2734e-03,  6.1405e-04],
        ...,
        [ 7.0720e-04, -2.3044e-04, -6.2206e-05,  ...,  6.4405e-04,
          2.8323e-04,  1.1092e-03],
        [-1.6991e-04, -8.0156e-04,  4.4917e-04,  ..., -9.0344e-04,
         -2.6604e-04, -1.3295e-03],
        [ 6.5246e-04,  6.2637e-05,  3.8876e-06,  ...,  8.8709e-04,
          4.7787e-04,  1.1143e-03]], device='cuda:0',
       grad_fn=<PermuteBackward0>)

In [78]:
tmp = torch.tensor([[1,2,3,4],[3,4,5,6]])
tmp

tensor([[1, 2, 3, 4],
        [3, 4, 5, 6]])

In [79]:
mt = torch.tensor([2,0,1,3])

In [80]:
tmp * mt

tensor([[ 2,  0,  3, 12],
        [ 6,  0,  5, 18]])

In [81]:
tmp2 = tmp.T
tmp2

tensor([[1, 3],
        [2, 4],
        [3, 5],
        [4, 6]])

In [83]:
(tmp2.T * mt).T

tensor([[ 2,  6],
        [ 0,  0],
        [ 3,  5],
        [12, 18]])

## Verify some results

In [24]:
i = 2
model.base_model.model.model.layers[i].self_attn.q_proj.lora_A.target.weight

Parameter containing:
tensor([[ 0.0109,  0.0153, -0.0055,  ..., -0.0022,  0.0044,  0.0006],
        [-0.0076, -0.0081,  0.0033,  ..., -0.0114, -0.0124, -0.0053],
        [-0.0155, -0.0072,  0.0006,  ..., -0.0087,  0.0047,  0.0035],
        ...,
        [ 0.0162,  0.0151,  0.0047,  ...,  0.0080, -0.0051, -0.0144],
        [ 0.0111,  0.0134,  0.0002,  ...,  0.0062, -0.0006,  0.0092],
        [ 0.0123,  0.0111, -0.0155,  ..., -0.0023, -0.0092,  0.0164]],
       device='cuda:0', requires_grad=True)

In [25]:
model.base_model.model.model.layers[i].self_attn.q_proj.lora_A.source.weight

Parameter containing:
tensor([[-0.0005,  0.0104, -0.0027,  ..., -0.0083,  0.0067, -0.0059],
        [ 0.0049, -0.0022, -0.0096,  ...,  0.0040,  0.0041, -0.0020],
        [ 0.0136, -0.0037, -0.0033,  ...,  0.0034, -0.0149, -0.0146],
        ...,
        [-0.0039,  0.0060,  0.0051,  ...,  0.0044,  0.0038,  0.0051],
        [-0.0042,  0.0079, -0.0011,  ..., -0.0014,  0.0029, -0.0080],
        [ 0.0127, -0.0015,  0.0055,  ...,  0.0044, -0.0059,  0.0054]],
       device='cuda:0')

In [26]:
model.base_model.model.model.layers[i].self_attn.q_proj.lora_A.delta.weight

Parameter containing:
tensor([[ 0.0080,  0.0060, -0.0019,  ...,  0.0042, -0.0014,  0.0045],
        [-0.0088, -0.0073,  0.0088,  ..., -0.0106, -0.0104, -0.0023],
        [-0.0204, -0.0043,  0.0027,  ..., -0.0083,  0.0123,  0.0125],
        ...,
        [ 0.0141,  0.0112, -0.0003,  ...,  0.0025, -0.0056, -0.0134],
        [ 0.0107,  0.0068,  0.0009,  ...,  0.0052, -0.0022,  0.0119],
        [-0.0003,  0.0155, -0.0143,  ..., -0.0046, -0.0021,  0.0076]],
       device='cuda:0')

In [27]:
print(vector_norm(model.base_model.model.model.layers[i].self_attn.q_proj.lora_A.target.weight, dim=0))
print(vector_norm(model.base_model.model.model.layers[i].self_attn.q_proj.lora_A.delta.weight, dim=0))

tensor([0.0322, 0.0304, 0.0237,  ..., 0.0240, 0.0205, 0.0245], device='cuda:0',
       grad_fn=<LinalgVectorNormBackward0>)
tensor([0.0322, 0.0304, 0.0237,  ..., 0.0240, 0.0205, 0.0245], device='cuda:0')


In [28]:
print(vector_norm(model.base_model.model.model.layers[i].self_attn.q_proj.lora_B.target.weight, dim=1))
print(vector_norm(model.base_model.model.model.layers[i].self_attn.q_proj.lora_B.delta.weight, dim=1))

tensor([0.0021, 0.0013, 0.0023,  ..., 0.0025, 0.0022, 0.0022], device='cuda:0',
       grad_fn=<LinalgVectorNormBackward0>)
tensor([0.0021, 0.0013, 0.0023,  ..., 0.0025, 0.0022, 0.0022], device='cuda:0')


In [16]:
i = 2
model.base_model.model.model.layers[i].self_attn.q_proj.base_layer.weight

Parameter containing:
Parameter(Int8Params([[-32, -13,  11,  ..., -14, -26, -11],
            [-19,  14, -49,  ..., -47, -23,  35],
            [-18,  42, -33,  ...,  -7, -26,  27],
            ...,
            [-43,  12, -20,  ...,  -6, -27, -33],
            [  1,  -5,   1,  ...,  -1,  -8,   7],
            [ -6, -18,  23,  ...,  56, -42,   2]], device='cuda:0',
           dtype=torch.int8))

In [17]:

model.base_model.model.model.layers[i].self_attn.q_proj

lora.Linear8bitLt(
  (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
  (lora_dropout): ModuleDict(
    (target): Dropout(p=0.05, inplace=False)
  )
  (lora_A): ModuleDict(
    (target): Linear(in_features=4096, out_features=8, bias=False)
  )
  (lora_B): ModuleDict(
    (target): Linear(in_features=8, out_features=4096, bias=False)
  )
  (lora_embedding_A): ParameterDict()
  (lora_embedding_B): ParameterDict()
)

In [19]:

model.base_model.model.model.layers[i].self_attn.k_proj.weight

Parameter containing:
Parameter(Int8Params([[ -1,   6,  -5,  ..., -12,  -5,   7],
            [ 29, -19,  -7,  ...,  27, -11,  -1],
            [ 16,  15,  42,  ..., -16,  13,   6],
            ...,
            [ 26,  38, -22,  ...,  24, -31,  -5],
            [ -7, -14,  -1,  ...,   5,   2,  -5],
            [  5, -13,  -4,  ..., -68, -23,  71]], device='cuda:0',
           dtype=torch.int8))

## Save Edited Adapter

In [None]:
# Make "delta" the active adapter before the source adapters are removed and the model is saved.
model.set_adapter("delta")


In [106]:
model.modules_to_save

{'classifier', 'score'}

In [29]:
# Drop the "target" adapter; its weights are already folded into "delta".
model.delete_adapter("target")



In [30]:
# Drop the "source" adapter as well, leaving only "delta" on the model.
model.delete_adapter("source")

In [32]:
# Save the edited adapter next to the source adapters, under
# <adapter root>/LoraAdapters_TargetNorm/<target adapter name>.
# The path is built from target_model_id with os.path so it follows the adapter
# actually loaded and no longer contains an accidental double slash.
model.save_pretrained(
    os.path.join(
        os.path.dirname(target_model_id),
        "LoraAdapters_TargetNorm",
        os.path.basename(target_model_id),
    )
)



# Load Edited Adapter

In [12]:
# Attach a previously saved "delta" adapter to the model.
# NOTE(review): this relative path differs from the directory written by
# save_pretrained in the "Save Edited Adapter" section — confirm which location is current.
# NOTE(review): this presumably expects `model` to be the plain base model (fresh kernel),
# not the PeftModel built earlier — verify before running top to bottom.
model = PeftModel.from_pretrained(model, "../output/tmpData/set-1355-quantization-epoch3-llama-2-7B-loraR-8/delta/", adapter_name='delta')

In [13]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32128, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (delta): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (delta): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (delta): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=4096, out

In [14]:
model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32128, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (delta): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (delta): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (delta): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=4096, out

# add_weighted_adapter: does it fail with negative weights?

In [52]:
# Build an adapter "diff" from "target" and "source" with PEFT's built-in linear
# combination, using weights 9 and 4, to compare against the manual construction.
model.add_weighted_adapter(["target", "source"], [9, 4], combination_type="linear", adapter_name="diff")

In [62]:
model.delete_adapter('diff')

In [57]:
model.base_model.model.model.layers[2].self_attn.q_proj.scaling

{'target': 4.0, 'source': 4.0, 'diff': 1.0}

In [58]:
i = 2
model.base_model.model.model.layers[i].self_attn.q_proj.lora_A.target.weight

Parameter containing:
tensor([[ 0.0109,  0.0153, -0.0055,  ..., -0.0022,  0.0044,  0.0006],
        [-0.0076, -0.0081,  0.0033,  ..., -0.0114, -0.0124, -0.0053],
        [-0.0155, -0.0072,  0.0006,  ..., -0.0087,  0.0047,  0.0035],
        ...,
        [ 0.0162,  0.0151,  0.0047,  ...,  0.0080, -0.0051, -0.0144],
        [ 0.0111,  0.0134,  0.0002,  ...,  0.0062, -0.0006,  0.0092],
        [ 0.0123,  0.0111, -0.0155,  ..., -0.0023, -0.0092,  0.0164]],
       device='cuda:0', requires_grad=True)

In [59]:
model.base_model.model.model.layers[i].self_attn.q_proj.lora_A.source.weight

Parameter containing:
tensor([[-0.0005,  0.0104, -0.0027,  ..., -0.0083,  0.0067, -0.0059],
        [ 0.0049, -0.0022, -0.0096,  ...,  0.0040,  0.0041, -0.0020],
        [ 0.0136, -0.0037, -0.0033,  ...,  0.0034, -0.0149, -0.0146],
        ...,
        [-0.0039,  0.0060,  0.0051,  ...,  0.0044,  0.0038,  0.0051],
        [-0.0042,  0.0079, -0.0011,  ..., -0.0014,  0.0029, -0.0080],
        [ 0.0127, -0.0015,  0.0055,  ...,  0.0044, -0.0059,  0.0054]],
       device='cuda:0')

In [60]:
model.base_model.model.model.layers[i].self_attn.q_proj.lora_A.diff.weight

Parameter containing:
tensor([[ 0.0627,  0.1544, -0.0492,  ..., -0.0629,  0.0667, -0.0318],
        [-0.0161, -0.0621, -0.0378,  ..., -0.0443, -0.0496, -0.0439],
        [-0.0115, -0.0654, -0.0161,  ..., -0.0318, -0.0609, -0.0667],
        ...,
        [ 0.0736,  0.1268,  0.0589,  ...,  0.0747, -0.0075, -0.0558],
        [ 0.0412,  0.1281, -0.0052,  ...,  0.0285,  0.0140,  0.0073],
        [ 0.1501,  0.0572, -0.0595,  ...,  0.0129, -0.0902,  0.1306]],
       device='cuda:0')

In [61]:
0.0109 * 6 + (-0.0005 * math.sqrt(4*9))

0.0624

# Predict

In [12]:
sys.path.append("../src")

from utils import number_split, create_mix


from data_process import load_wls_adress_AddDomain
from process_SHAC import load_process_SHAC



class train_config:
    """Plain attribute container for the run settings used in the prediction section."""

    def __init__(self):
        self.quantization: bool = False


# Run settings: values taken from the parsed arguments plus fixed hyper-parameters.
globalconfig = train_config()
vars(globalconfig).update(
    quantization=args.quantization,
    model_id=f"/bime-munin/llama2_hf/llama-2-{args.model_size}b_hf/",
    max_seq_length=1024,
    num_train_epochs=3,
    runs=1,
    lr=1e-4,
    warmup_ratio=0.1,
    lora_r=args.lora_r,
    profiler=False,
    device="cuda:0",
)

# The 70B model runs with a per-device batch size of 1, every other size with 8.
globalconfig.per_device_train_batch_size = 1 if args.model_size == 70 else 8
globalconfig.per_device_eval_batch_size = 1 if args.model_size == 70 else 8

# Directory-name fragment that records whether quantization was used.
dir_q_snippet = "quantization" if args.quantization else "NOquantization"

# globalconfig.output_dir = f"~/llama2_SHAC/n200/set-{args.CombinationIdx}-{dir_q_snippet}-epoch3-llama-2-{args.model_size}B-loraR-{args.lora_r}"


######  Load Data

### SHAC

z_category = ["uw", "mimic"]
y_cat = ["False", "True"]

txt_col = "text"
domain_col = "location"

# Choose the label column, the output-directory prefix and the class vocabulary
# for the requested prediction task; anything else aborts before data is loaded.
if args.toPredict == "Target":
    label, _dir_prefix, _class_names = "Drug", "", y_cat
elif args.toPredict == "Source":
    label, _dir_prefix, _class_names = "location", "Source-", z_category
else:
    sys.exit("Unknown Outcome: 'Target' and 'Source' ONLY")

globalconfig.output_dir = (
    f"/bime-munin/xiruod/llama2_SHAC/n{args.nTest}/{_dir_prefix}set-{args.CombinationIdx}"
    f"-{dir_q_snippet}-epoch3-llama-2-{args.model_size}B-loraR-{args.lora_r}"
)

label2id = {name: idx for idx, name in enumerate(_class_names)}
id2label = {idx: name for idx, name in enumerate(_class_names)}

df_shac = load_process_SHAC(replaceNA="all")

if args.toPredict == "Target":
    # Binary outcome: any truthy value in the label column counts as positive.
    df_shac["label_binary"] = df_shac.apply(lambda row: 1 if row[label] else 0, axis=1)
else:
    # Domain prediction: map the site name to its class index.
    df_shac["label_binary"] = df_shac.apply(lambda row: label2id[row[label]], axis=1)
df_shac["dfSource"] = df_shac[domain_col]

# Per-site views used to assemble the train/test mixtures.
df_shac_uw = df_shac.query("location == 'uw'").reset_index(drop=True)
df_shac_mimic = df_shac.query("location == 'mimic'").reset_index(drop=True)

##### Split
# SHAC-Drug - Balanced Alpha
n_test = args.nTest
train_test_ratio = 4

# Grid: probability that a TRAINING example drawn from site/domain z0 is positive.
p_pos_train_z0_ls = np.arange(0, 1, 0.1)
# Grid: probability that a TRAINING example drawn from site/domain z1 is positive.
p_pos_train_z1_ls = np.arange(0, 1, 0.1)
# Grid for the z1 mixing parameter.
p_mix_z1_ls = np.arange(0, 1, 0.05)

# alpha_test grid: base ** (k - numvals // 2) for k = 0 .. numvals - 1.
numvals = 1023
base = 1.1
alpha_test_ls = np.power(base, np.arange(numvals)) / np.power(base, numvals // 2)

# Keep every parameter combination that number_split can realise and in which
# every entry except the last one in the returned dict is at least 10.
valid_full_settings = []
parameter_grid = itertools.product(
    p_pos_train_z0_ls, p_pos_train_z1_ls, p_mix_z1_ls, alpha_test_ls
)
for p_pos_z0, p_pos_z1, p_mix_z1, alpha_test in parameter_grid:
    number_setting = number_split(
        p_pos_train_z0=p_pos_z0,
        p_pos_train_z1=p_pos_z1,
        p_mix_z1=p_mix_z1,
        alpha_test=alpha_test,
        train_test_ratio=train_test_ratio,
        n_test=n_test,
        verbose=False,
    )
    if number_setting is None:
        continue

    count_keys = list(number_setting)[:-1]
    if all([number_setting[k] >= 10 for k in count_keys]):
        valid_full_settings.append(number_setting)


# run for check valid settings

import warnings

# Silence all warnings from this point on.
# NOTE(review): the filter is installed after the settings loop above has already
# run, so warnings raised inside that loop are still shown — confirm that is intended.
warnings.simplefilter("ignore")

# Validate settings

# Site 0 is the "uw" subset and site 1 the "mimic" subset.
df0 = df_shac_uw
df1 = df_shac_mimic

  alpha_train = p_pos_train_z1 / p_pos_train_z0
  alpha_train = p_pos_train_z1 / p_pos_train_z0


In [13]:
# Walk the candidate settings and stop at the first one that create_mix can
# realise with the available data. `c` (the chosen setting) and `dfs` (its
# train/test frames) are used by the following cells.
valid_n_full_settings = []

for c in tqdm(valid_full_settings):
    c = c.copy()
    # create train/test split according to stats
    dfs = create_mix(df0=df0, df1=df1, target=label if args.toPredict == "Target" else "Drug", setting=c, sample=False, seed=222)

    if dfs is not None:
        valid_n_full_settings.append(c)
        break
else:
    # Without this guard the notebook would continue with `c` bound to the last
    # failed candidate (or undefined when the list is empty).
    raise RuntimeError("No setting in valid_full_settings could be realised by create_mix")

  0%|          | 0/46811 [00:00<?, ?it/s]

In [14]:
##### Dataset Loader and Tokenizer
def preprocess_function(examples):
    """Tokenize a batch of examples for ``datasets.Dataset.map``.

    Args:
        examples: batch mapping that contains the text column ``txt_col``.

    Returns:
        The tokenizer output, padded and truncated to ``globalconfig.max_seq_length``.

    The result is no longer moved to ``globalconfig.device``: the only caller is
    ``Dataset.map``, which stores the values in the dataset itself, so a transfer
    to the GPU for every batch was pure overhead.
    """
    return tokenizer(
        examples[txt_col],
        return_tensors="pt",
        max_length=globalconfig.max_seq_length,
        padding="max_length",
        truncation=True,
    )


def datasets_loader(df):
    """Turn a pandas frame into a tokenized ``datasets.Dataset``.

    Keeps the text column, ``dfSource`` and ``label_binary`` (renamed to
    ``label``), resets the index, and tokenizes in batches with
    ``preprocess_function``.
    """
    kept_columns = [txt_col, "dfSource", "label_binary"]
    frame = df[kept_columns].rename(columns={"label_binary": "label"}).reset_index(drop=True)
    return datasets.Dataset.from_pandas(frame).map(preprocess_function, batched=True)


##### Experiment - ONLY One Setting

# Show the setting selected by the search loop (`c`) so its balance can be checked by eye.
print("Balanced? Check setting....")
print(c)
# Rebuild the train/test frames for that setting with the same fixed seed as the search loop.
dfs = create_mix(
    df0=df0,
    df1=df1,
    target=label if args.toPredict == "Target" else "Drug",
    setting=c,
    sample=False,
    # seed=random.randint(0,1000),
    seed=222,
)

# Tokenize both splits.
tokenized_train = datasets_loader(dfs["train"])
tokenized_test = datasets_loader(dfs["test"])


## Define metric
def compute_metrics_twoLevels(eval_pred):
    """Return the AUPRC for a two-class problem.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predictions are
    soft-maxed over the last axis and the score of class index 1 is used as the
    positive-class probability.
    """
    logits, y_true = eval_pred
    positive_scores = torch.softmax(torch.FloatTensor(logits), dim=-1)[:, 1]
    return {"auprc": average_precision_score(y_true=y_true, y_score=positive_scores)}

Balanced? Check setting....
{'n_train': 2000, 'n_test': 500, 'n_z0_pos_train': 130, 'n_z0_neg_train': 1170, 'n_z0_pos_test': 40, 'n_z0_neg_test': 285, 'n_z1_pos_train': 70, 'n_z1_neg_train': 630, 'n_z1_pos_test': 10, 'n_z1_neg_test': 165, 'mix_param_dict': {'p_pos_train_z0': 0.1, 'p_pos_train_z1': 0.1, 'p_pos_train': 0.09999999999999999, 'p_pos_test': 0.09999999999999999, 'p_mix_z0': 0.6499999999999999, 'p_mix_z1': 0.35000000000000003, 'alpha_train': 1.0, 'alpha_test': 0.4665073802097331, 'p_pos_test_z0': 0.12295924796315696, 'p_pos_test_z1': 0.05736139663985131, 'C_y': 0.09999999999999999, 'C_z': 0.35000000000000003}}


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [12]:
# Tokenize the first 15 test notes as one padded batch (kept on the CPU).
df_in = tokenizer(
    dfs['test']['text'].iloc[:15].tolist(),
    truncation=True,
    padding="max_length",
    max_length=globalconfig.max_seq_length,
    return_tensors="pt",
)

In [20]:
df_in

{'input_ids': tensor([[    1,   379,  2882,  ..., 32000, 32000, 32000],
        [    1,  7791,  8426,  ..., 32000, 32000, 32000],
        [    1,  7791,  8426,  ..., 32000, 32000, 32000],
        ...,
        [    1, 10307,  5298,  ..., 32000, 32000, 32000],
        [    1, 10307,  5298,  ..., 32000, 32000, 32000],
        [    1,   379,  2882,  ..., 32000, 32000, 32000]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [33]:
df_in.keys()

dict_keys(['input_ids', 'attention_mask'])

In [40]:
df_in['input_ids'].shape

torch.Size([15, 1024])

In [58]:
# One forward pass over the whole 15-note batch with the "delta" adapter active.
model.set_adapter('delta')
model.eval()
with torch.no_grad():
    # df_in holds exactly the keys input_ids and attention_mask.
    tmp = model(**df_in)

In [59]:
tmp['logits']

tensor([[-0.9814, -1.1426],
        [ 1.8564, -0.2979],
        [ 0.4932, -3.0684],
        [-1.5459, -2.6211],
        [-0.4260, -0.3877],
        [-0.4724, -0.4685],
        [ 1.2012, -1.4082],
        [ 0.0987, -4.5859],
        [-0.5771, -3.2051],
        [-0.9287, -0.7275],
        [ 1.1318, -3.9277],
        [-1.0410, -1.9648],
        [ 1.2861, -2.3691],
        [-0.8003, -2.6777],
        [-0.3035,  0.6147]], dtype=torch.float16)

In [60]:
from scipy.special import softmax

In [61]:
# Convert the logits into per-row probabilities (softmax along axis 1).
softmax(tmp['logits'], axis=1)

array([[0.54    , 0.4597  ],
       [0.896   , 0.1039  ],
       [0.9727  , 0.02759 ],
       [0.745   , 0.2544  ],
       [0.4902  , 0.5093  ],
       [0.499   , 0.501   ],
       [0.9316  , 0.0686  ],
       [0.991   , 0.00916 ],
       [0.9326  , 0.06726 ],
       [0.4497  , 0.55    ],
       [0.993   , 0.006313],
       [0.7163  , 0.2842  ],
       [0.975   , 0.02519 ],
       [0.867   , 0.1327  ],
       [0.2854  , 0.7144  ]], dtype=float16)

In [37]:
# Second column of the row-wise softmax.
# NOTE(review): the saved output below holds 17 values while df_in has 15 rows,
# and this cell's execution count is out of sequence -- the output looks stale;
# re-run the cell to refresh it.
softmax(tmp['logits'], axis=1)[:,1]

array([0.466   , 0.1357  , 0.02461 , 0.2487  , 0.5625  , 0.489   ,
       0.0799  , 0.010826, 0.0621  , 0.4868  , 0.009125, 0.2264  ,
       0.02446 , 0.11005 , 0.7476  , 0.03406 , 0.03656 ], dtype=float16)

In [118]:
# Number of encoded examples in the batch.
len(df_in['input_ids'])

15

In [64]:
# Split the row indices 0..N-1 into consecutive batches of `n` indices
# (the final batch is shorter when N is not a multiple of `n`).
n = 3
l = list(range(len(df_in['input_ids'])))
idx_ls = [l[start:start + n] for start in range(0, len(l), n)]

In [65]:
# Inspect the batches of row indices.
idx_ls

[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [12, 13, 14]]

In [66]:
# Batched inference with the 'delta' adapter: run the model on each index
# batch from `idx_ls` and collect the row-wise softmax probabilities in `y_ls`.
y_ls = []
model.set_adapter('delta')
# Evaluation mode does not change between batches, so set it once up front.
model.eval()
with torch.no_grad():
    for idx in idx_ls:
        # Call the module itself rather than `.forward(...)` so that any
        # registered hooks run, consistent with the single-batch cell above.
        ret_output = model(
            input_ids=df_in['input_ids'][idx],
            attention_mask=df_in['attention_mask'][idx],
        )
        y_probs_ = softmax(ret_output["logits"], axis=1)
        y_ls.append(y_probs_)

In [63]:
# Stack the per-batch probability arrays into a single array.
np.concatenate(y_ls)

array([[0.498   , 0.502   ],
       [0.8965  , 0.1036  ],
       [0.9727  , 0.02731 ],
       [0.7334  , 0.2664  ],
       [0.5273  , 0.4727  ],
       [0.4792  , 0.521   ],
       [0.934   , 0.06586 ],
       [0.99    , 0.009224],
       [0.942   , 0.0577  ],
       [0.526   , 0.474   ],
       [0.992   , 0.008194],
       [0.7466  , 0.254   ],
       [0.975   , 0.02441 ],
       [0.8604  , 0.139   ],
       [0.2803  , 0.7197  ]], dtype=float16)

In [67]:
# Stack the per-batch probability arrays into a single array.
# NOTE(review): this repeats the previous cell's expression but the saved
# outputs differ; presumably they come from different runs of the loop cell --
# confirm, and keep only one of the two cells.
np.concatenate(y_ls)

array([[0.549   , 0.4514  ],
       [0.8843  , 0.1153  ],
       [0.977   , 0.02246 ],
       [0.711   , 0.2888  ],
       [0.4614  , 0.5386  ],
       [0.518   , 0.4814  ],
       [0.9233  , 0.07684 ],
       [0.9893  , 0.01095 ],
       [0.934   , 0.06586 ],
       [0.4082  , 0.592   ],
       [0.993   , 0.006588],
       [0.753   , 0.2468  ],
       [0.975   , 0.0247  ],
       [0.872   , 0.1277  ],
       [0.2664  , 0.7334  ]], dtype=float16)

# Merge LoRA weights into base model

## Init

In [9]:
# Directories of the two fine-tuned LoRA adapters; the "Source-" prefix
# distinguishes the source adapter from the target adapter of the same run.
source_model_id = "/bime-munin/xiruod/llama2_SHAC/n500/Source-set-1355-quantization-epoch3-llama-2-7B-loraR-8"
target_model_id = "/bime-munin/xiruod/llama2_SHAC/n500/set-1355-quantization-epoch3-llama-2-7B-loraR-8"



In [10]:
##### Tokenizer
# NOTE(review): this path is hard-coded to the 7B checkpoint, whereas the base
# model is loaded from globalconfig.model_id -- confirm whether the tokenizer
# should follow args.model_size as well.
tokenizer = LlamaTokenizer.from_pretrained("/bime-munin/llama2_hf/llama-2-7b_hf/")

# Register a dedicated padding token; the call returns the number of tokens added.
tokenizer.add_special_tokens({"pad_token": "<pad>"})

1

## Load, Merge Target & Source, Separately

In [11]:
# Load the base sequence-classification model in float16; 8-bit loading follows
# the --quantization flag and device_map='auto' leaves device placement to the loader.
base_model = LlamaForSequenceClassification.from_pretrained(
    globalconfig.model_id,
    torch_dtype=torch.float16,
    load_in_8bit=args.quantization,
    device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /bime-munin/llama2_hf/llama-2-7b_hf/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tell the model which token id is padding.
base_model.config.pad_token_id = tokenizer.pad_token_id

# Resize the embedding matrix to the tokenizer's vocabulary size, rounded up
# to a multiple of 128.
base_model.resize_token_embeddings(new_num_tokens=len(tokenizer), pad_to_multiple_of=128)


Embedding(32128, 4096)

In [13]:
model = PeftModel.from_pretrained(base_model, source_model_id, adapter_name='target')

In [14]:
merged_Target_model = model.merge_and_unload(progressbar=True)

Unloading and merging model: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 521/521 [00:21<00:00, 24.01it/s]


In [15]:
state_dict_T = merged_Target_model.state_dict()


In [16]:
base_model = LlamaForSequenceClassification.from_pretrained(globalconfig.model_id, device_map='auto', load_in_8bit=args.quantization, torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /bime-munin/llama2_hf/llama-2-7b_hf/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
base_model.config.pad_token_id = tokenizer.pad_token_id

base_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=128)


Embedding(32128, 4096)

In [18]:
model = PeftModel.from_pretrained(base_model, target_model_id, adapter_name='source')

In [19]:
# Fold the adapter loaded in the previous cell into the base weights and keep
# the resulting plain (adapter-free) model.
merged_Source_model = model.merge_and_unload(progressbar=True)

Unloading and merging model: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 521/521 [00:17<00:00, 29.85it/s]


In [20]:
# State dict of the merged source model; subtracted from state_dict_T below.
state_dict_S = merged_Source_model.state_dict()

In [12]:
# Index of the decoder layer whose weights are inspected in the cells below.
i=20

In [13]:
# Print the module tree of the currently loaded base model.
base_model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32128, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNo

In [14]:
# q_proj weight of decoder layer `i` in `base_model`.
base_model.model.layers[i].self_attn.q_proj.weight

Parameter containing:
Parameter(Int8Params([[  6, -25,  73,  ...,  53,  10, -96],
            [ 22, -72,  34,  ..., -30, -32,  35],
            [ 13, -22, -30,  ...,  57,  -6, -17],
            ...,
            [-28, -55,  14,  ...,  -5,  26,   6],
            [-81, -20,  13,  ...,   9,  22,   9],
            [ 11,  -1,  -4,  ...,  -9,  -6,  -1]], device='cuda:1',
           dtype=torch.int8))

In [15]:
# v_proj weight of decoder layer `i` in `base_model`.
base_model.model.layers[i].self_attn.v_proj.weight

Parameter containing:
Parameter(Int8Params([[ -8, -25,  30,  ...,  -9,  23,  24],
            [-13, -77,  -1,  ..., -20,  -2,  18],
            [-12, -21,   1,  ..., -29,   0,  -1],
            ...,
            [ 13, -16,  -8,  ...,  39, -10,  13],
            [ 35,  20, -19,  ...,   5,  26, -17],
            [  5, -12,  55,  ..., -14, -23, -86]], device='cuda:1',
           dtype=torch.int8))

In [24]:
# q_proj weight of decoder layer `i` in `merged_Target_model`.
merged_Target_model.model.layers[i].self_attn.q_proj.weight

Parameter containing:
Parameter(Int8Params([[  5, -25,  73,  ...,  53,  10, -95],
            [ 21, -72,  34,  ..., -30, -32,  35],
            [ 14, -22, -30,  ...,  57,  -6, -17],
            ...,
            [-28, -55,  14,  ...,  -5,  26,   6],
            [-81, -20,  13,  ...,   9,  22,   9],
            [ 11,  -1,  -4,  ...,  -9,  -6,  -1]], device='cuda:1',
           dtype=torch.int8))

In [25]:
# q_proj weight of decoder layer `i` in `merged_Source_model`.
merged_Source_model.model.layers[i].self_attn.q_proj.weight

Parameter containing:
Parameter(Int8Params([[  6, -25,  74,  ...,  55,  10, -96],
            [ 22, -72,  34,  ..., -31, -32,  35],
            [ 13, -22, -30,  ...,  58,  -6, -17],
            ...,
            [-28, -55,  14,  ...,  -5,  26,   6],
            [-81, -20,  13,  ...,   9,  22,   9],
            [ 11,  -1,  -4,  ...,  -9,  -6,  -1]], device='cuda:1',
           dtype=torch.int8))

In [26]:
# v_proj weight of decoder layer `i` in `merged_Target_model`.
merged_Target_model.model.layers[i].self_attn.v_proj.weight

Parameter containing:
Parameter(Int8Params([[ -8, -25,  30,  ...,  -9,  23,  24],
            [-12, -77,  -1,  ..., -20,  -2,  18],
            [-12, -21,   1,  ..., -29,   0,  -1],
            ...,
            [ 13, -16,  -8,  ...,  39, -10,  13],
            [ 35,  20, -19,  ...,   5,  26, -17],
            [  5, -12,  55,  ..., -13, -23, -86]], device='cuda:1',
           dtype=torch.int8))

In [27]:
# v_proj weight of decoder layer `i` in `merged_Source_model`.
merged_Source_model.model.layers[i].self_attn.v_proj.weight

Parameter containing:
Parameter(Int8Params([[ -8, -25,  30,  ...,  -9,  23,  24],
            [-12, -77,  -1,  ..., -20,  -2,  18],
            [-13, -21,   1,  ..., -29,   0,  -1],
            ...,
            [ 13, -16,  -8,  ...,  39, -10,  13],
            [ 35,  20, -19,  ...,   5,  26, -17],
            [  5, -12,  56,  ..., -14, -23, -86]], device='cuda:1',
           dtype=torch.int8))

## Calculate Weight Delta

In [24]:
# Print the token-embedding matrices of the two merged models one after the other.
print(
    state_dict_T['model.embed_tokens.weight'],
    state_dict_S['model.embed_tokens.weight'],
    sep="\n",
)


tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [ 1.8616e-03, -3.3722e-03,  3.9864e-04,  ..., -8.3008e-03,
          2.5787e-03, -3.9368e-03],
        [ 1.0986e-02,  9.8877e-03, -5.0964e-03,  ...,  2.5177e-03,
          7.7057e-04, -5.0049e-03],
        ...,
        [ 3.3234e-02, -1.4999e-02, -1.0635e-02,  ..., -3.6407e-02,
          1.4915e-02, -1.4160e-02],
        [-1.1360e-02, -8.0719e-03, -4.4342e-02,  ...,  1.6851e-03,
          8.5449e-03,  1.7471e-02],
        [ 1.2199e-02,  1.6403e-02,  4.8981e-03,  ...,  3.1464e-02,
         -2.2583e-02,  1.5732e-02]], device='cuda:0', dtype=torch.float16)
tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [ 1.8616e-03, -3.3722e-03,  3.9864e-04,  ..., -8.3008e-03,
          2.5787e-03, -3.9368e-03],
        [ 1.0986e-02,  9.8877e-03, -5.0964e-03,  ...,  2.5177e-03,
          7.7057e-04, -5.0049e-03],
        ...,
        

In [23]:
# Print the layer-2 v_proj weights of the two merged models one after the other.
print(
    state_dict_T['model.layers.2.self_attn.v_proj.weight'],
    state_dict_S['model.layers.2.self_attn.v_proj.weight'],
    sep="\n",
)

tensor([[-11,   2,  47,  ...,  86, -13,  11],
        [  7,  33, -51,  ..., -23,  12, -37],
        [ -1,  30,  -2,  ..., -32, -58, -59],
        ...,
        [  1, -85, -52,  ..., -36,  -6,   3],
        [-76,  -1,  37,  ..., -16, -22, -12],
        [-28,  38,   3,  ..., -12,  36, -18]], device='cuda:0',
       dtype=torch.int8)
tensor([[-10,   2,  47,  ...,  87, -13,  11],
        [  7,  33, -51,  ..., -22,  12, -37],
        [ -1,  30,  -2,  ..., -32, -58, -59],
        ...,
        [  1, -85, -53,  ..., -36,  -5,   3],
        [-76,  -1,  37,  ..., -16, -22, -12],
        [-28,  38,   3,  ..., -12,  36, -18]], device='cuda:0',
       dtype=torch.int8)


In [25]:
# Replace every ".weight" entry of state_dict_T with the element-wise
# difference (T - S); all other entries keep their state_dict_T value.
# NOTE(review): this overwrites state_dict_T in place, so running the cell a
# second time subtracts state_dict_S again.
# NOTE(review): for the int8 tensors shown above the subtraction acts on the raw
# quantized codes -- confirm that is intended rather than a dequantized difference.
state_dict_T.update(
    {
        key: state_dict_T[key] - state_dict_S[key]
        for key in state_dict_T
        if key.endswith(".weight")
    }
)

## Save Weights-Delta

In [29]:
# Write the delta state dict to disk, named after the target adapter directory.
delta_path = f"/bime-munin/xiruod/llama2_SHAC/n500/Weights/{os.path.basename(target_model_id)}-delta.pth"
torch.save(state_dict_T, delta_path)

## Restart: Load Weights-Delta

In [9]:
# Read the saved delta state dict back from disk (after a kernel restart).
# torch.load unpickles the file, so only point it at files you produced yourself.
delta_path = f"/bime-munin/xiruod/llama2_SHAC/n500/Weights/{os.path.basename(target_model_id)}-delta.pth"
state_dict_delta = torch.load(delta_path)

## Merge Weights-Delta with Original Weights

In [10]:
# Reload the untouched base model; the delta is added to these weights below.
base_model = LlamaForSequenceClassification.from_pretrained(
    globalconfig.model_id,
    load_in_8bit=args.quantization,
    device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /bime-munin/llama2_hf/llama-2-7b_hf/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Tell the model which token id is padding.
base_model.config.pad_token_id = tokenizer.pad_token_id

# Resize the embedding matrix to the tokenizer's vocabulary size, rounded up
# to a multiple of 128.
base_model.resize_token_embeddings(new_num_tokens=len(tokenizer), pad_to_multiple_of=128)


Embedding(32128, 4096)

In [12]:
# State dict of the reloaded base model ("o" = original); the delta is added to it below.
state_dict_o = base_model.state_dict()

In [13]:
# Print the base embedding matrix followed by its stored delta.
print(
    state_dict_o['model.embed_tokens.weight'],
    state_dict_delta['model.embed_tokens.weight'],
    sep="\n",
)

tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [ 1.8616e-03, -3.3722e-03,  3.9864e-04,  ..., -8.3008e-03,
          2.5787e-03, -3.9368e-03],
        [ 1.0986e-02,  9.8877e-03, -5.0964e-03,  ...,  2.5177e-03,
          7.7057e-04, -5.0049e-03],
        ...,
        [-4.7569e-03,  1.9592e-02, -3.5038e-03,  ...,  4.4128e-02,
         -1.3905e-03, -3.4256e-03],
        [-7.8964e-03, -4.3365e-02,  1.8036e-02,  ...,  1.7868e-02,
          1.3466e-02, -7.0000e-03],
        [ 2.5375e-02,  8.3771e-03,  7.8430e-03,  ..., -1.2291e-02,
         -5.0140e-02, -2.1240e-02]], device='cuda:0', dtype=torch.float16)
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0026, -0.0123, -0.0181,  ..., -0.0210,  0.0288, -0.0116],
        [-0.0161, -0.0060, -0.0545,  

In [19]:
# Print the layer-12 q_proj base weight, its stored delta, and the sum of the
# delta's entries.
q_proj_key = 'model.layers.12.self_attn.q_proj.weight'
print(
    state_dict_o[q_proj_key],
    state_dict_delta[q_proj_key],
    state_dict_delta[q_proj_key].sum(),
    sep="\n",
)


tensor([[ -7, -33,  16,  ...,  31,  -8,  13],
        [ 10,  12,   2,  ..., -27,  55,  36],
        [ 17,  73, -60,  ...,   7,  28, -65],
        ...,
        [ -6,  52, -28,  ..., -42, -12,  15],
        [ 47,  -8,  15,  ...,  47,   6, -44],
        [ 23,   9, -28,  ..., -12,  12,  17]], device='cuda:0',
       dtype=torch.int8)
tensor([[ 0, -1,  1,  ...,  0,  1,  0],
        [ 0,  0,  0,  ...,  0, -2,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0, -1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]], device='cuda:0', dtype=torch.int8)
tensor(-210, device='cuda:0')


In [17]:
# Add gamma * delta to every ".weight" tensor of the base state dict.
# gamma scales the delta before it is added.
gamma = 1

# Representable int8 range, used to saturate instead of wrapping around.
_INT8 = torch.iinfo(torch.int8)

# NOTE(review): the load log above reports `score.weight` as newly initialized on
# every load, so base + delta for that tensor starts from a random
# initialization -- confirm this is intended.
for k in state_dict_o.keys():
    if not k.endswith(".weight"):
        continue
    base_w = state_dict_o[k]
    delta_w = state_dict_delta[k].to(base_w.device)
    if base_w.dtype == torch.int8:
        # Plain int8 arithmetic wraps (e.g. 127 + 1 -> -128), which would flip the
        # sign of a weight. Do the sum in float32, round, clamp to the int8 range
        # and cast back, so the stored dtype stays int8 for any gamma.
        summed = base_w.to(torch.float32) + gamma * delta_w.to(torch.float32)
        state_dict_o[k] = summed.round().clamp(_INT8.min, _INT8.max).to(torch.int8)
    else:
        state_dict_o[k] = base_w + gamma * delta_w

In [21]:
# Disabled experiment: convert every ".weight" tensor of the merged state dict to a sparse tensor.
# for k in state_dict_o.keys():
#     if k.endswith(".weight"):
#         state_dict_o[k] = state_dict_o[k].to_sparse()

# print(state_dict_o['model.layers.12.self_attn.q_proj.weight'])
# print(state_dict_delta['model.layers.12.self_attn.q_proj.weight'])
# print(state_dict_delta['model.layers.12.self_attn.q_proj.weight'].sum())


In [18]:
# Save the merged (base + gamma * delta) weights. The gamma value is taken from
# the `gamma` variable so the file name always matches the scale actually used
# (gamma = 1 yields the same "-gamma_1-added.pth" name as before).
torch.save(state_dict_o, f"/bime-munin/xiruod/llama2_SHAC/n500/Weights/{os.path.basename(target_model_id)}-gamma_{gamma}-added.pth")

## Predict

In [9]:
# Reload the base model; the saved merged weights are loaded into it below.
base_model = LlamaForSequenceClassification.from_pretrained(
    globalconfig.model_id,
    load_in_8bit=args.quantization,
    device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /bime-munin/llama2_hf/llama-2-7b_hf/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Tell the model which token id is padding.
base_model.config.pad_token_id = tokenizer.pad_token_id

# Resize the embedding matrix to the tokenizer's vocabulary size, rounded up
# to a multiple of 128.
base_model.resize_token_embeddings(new_num_tokens=len(tokenizer), pad_to_multiple_of=128)


Embedding(32128, 4096)

In [11]:
# Overwrite the base model's weights with the saved merged state dict; the
# call's return value reports whether all keys matched.
added_weights_path = f"/bime-munin/xiruod/llama2_SHAC/n500/Weights/{os.path.basename(target_model_id)}-gamma_1-added.pth"
base_model.load_state_dict(torch.load(added_weights_path))

<All keys matched successfully>

In [15]:
# Encode the first 15 test texts as PyTorch tensors, padding/truncating every
# sequence to globalconfig.max_seq_length tokens.
df_in = tokenizer(
    list(dfs['test']['text'].iloc[:15]),
    truncation=True,
    padding="max_length",
    max_length=globalconfig.max_seq_length,
    return_tensors="pt",
)

In [16]:
from scipy.special import softmax

In [17]:
# Split the row indices 0..N-1 into consecutive batches of `n` indices
# (the final batch is shorter when N is not a multiple of `n`).
n = 3
l = list(range(len(df_in['input_ids'])))
idx_ls = [l[start:start + n] for start in range(0, len(l), n)]

In [18]:
# Inspect the batches of row indices.
idx_ls

[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [12, 13, 14]]

In [19]:
# Batched inference with the merged weights loaded into `base_model`: run each
# index batch from `idx_ls` and collect the row-wise softmax probabilities.
y_ls = []
# Evaluation mode does not change between batches, so set it once up front.
base_model.eval()
with torch.no_grad():
    for idx in idx_ls:
        # Call the module itself rather than `.forward(...)` so that any
        # registered hooks run.
        ret_output = base_model(
            input_ids=df_in['input_ids'][idx],
            attention_mask=df_in['attention_mask'][idx],
        )
        y_probs_ = softmax(ret_output["logits"], axis=1)
        y_ls.append(y_probs_)

In [20]:
# Stack the per-batch probability arrays into a single array.
np.concatenate(y_ls)

array([[9.990e-01, 1.081e-03],
       [2.404e-01, 7.598e-01],
       [1.327e-02, 9.863e-01],
       [9.971e-01, 2.510e-03],
       [1.000e+00, 1.925e-05],
       [1.000e+00, 1.383e-05],
       [1.454e-01, 8.550e-01],
       [9.990e-01, 9.327e-04],
       [1.000e+00, 1.882e-04],
       [9.922e-01, 7.519e-03],
       [7.573e-01, 2.428e-01],
       [1.000e+00, 9.030e-05],
       [9.839e-01, 1.620e-02],
       [9.990e-01, 1.129e-03],
       [2.672e-03, 9.971e-01]], dtype=float16)

In [5]:
def parse_weights_name(filename, extensions=(".pth", ".pt", ".bin")):
    """Split a weights file name into its stem and the Llama model size.

    Args:
        filename: File name such as
            "set-1355-quantization-epoch3-llama-2-7B-loraR-8-gamma_1-added.pth".
        extensions: File extensions to strip from the end of ``filename``.

    Returns:
        Tuple ``(stem, size)``: the name without a known extension, and the
        integer in front of "B" in the first "<digits>B" token (7 for "7B").

    Raises:
        ValueError: If no "<digits>B" token is present.
    """
    stem = filename
    # Strip only a known trailing extension. Splitting on the first "." would
    # truncate names that contain a dot elsewhere, e.g. "gamma_0.5".
    for ext in extensions:
        if stem.endswith(ext):
            stem = stem[: -len(ext)]
            break
    sizes = [tok[:-1] for tok in stem.split("-") if tok.endswith("B") and tok[:-1].isdigit()]
    if not sizes:
        raise ValueError(f"no '<digits>B' model-size token found in {filename!r}")
    return stem, int(sizes[0])


weights_path = "/bime-munin/xiruod/llama2_SHAC/n500/Weights/set-1355-quantization-epoch3-llama-2-7B-loraR-8-gamma_1-added.pth"
# Path components that name a run ("set-..."); the first one is the weights file.
tmp = [x for x in weights_path.split("/") if "set-" in x]
# name_pre has the extension removed, e.g.
# set-1355-quantization-epoch3-llama-2-7B-loraR-8-gamma_1-added
name_pre, model_size = parse_weights_name(tmp[0])  # model_size: 7, 13, 70
assert model_size in (7, 13, 70)

In [2]:
# Show the parsed file-name stem.
name_pre

'set-1355-quantization-epoch3-llama-2-7B-loraR-8-gamma_1-added'

In [3]:
# Show the parsed model size.
# NOTE(review): the saved output below is the string '7B', but the cell that
# defines model_size converts it to an int -- the output looks stale; re-run.
model_size

'7B'