In [1]:
from transformers import LlamaForSequenceClassification,pipeline,CodeLlamaTokenizer,RobertaForMaskedLM,RobertaTokenizer
import torch
from peft import PeftModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = "codellama/CodeLlama-7b-hf"
tokenizer = CodeLlamaTokenizer.from_pretrained(base_model,max_length = 1024,pad_token = "<|pad|>")
model = LlamaForSequenceClassification.from_pretrained(
    base_model,
    load_in_8bit = True,
    torch_dtype = torch.float16,
    num_labels = 66,
    device_map = "auto"
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.13s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at codellama/CodeLlama-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
model.config.pad_token_id = 32016
model = PeftModelForSequenceClassification.from_pretrained(model,"../finetune/classification/ljcoutputdir")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')      
model.resize_token_embeddings(len(tokenizer))
model.to(device)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32017. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32017, 4096)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
        

In [10]:
text = '''def main():
a = 1
a += 1'''
input_ids = tokenizer(text,padding="max_length",truncation=True,return_tensors = "pt")


Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [15]:
model.eval()
output = model(**input_ids)

In [10]:
embedding = model.get_input_embeddings()

In [30]:
embedding.eval()

Embedding(32017, 4096)

In [61]:
torch.mean(embedding(torch.tensor(tokenizer("a")["input_ids"])),dim = 0)


tensor([-0.0067, -0.0033, -0.0077,  ..., -0.0047, -0.0003, -0.0013],
       device='cuda:0', grad_fn=<MeanBackward1>)

In [60]:
torch.mean(embedding(torch.tensor(tokenizer("a1")["input_ids"])),dim = 0)

tensor([-0.0025, -0.0045,  0.0046,  ..., -0.0066,  0.0004,  0.0123],
       device='cuda:0', grad_fn=<MeanBackward1>)

In [62]:
torch.mean(embedding(torch.tensor(tokenizer("a1")["input_ids"])),dim = 0) - torch.mean(embedding(torch.tensor(tokenizer("a")["input_ids"])),dim = 0)

tensor([ 0.0042, -0.0012,  0.0122,  ..., -0.0018,  0.0007,  0.0135],
       device='cuda:0', grad_fn=<SubBackward0>)

In [70]:
subs = ["a","a1"]

tokenizer(subs)["input_ids"]
torch.mean(embedding(torch.tensor(tokenizer(subs)["input_ids"][0])),dim = 0)

tensor([-0.0067, -0.0033, -0.0077,  ..., -0.0047, -0.0003, -0.0013],
       device='cuda:0', grad_fn=<MeanBackward1>)

In [71]:
l1 = torch.tensor([1,1,2])
l2 = torch.tensor([2,2,1])
torch.dot(l1,l2)

tensor(6)

In [79]:
tokenizer.model_max_length = 5

In [1]:
def sort_sub(w1:str,s1:list,w2:str,subs2:list):
        '''
        w1待替换词汇,s1成功替换后的词汇:有beam_size个
        '''
        #计算词汇的embedding,w1为已经成功替换的词汇，w2为本次将要替换的词汇
        w1_embedding = torch.mean(embedding(torch.tensor(tokenizer(w1)["input_ids"])) , dim = 0)
        w2_embedding = torch.mean(embedding(torch.tensor(tokenizer(w2)["input_ids"])) , dim = 0)

        #计算上一次成功替换后词汇的embedding,s1的大小为beam_size
        s1_embedding = torch.mean(embedding(torch.tensor(tokenizer(s1)["input_ids"])) , dim = 0)
        delta_embedding = s1_embedding - w1_embedding
        
        #计算本次所有待替换词汇的embedding,subs2的大小为词汇表大小
        print(tokenizer(subs2,padding = "max_length",truncation=True)["input_ids"])
        subs_embeddings = torch.mean(embedding(torch.tensor(tokenizer(subs2)["input_ids"])) , dim = 0)
        subs_delta_embeddings = subs_embeddings - w2_embedding

        score = torch.matmul(subs_delta_embeddings,delta_embedding.T).sum(dim = 1).tolist()
        vec = list(zip(subs2,score))
        vec.sort(key = lambda x:x[1],reverse = True)
        subs_final = []
        print(score)
        for i in vec:
            subs_final.append(i[0])
        return subs_final

In [2]:
sort_sub("a",["a1","a2","a3"],"b",["b1","b0","b2","b4","b-1"])

NameError: name 'torch' is not defined