In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer,AutoConfig,AutoModel
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Relative path to the GPT-2 checkpoint directory.
model_name = "../../model/gpt2"

# NOTE(review): trust_remote_code=True permits checkpoint-supplied Python code to run
# during loading; confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# The same checkpoint is loaded through two Auto classes so they can be compared:
# `llm` via AutoModel and `CLLM` via AutoModelForCausalLM.
llm = AutoModel.from_pretrained(model_name, trust_remote_code=True)
CLLM = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)



### CLLM 跟 LLM的区别
- 多了一个LM Head(线性层 lm_head),把 hidden_size 维的输出映射到 vocab_size 维,得到词表上的logits

### gpt2属于自回归模型
- 对于输入x,得到$y_1$,然后将[x,$y_1$]作为输入,接着得到$y_2$
### greedy_search vs. beam_search
- greedy_search:贪心策略，每一步生成时总是选择概率最大的那个token,弊端：输出缺乏多样性
- beam_search:束搜索，每一步保留累计概率最高的k条候选序列(束宽为k)，并对每条候选继续展开，最后输出整体概率最大的那条序列

### 对logits的理解
- 对任意句子(i have a )，进行tokenizer编码后(假设不包含special_token)，得到input_ids,长度为tokens_len
- 将input_ids输入到model中,即output = model(input_ids=input_ids),
- 得到output.logits,维度为(batch_size,tokens_len,vocab_size)
- 对于任意位置i,logits[:, i, :](维度为(batch_size,vocab_size))的含义:model在看到第0到第i个token之后,对第i+1个token的预测分数(因果注意力使位置i看不到它后面的token)
    - 例如logits[:, 0, :],是指model看到第一个token(i)之后,对第二个token(have)的预测,而不是对i本身的预测
- 因此对于最后一个token,logits[:, -1, :]的含义:model对句子之后下一个(尚未出现的)token的预测,生成时只需要取这一项
    - 例如对(i have a),logits[:, -1, :]是对a后面的token(例如apple)的预测，而不是对a这个token的预测

### 函数
- torch.argsort(),返回张量排序后的索引（indices）
    - 例如：lst = torch.tensor([3.1,-0.8,2.5]),torch.argsort(lst),得到tensor([1,2,0])(升序顺序)
    - torch.argsort(lst,descending=True),得到tensor([0,2,1])(降序顺序)
- torch.argmax(),返回张量中最大值所在的索引
    - torch.argmax(lst),得到tensor(0)
  
### 如何理解tokenizer.decode()解码的是索引(token id),而不是logits,即tokenizer.decode(torch.argmax(lst))
- 对于model(input_ids = input_ids).logits[:,-1,:],得到的是模型对词表(vocab)中每个token作为下一个token的打分(logits,经过softmax之后才是概率)
- 其中logits[:,-1,:]的最后一维按token id的顺序排列，可以理解为logits[:,-1,0],表示的是词汇表中token id为0的token的打分
- 然后经过torch.argmax(lst)得到最大值的索引,假设为100,那么logits[:,-1,100]就是最大的logits,根据上面可以得到它表示的是词汇表中token id为100的token的打分
- 然后经过tokenizer.decode(torch.argmax(lst)),即tokenizer.decode([100]),就得到相应的token

In [3]:
# Example prompt that the tokenizer and model cells below operate on.
sentence = "Hello, how are you?"

In [11]:
# Display every token id stored in the tokenizer's vocabulary mapping.
# The output is very long, and the ids come out in mapping order, not sorted.
tokenizer.vocab.values()

dict_values([24107, 628, 7463, 3971, 800, 2910, 42718, 5422, 5079, 45074, 3967, 9143, 20002, 24917, 10764, 30760, 33789, 38187, 48368, 19658, 22353, 13235, 39428, 99, 49319, 993, 20227, 45896, 31520, 26766, 19628, 32059, 18723, 784, 36061, 48290, 49746, 1629, 31852, 24913, 2549, 13452, 25582, 34395, 33379, 19735, 24127, 20932, 24072, 38532, 8982, 11928, 41362, 31803, 47625, 13718, 27766, 32233, 37416, 27307, 48573, 7937, 11123, 15343, 12905, 344, 27684, 18947, 48831, 49388, 36173, 32668, 22089, 11861, 23034, 6554, 10967, 37285, 41532, 9846, 37962, 20654, 9202, 49431, 44634, 37337, 20419, 16663, 47329, 12476, 40830, 33720, 36238, 35491, 10876, 24418, 14854, 46761, 45390, 8444, 5981, 40344, 32541, 43991, 47989, 11290, 23834, 32996, 34143, 41386, 31527, 11118, 36709, 4106, 15263, 34963, 7251, 29715, 40498, 12787, 14921, 17677, 48202, 39413, 5062, 24617, 31983, 30157, 48742, 12519, 49440, 29238, 10752, 13560, 36450, 14021, 4976, 4069, 26306, 31285, 21774, 23852, 13993, 18018, 49468, 45937,

In [4]:
# Encode the prompt into a PyTorch tensor of token ids, then display its first row.
input_ids = tokenizer.encode(sentence, return_tensors="pt")
input_ids[0]

tensor([15496,    11,   703,   389,   345,    30])

In [28]:
# Calling the tokenizer directly returns a mapping with both `input_ids` and `attention_mask`.
tokenizer(sentence, return_tensors="pt")

{'input_ids': tensor([[15496,    11,   703,   389,   345,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [56]:
# Run one forward pass over the prompt and inspect the next-token distribution.
CLLM.eval()

# NOTE(review): these two settings are not read by this single-step cell; they are
# kept (including the original spelling) in case other cells rely on them.
num_steps = 10
predicet_num = 5

# Tokenize in this cell and take the ids from that result, so the forward pass does
# not depend on an `input_ids` variable left over from an earlier cell.
inputs = tokenizer(sentence, return_tensors="pt")
input_ids = inputs["input_ids"]

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = CLLM(input_ids=input_ids)

logits = outputs.logits
print(logits.shape)

# Raw scores at the last sequence position.
print(logits[:, -1, :])

# Normalize the last-position scores over the final axis into probabilities.
percentage = torch.softmax(logits[:, -1, :], dim=-1, dtype=torch.float32)
print(percentage)

# Token ids ordered from most to least probable.
predicted_token_ids = torch.argsort(percentage, dim=-1, descending=True)
print(predicted_token_ids)


torch.Size([1, 6, 50257])
tensor([[-118.5744, -117.4402, -120.4907,  ..., -128.7235, -129.1531,
         -116.4908]])
tensor([[1.5548e-04, 4.8332e-04, 2.2879e-05,  ..., 6.0807e-09, 3.9572e-09,
         1.2490e-03]])
tensor([[  198,   314,  1867,  ..., 19476, 33434, 13945]])


In [63]:
import torch.nn.functional as F

sentence = "Hello, how are you?"

# Tokenize the prompt and pull the token ids out of the returned mapping.
inputs = tokenizer(sentence, return_tensors="pt")
input_ids = inputs['input_ids']

# Forward pass without gradient tracking; keep the raw logits.
with torch.no_grad():
    outputs = CLLM(input_ids=input_ids)
    logits = outputs.logits
    print(f"Logits shape: {logits.shape}")

# Softmax over the final axis of the last-position logits gives a probability distribution.
probs = F.softmax(logits[:, -1, :], dim=-1)

# Token ids sorted by descending probability.
predicted_token_ids = torch.argsort(probs, dim=-1, descending=True)

# Keep the k most probable token ids and look up their probabilities (k is adjustable).
top_k = 10
top_k_token_ids = predicted_token_ids[:, :top_k]
print(top_k_token_ids)
top_k_probs = probs.gather(-1, top_k_token_ids)
print(top_k_probs)

# Print each of the k candidates: its id, its decoded text, and its probability.
for token_id, token_prob in zip(top_k_token_ids[0].tolist(), top_k_probs[0].tolist()):
    print(token_id)
    token = tokenizer.decode([token_id])
    print(f"Token: {token}, Probability: {token_prob:.4f}")

Logits shape: torch.Size([1, 6, 50257])
tensor([[ 198,  314, 1867, 1374,  921, 4231, 1148,  632, 2141,  628]])
tensor([[0.1737, 0.0941, 0.0329, 0.0322, 0.0317, 0.0304, 0.0173, 0.0144, 0.0144,
         0.0133]])
198
Token: 
, Probability: 0.1737
314
Token:  I, Probability: 0.0941
1867
Token:  What, Probability: 0.0329
1374
Token:  How, Probability: 0.0322
921
Token:  You, Probability: 0.0317
4231
Token:  Are, Probability: 0.0304
1148
Token:  Is, Probability: 0.0173
632
Token:  It, Probability: 0.0144
2141
Token:  Do, Probability: 0.0144
628
Token: 

, Probability: 0.0133


In [53]:
# Decode token id 198 on its own to see which text it stands for.
tokenizer.decode([198])

'\n'

In [None]:
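# Error message noted while experimenting (presumably from passing float ids to tokenizer.decode; cast ids to int first — TODO confirm):
# argument 'ids': 'float' object cannot be interpreted as an integer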

In [9]:
# Scratch check of the f-string key format.
# NOTE: the name `dict` is rebound here, which hides the builtin type in later cells.
num = 1
dict = {"percentage" + f"位置{num}": 1}

In [10]:
# Display the value currently bound to `dict` (rebound to a mapping in the preceding cell).
dict

{'percentage位置1': 1}

In [None]:
# Scratch list of token ids; they appear to correspond to the text shown for `output_text` below — TODO confirm.
[11,  314,  546,  345, 1804,  198]


In [14]:
# Look up the text for token id 198.
tokenizer.decode([198])

'\n'

In [47]:
# NOTE(review): `output_text` is not assigned in any cell shown here; it presumably came from a
# since-removed cell (e.g. decoding per-position argmax ids) — TODO confirm and restore its definition.
output_text

', I about you doing\n'

In [17]:
# Convert every special-token string registered on the tokenizer to its vocabulary id.
tokenizer.convert_tokens_to_ids(tokenizer.special_tokens_map.values())

[50256, 50256, 50256]