In [8]:
import os
from importlib.metadata import PackageNotFoundError, version

# Packages this notebook depends on.
pkgs = ['transformers', 'sentencepiece', 'sentence_transformers']

# Report the installed version of each dependency. A missing package is
# reported rather than raised, so the Hugging Face configuration below is
# always applied and every missing dependency is listed in one run.
for pkg in pkgs:
    try:
        print(f"{pkg}:", version(pkg))
    except PackageNotFoundError:
        print(f"{pkg}: NOT INSTALLED")


# Hugging Face configuration. These variables are read when huggingface_hub /
# transformers are first imported, so this cell must run before any cell that
# imports them (re-running it later in a live kernel has no effect).
#
# HF_ENDPOINT  - download from the hf-mirror.com mirror instead of the default hub.
# HF_HUB_CACHE - directory where downloaded repositories are cached. It replaces
#                the deprecated TRANSFORMERS_CACHE variable (same directory, so
#                already-downloaded models are still found) and is honoured by
#                huggingface_hub itself, not only by transformers.
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
os.environ['HF_HUB_CACHE'] = "/root/autodl-tmp/LLMs/.cache/huggingface"


transformers: 4.55.4
sentencepiece: 0.2.1
sentence_transformers: 5.1.0


# Tokenization and Embeddings

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub identifier shared by the tokenizer and the causal language model.
checkpoint = "microsoft/Phi-4-mini-instruct"

# Tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Causal LM weights, placed on the GPU; dtype selection is left to
# transformers ('auto') and remote code execution stays disabled.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    trust_remote_code=False,
    torch_dtype='auto',
    device_map='cuda',
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Prompt text is kept verbatim so the token ids match the recorded output.
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happend.<|assistant|>"

# Tokenize the prompt and move all returned tensors (input_ids and
# attention_mask) to whichever device the model was loaded on.
encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
input_ids = encoded.input_ids

# Generate up to 20 new tokens. The attention mask is passed explicitly:
# without it, generate() warns that the mask cannot be inferred because the
# pad token equals the eos token, and results may be unreliable.
generation_output = model.generate(
    inputs=input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=20,
)

print(f'Input_ids: {input_ids}')
print(f'Output_ids: {generation_output}')

# The output ids contain the prompt followed by the newly generated tokens.
print(tokenizer.decode(generation_output[0]))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input_ids: tensor([[ 10930,    448,   3719,  39950,   6396,    316,  32145,    395,    290,
          62374,  66241,  80785,    403,     13, 115474,   1495,    480,   3034,
            419,     13, 200019]], device='cuda:0')
Output_ids: tensor([[ 10930,    448,   3719,  39950,   6396,    316,  32145,    395,    290,
          62374,  66241,  80785,    403,     13, 115474,   1495,    480,   3034,
            419,     13, 200019,  18174,     25,    336,   2768,    512,   6537,
          10384,    395,    290, 193145, 147276,    403,    279,  36210,  32145,
           4464,     40,   5498,    495,   3719]], device='cuda:0')
Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happend.<|assistant|>Subject: Sincere Apologies for the Gardening Mishap

Dear Sarah,

I hope this email


# Token Embeddings

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# Model
model = AutoModel.from_pretrained("microsoft/deberta-v3-small")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

# Tokenize the sentence into PyTorch tensors.
tokens = tokenizer("Hello world", return_tensors='pt')

# Run the model on the tokens. This is inference only, so gradient tracking is
# disabled to avoid building an autograd graph. The first element of the model
# output holds one embedding vector per input token.
with torch.no_grad():
    output = model(**tokens)[0]

# Shape of the output.
print(f'Shape of output: {output.shape}')

# Decode each input id individually; the special tokens the tokenizer added
# around the text are printed too.
for token in tokens['input_ids'][0]:
    print(tokenizer.decode(token))



Shape of output: torch.Size([1, 4, 768])
[CLS]
Hello
world
[SEP]


# Text Embeddings

In [9]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode one sentence into a single embedding vector.
sentence = "Best movie ever!"
vector = model.encode(sentence)

print(f'vector.shape: {vector.shape}')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

vector.shape: (768,)
