In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import json


# Hugging Face Hub id of the checkpoint to load
model_id = "microsoft/Phi-3-mini-128k-instruct"

# Load the pretrained causal-LM and its tokenizer.
# The target device is picked at load time so the notebook still runs on a
# machine without a GPU (a hard-coded "cuda" fails there).
# NOTE(review): trust_remote_code=True lets transformers run Python code that
# ships with the model repository -- only keep it for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
# The string "cuda" is printed only in the GPU case.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")
if cuda_ready:
    print("cuda")

# When more than one GPU is visible, wrap the model in DataParallel and move
# the wrapper to the selected device; otherwise `model` is left untouched.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    wrapped = torch.nn.DataParallel(model)
    model = wrapped.to(device)

# Build the text-generation pipeline on the underlying model.
# `model` is a torch.nn.DataParallel wrapper only when it was wrapped for
# multiple GPUs; in that case the real model sits in `.module`. On a
# single-GPU or CPU machine there is no wrapper, and accessing `.module`
# unconditionally would raise AttributeError, so unwrap only when needed.
pipe = pipeline(
    "text-generation",
    model=model.module if isinstance(model, torch.nn.DataParallel) else model,
    tokenizer=tokenizer,
)

# Keyword arguments unpacked into the pipeline call for each generation:
# sampled decoding at temperature 0.7, up to 1500 new tokens, with
# return_full_text switched off.
generation_args = dict(
    max_new_tokens=1500,
    return_full_text=False,
    temperature=0.7,
    do_sample=True,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


cuda


In [2]:
# Print one "<parameter name>: <device>" line for every parameter of `model`,
# to check where the weights were placed.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.device, sep=": ")

module.model.embed_tokens.weight: cuda:0
module.model.layers.0.self_attn.o_proj.weight: cuda:0
module.model.layers.0.self_attn.qkv_proj.weight: cuda:0
module.model.layers.0.mlp.gate_up_proj.weight: cuda:0
module.model.layers.0.mlp.down_proj.weight: cuda:0
module.model.layers.0.input_layernorm.weight: cuda:0
module.model.layers.0.post_attention_layernorm.weight: cuda:0
module.model.layers.1.self_attn.o_proj.weight: cuda:0
module.model.layers.1.self_attn.qkv_proj.weight: cuda:0
module.model.layers.1.mlp.gate_up_proj.weight: cuda:0
module.model.layers.1.mlp.down_proj.weight: cuda:0
module.model.layers.1.input_layernorm.weight: cuda:0
module.model.layers.1.post_attention_layernorm.weight: cuda:0
module.model.layers.2.self_attn.o_proj.weight: cuda:0
module.model.layers.2.self_attn.qkv_proj.weight: cuda:0
module.model.layers.2.mlp.gate_up_proj.weight: cuda:0
module.model.layers.2.mlp.down_proj.weight: cuda:0
module.model.layers.2.input_layernorm.weight: cuda:0
module.model.layers.2.post_atte

In [None]:
# Text of the user turn; fill this in before running the cell.
user_input = ""

# Send a single-turn chat to the pipeline and print the first generated reply.
chat = [dict(role="user", content=user_input)]
output = pipe(chat, **generation_args)
reply = output[0]["generated_text"]
print("Assistant: ", reply)