In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
def get_device():
    """Choose the torch device for this notebook, preferring CUDA.

    When CUDA is available, the GPU name and the CUDA memory currently
    allocated and reserved (in MB) are printed before returning.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true,
        otherwise ``cpu``.
    """
    # Guard clause: without CUDA there is nothing to report.
    if not torch.cuda.is_available():
        return torch.device("cpu")

    # Print CUDA details so the reader can see which GPU is in use.
    print(f"CUDA Device: {torch.cuda.get_device_name()}")
    print(f"CUDA Memory Allocated: {torch.cuda.memory_allocated()/1024**2:.2f}MB")
    print(f"CUDA Memory Reserved: {torch.cuda.memory_reserved()/1024**2:.2f}MB")
    return torch.device("cuda")

In [None]:
# Resolve the compute device once and keep it as a notebook-wide constant.
DEVICE = get_device()

In [2]:
def load_model(model_name="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"):
    """Load the tokenizer and causal language model in bfloat16.

    Args:
        model_name: Hugging Face model id (or local path) passed to both
            ``from_pretrained`` calls. Defaults to the DeepSeek-R1 distilled
            Llama 8B checkpoint this notebook was written for.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # device_map="auto" already places the weights on the available device(s).
    # Do not call model.to(...) afterwards: it is redundant, and moving a
    # dispatched model can fail when some layers were offloaded.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    # Report where the model actually ended up rather than where we hoped.
    if model.device.type == "cuda":
        print(f"Model loaded on {model.device}")
    else:
        print("CUDA not available, using CPU")

    return tokenizer, model

# Load once and bind the results to module-level names used by later cells.
tokenizer, model = load_model()


In [None]:
# Put the model in evaluation mode before running generation.
model.eval()

In [7]:
def prepare_prompt(prompt: str, tokenizer) -> str:
    """Wrap ``prompt`` as a single user turn via the tokenizer's chat template.

    Args:
        prompt: Raw user text.
        tokenizer: Object that may expose ``apply_chat_template``.

    Returns:
        The template-formatted string (not tokenized, with the generation
        prompt appended) when the tokenizer has ``apply_chat_template``;
        otherwise ``prompt`` unchanged.
    """
    # No templating support on this tokenizer: hand the text back untouched.
    if not hasattr(tokenizer, 'apply_chat_template'):
        return prompt

    conversation = [{"role": "user", "content": prompt}]
    return tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

In [None]:
# Sample word problem (a conditional-probability question) used as the test prompt.
prompt = '''Question: In a certain city, 40% of the population are college graduates, and 60% are not. Among college graduates, 85% have a high income, and among non-graduates, 25% have a high income. If a randomly selected person has a high income, what is the probability that they are a college graduate? Please provide your answer step by step.'''

# Re-bind `prompt` to its formatted form and print it for inspection.
prompt = prepare_prompt(prompt, tokenizer)
print(prompt)


In [4]:
def generate_text(inputs, max_length=100):
    """Sample one completion from the notebook's ``model`` and decode it.

    Args:
        inputs: Either tokenizer output (a mapping holding ``input_ids`` and
            ``attention_mask`` tensors) or a prompt string, which is
            tokenized here.
        max_length: Forwarded to ``model.generate`` as ``max_length``. In
            transformers this bound covers the prompt tokens plus the newly
            generated tokens, so it must exceed the prompt length.

    Returns:
        str: The first returned sequence decoded with special tokens kept
        visible.
    """
    # Accept a plain prompt string as well as pre-tokenized inputs. If the
    # text already starts with the BOS token (e.g. it came from a chat
    # template), do not let the tokenizer prepend a second one.
    if isinstance(inputs, str):
        bos_token = getattr(tokenizer, "bos_token", None)
        already_has_bos = bool(bos_token) and inputs.startswith(bos_token)
        inputs = tokenizer(
            inputs,
            return_tensors="pt",
            add_special_tokens=not already_has_bos,
        )

    # Follow the model's real placement instead of a separately computed
    # device, so the inputs always land where the model expects them.
    target_device = model.device
    inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

    with torch.no_grad():
        # Per-step scores are not requested: nothing here reads them, and
        # keeping them for every generated token is costly for long outputs.
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True,
            use_cache=True,  # Enable KV caching for faster generation
        )

    # Copying to CPU waits for the GPU work to finish, so no explicit
    # synchronize call is needed afterwards.
    generated_ids = output.sequences[0].cpu()

    # Decode with special tokens visible so template markers can be inspected.
    return tokenizer.decode(generated_ids, skip_special_tokens=False)

---

In [None]:
# Sample word problem (a conditional-probability question) used as the test prompt.
prompt = '''Question: In a certain city, 40% of the population are college graduates, and 60% are not. Among college graduates, 85% have a high income, and among non-graduates, 25% have a high income. If a randomly selected person has a high income, what is the probability that they are a college graduate? Please provide your answer step by step.'''

prompt = prepare_prompt(prompt, tokenizer)
print(prompt)

# generate_text reads inputs['input_ids'] and inputs['attention_mask'], so the
# formatted prompt has to be tokenized into tensors first. The chat template
# has already inserted the special tokens, so the tokenizer must not add them
# a second time.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

result = generate_text(inputs, max_length=32768)
print(result)

Count the number of tokens the tokenizer produces for a given text.

In [1]:
def count_tokens(text):
    """Return how many token ids the notebook's ``tokenizer`` yields for ``text``.

    The count is the length of ``tokenizer.encode(text)`` called with its
    default arguments.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [None]:
# Quick smoke check of count_tokens on an arbitrary string.
t = "sdfsdfsdfgs"
count_tokens(t)