In [5]:
import tensorflow as tf
# Report how many physical GPU devices TensorFlow can see
print(
    "Num GPUs Available: ",
    len(tf.config.list_physical_devices(device_type='GPU')),
)

Num GPUs Available:  1


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [5]:
def check_gpu():
    """Pick the compute device and print a short report about it.

    When CUDA is available, prints the name and total memory (in units of
    1e9 bytes) of CUDA device index 0. Otherwise prints a notice that the
    CPU will be used.

    Returns:
        torch.device: ``torch.device("cuda")`` if CUDA is available,
        otherwise ``torch.device("cpu")``.
    """
    # Guard clause: handle the CPU-only case first and leave early
    if not torch.cuda.is_available():
        print("No GPU available, using CPU.")
        return torch.device("cpu")

    selected = torch.device("cuda")
    # The report always describes device index 0
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    return selected

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_llama_model(model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0", quantize=False):
    """
    Load a LLAMA model variant and its tokenizer using PyTorch.

    Args:
        model_name: The model to load from Hugging Face
        quantize: Whether to apply 4-bit quantization. When True the model
            is loaded with a ``BitsAndBytesConfig`` (NF4 weights, float16
            compute). This needs the ``bitsandbytes`` package to be
            installed, and normally a CUDA GPU.

    Returns:
        model: The loaded model
        tokenizer: The tokenizer for the model
    """
    print(f"Loading model: {model_name}")
    # Called only for its printed device report; actual weight placement is
    # delegated to device_map="auto" below.
    check_gpu()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Arguments shared by both loading paths
    load_kwargs = {"device_map": "auto"}

    if quantize:
        # Imported lazily so the unquantized path works without the
        # quantization extras being importable.
        from transformers import BitsAndBytesConfig

        print("Loading with 4-bit quantization...")
        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    # NOTE(review): with 4-bit loading the weights are stored packed, so this
    # element count may under-report the logical parameter count — confirm.
    print(f"Model loaded with {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M parameters")
    return model, tokenizer


In [11]:

def generate_text(model, tokenizer, prompt, max_new_tokens=512, temperature=0.7):
    """
    Generate text using the loaded model.

    Args:
        model: The loaded model (must expose ``.device`` and ``.generate``)
        tokenizer: The tokenizer for the model
        prompt: The input prompt to generate from
        max_new_tokens: Maximum number of new tokens to generate
        temperature: Controls randomness (lower is more deterministic).
            A value of 0 or below selects greedy (fully deterministic)
            decoding instead of sampling, because sampling requires a
            strictly positive temperature.

    Returns:
        The decoded first output sequence as a string. The whole sequence is
        decoded, so the prompt is typically echoed at the start of the
        returned text.
    """
    # Prepare the model inputs and move them to the model's device
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.1,
    }
    greedy = temperature is not None and temperature <= 0
    if greedy:
        # Sampling-only knobs (temperature/top_p/top_k) are left out here:
        # they are meaningless for greedy decoding and a non-positive
        # temperature is rejected when sampling.
        generation_kwargs["do_sample"] = False
    else:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=0.95,
            top_k=50,
        )

    # Generate text without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Decode the first (only) sequence of the batch
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [12]:
# Run configuration: quantization switch and the checkpoint to load
use_quantization = False  # Set to True for larger models
# Hugging Face model identifier
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [13]:
# Load the configured checkpoint together with its tokenizer
model, tokenizer = load_llama_model(
    model_name=model_name,
    quantize=use_quantization,
)


Loading model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Using GPU: NVIDIA GeForce RTX 4080
GPU Memory: 16.83 GB


2025-03-16 16:57:57.705080: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-16 16:57:57.712545: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742115477.721131   27971 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742115477.723643   27971 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742115477.730474   27971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Model loaded with 1100.05M parameters


In [None]:

# Smoke-test the loaded model with one short prompt
prompt = "Explain Rust programming language in simple terms:"

# Echo the prompt, then announce that generation is starting
print("\nPrompt:", prompt)
print("\nGenerating response...")

response = generate_text(model, tokenizer, prompt)

# Header and generated text, each on its own line
print("\nGenerated response:", response, sep="\n")


Prompt: Explain quantum computing in simple terms:

Generating response...

Generated response:
Explain quantum computing in simple terms:

Quantum computers use qubits to store and process data. When the computer is turned on, it acts as a classical computer that performs calculations using bits (1s and 0s). But when the computer is switched on, it can perform quantum computations. These calculations are performed by manipulating the qubits rather than the classical bits, which allows for more efficient processing. The most common application of quantum computing is in solving complex problems that would take a long time or be impossible to solve with classical computers.
