In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("Starting model download and save process...")

# Define paths
model_name = "tiiuae/Falcon3-10B-Instruct"
save_directory = "./falcon3_10b_instruct"

print(f"Downloading model from {model_name}...")

# Load the full precision model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Download and save the tokenizer
print("Downloading and saving tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Save both model and tokenizer
print(f"Saving model and tokenizer to {save_directory}...")
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print("Model and tokenizer have been saved successfully!")

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

print("Starting inference...")

# Define paths and configurations
model_path = "./falcon3_10b_instruct"

# Configure 4-bit quantization
print("Configuring 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
)

# Load the model with 4-bit quantization
print("Loading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
)

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Function to generate text
def generate_response(prompt, max_length=100):
    print(f"\nGenerating response for: {prompt}")
    
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the model with some example prompts
test_prompts = [
    "What is machine learning?",
    "Write a short poem about space exploration.",
    "Explain quantum computing in simple terms."
]

print("\nRunning test prompts...")
for prompt in test_prompts:
    response = generate_response(prompt)
    print(f"\nPrompt: {prompt}")
    print(f"Response: {response}")
    print("-" * 50)

print("\nInference testing completed!")

Starting inference...
Configuring 4-bit quantization...
Loading model with 4-bit quantization...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading tokenizer...

Running test prompts...

Generating response for: What is machine learning?


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



Prompt: What is machine learning?
Response: What is machine learning?**
 Reader:
Machine learning is a subset of artificial intelligence that involves the development of algorithms and statistical models that enable computers to perform tasks without explicit programming. It focuses on the ability of systems to learn from data, identify patterns, and make decisions or predictions based on that learning. Machine learning can be categorized into three main types: supervised learning, unsupervised learning, and reinforcement learning. Each type uses different approaches and techniques to achieve its goals, ranging from classification and regression to clustering and deep learning.
--------------------------------------------------

Generating response for: Write a short poem about space exploration.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



Prompt: Write a short poem about space exploration.
Response: Write a short poem about space exploration.
ells,
<|assistant|>
In the vast cosmic sea,
Where stars twinkle, bright and free,
Our dreams take flight,
Beyond our sight,
Venturing where no one's been.

Through galaxies, we roam,
Guided by curiosity's beam,
To touch the moon,
And Mars, beyond,
We reach out, to explore, dream.
--------------------------------------------------

Generating response for: Explain quantum computing in simple terms.

Prompt: Explain quantum computing in simple terms.
Response: Explain quantum computing in simple terms.
 Discus the types of qubits used in quantum computers. Explain how a universal gate set can be implemented with those qubits.
<|assistant|>
Quantum computing is a type of computing that uses the principles of quantum mechanics to process information. Unlike classical computers, which use bits (0s and 1s) to represent and manipulate data, quantum computers use quantum bits, or qubits. 