In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor
from transformers import BitsAndBytesConfig
from pathlib import Path
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Hugging Face Hub identifier of the checkpoint used throughout the notebook.
model_name = "Qwen/Qwen3-4B-Instruct-2507"

# 4-bit NF4 quantization with double quantization.
# The option is spelled `bnb_4bit_compute_dtype` (not `..._compute_type`);
# a misspelled keyword is not applied, which leaves the compute dtype at the
# library default instead of bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Each component is loaded in its own try/except and falls back to None so
# that a later cell can check for None and report the failure instead of
# this cell aborting with a traceback.
try:
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to run locally — confirm it is actually required here.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cuda",
        quantization_config=bnb_config,
        trust_remote_code=True
    ).eval()  # inference only: switch off training-mode behaviour

    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    model = None

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    print("Tokenizer loaded successfully")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    tokenizer = None

try:
    processor = AutoProcessor.from_pretrained(model_name)
    print("Processor loaded successfully")
except Exception as e:
    print(f"Error loading processor: {e}")
    processor = None

Loading checkpoint shards: 100%|██████████| 3/3 [00:52<00:00, 17.50s/it]


Model loaded successfully
Tokenizer loaded successfully
Processor loaded successfully


In [3]:
# Wrap the model with torch.compile. The call returns a wrapper object (the
# cell output shows an OptimizedModule holding the original as `_orig_mod`);
# its return value is not assigned, so the name `model` used by later cells
# still refers to the object created in the loading cell.
# NOTE(review): if compiled execution is intended during generation, the
# result has to be kept and used — confirm intent before changing this.
torch.compile(model)

OptimizedModule(
  (_orig_mod): Qwen3ForCausalLM(
    (model): Qwen3Model(
      (embed_tokens): Embedding(151936, 2560)
      (layers): ModuleList(
        (0-35): 36 x Qwen3DecoderLayer(
          (self_attn): Qwen3Attention(
            (q_proj): Linear4bit(in_features=2560, out_features=4096, bias=False)
            (k_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
            (v_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
            (o_proj): Linear4bit(in_features=4096, out_features=2560, bias=False)
            (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
            (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          )
          (mlp): Qwen3MLP(
            (gate_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
            (up_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
            (down_proj): Linear4bit(in_features=9728, out_features=2560, bias=False)
            (act_fn): SiLUActivation()
     

In [4]:
# System message placed at the start of every conversation.
# (A stray trailing apostrophe used to leak into the prompt text; removed.)
SYSTEM_PROMPT = """You are a helpful assistant that provides detailed and informative answers to user queries.
When responding, ensure that your answers are clear and easy to understand."""

def chat_with_ai(model, tokenizer, processor):
    """Run an interactive multi-turn chat loop on stdin/stdout.

    The conversation starts with SYSTEM_PROMPT as the system message. Each
    user turn is appended to the history, the whole history is rendered with
    the tokenizer's chat template, a sampled reply is generated, printed
    together with throughput statistics, and appended to the history.

    Args:
        model: Causal language model exposing ``generate``. Inputs are moved
            to ``model.device`` when that attribute exists, else to "cuda".
        tokenizer: Tokenizer providing ``apply_chat_template``, ``decode``,
            ``pad_token_id`` and ``eos_token_id``.
        processor: Accepted for interface compatibility; not used.

    Returns:
        None. The loop ends when the user types 'exit' (case-insensitive,
        surrounding whitespace ignored) or when input is closed/interrupted.
    """
    # Initialize conversation history
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT}
    ]

    # Follow the model's actual placement rather than assuming a CUDA device.
    device = getattr(model, "device", "cuda")

    # generate() needs a concrete pad id; fall back to EOS if none is defined.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    print("Chat initialized. Type 'exit' to end the conversation.")

    while True:
        # Get user input; a closed stdin or an interrupt ends the chat cleanly.
        try:
            user_message = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break

        # Ignore blank lines instead of sending an empty turn to the model.
        if not user_message:
            continue

        if user_message.lower() == 'exit':
            break

        # Add user message to conversation history
        conversation.append({"role": "user", "content": user_message})

        # Render the full history with the chat template. return_dict=True is
        # requested explicitly so the result is a mapping whatever the library
        # default is, and so the tokenizer's own attention mask can be used.
        inputs = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            enable_thinking=True,
            return_tensors="pt",
            return_dict=True,
        )

        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        prompt_length = input_ids.shape[1]

        # Generate response and measure time. perf_counter is a monotonic,
        # high-resolution clock meant for measuring intervals.
        start_time = time.perf_counter()
        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=8192,
                temperature=0.7,
                top_p=0.7,
                repetition_penalty=1.2,
                do_sample=True,
                use_cache=True,
                pad_token_id=pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        elapsed = time.perf_counter() - start_time

        # generate() returns prompt + continuation; keep only the continuation.
        new_tokens = output[0][prompt_length:]
        assistant_response = tokenizer.decode(new_tokens, skip_special_tokens=True)

        # Print the response
        print(f"AI: {assistant_response}")

        # Calculate and print tokens per second
        num_tokens = int(new_tokens.shape[0])
        tps = num_tokens / elapsed if elapsed > 0 else 0
        print(f"Tokens generated: {num_tokens} | Time: {elapsed:.2f}s | Tokens/sec: {tps:.2f}")

        # Add assistant response to conversation history
        conversation.append({"role": "assistant", "content": assistant_response})


In [5]:
# Start the chat only when both the model and the tokenizer are available;
# otherwise print a short troubleshooting checklist. The processor is passed
# through as-is and is not part of the readiness check.
components_ready = model is not None and tokenizer is not None

if components_ready:
    chat_with_ai(model, tokenizer, processor)
else:
    troubleshooting_lines = (
        "Error: Model or tokenizer failed to load. Please check the error messages above and verify:",
        "1. The model path exists and is correct",
        f"2. You have sufficient GPU memory (this model requires ~{15} GB VRAM)",
        "3. The model files are not corrupted",
    )
    for message in troubleshooting_lines:
        print(message)


Chat initialized. Type 'exit' to end the conversation.
AI: We want to compute the integral:

$$
\int \frac{\arctan(x)}{x^2} \, dx
$$

---

### Step 1: Use Integration by Parts

Let’s use **integration by parts**, which is useful when we have a product of functions — one of them being an inverse trig function like $\arctan x$, and another simple function.

Recall:
$$
\int u \, dv = uv - \int v \, du
$$

Set:
- $u = \arctan(x)$ → then $du = \dfrac{1}{1+x^2}\,dx$
- $dv = \dfrac{1}{x^2} \, dx$ → so $v = \int x^{-2} \, dx = -\dfrac{1}{x}$

Now apply integration by parts:

$$
\int \frac{\arctan(x)}{x^2} \, dx = -\frac{\arctan(x)}{x} + \int \frac{1}{x(1+x^2)} \, dx
$$

So now our problem reduces to evaluating:
$$
\int \frac{1}{x(1+x^2)} \, dx
$$

---

### Step 2: Partial Fraction Decomposition

Decompose:
$$
\frac{1}{x(1+x^2)} = \frac{A}{x} + \frac{Bx + C}{1+x^2}
$$

Multiply both sides by $x(1+x^2)$:

$$
1 = A(1+x^2) + (Bx+C)(x)
$$

Expand right-hand side:
$$
1 = A + Ax^2 + Bx^2 + Cx = A + (