In [1]:
import getpass
import os

# Enter your key
key = getpass.getpass('Enter your ANTHROPIC API key: ')

# Set the environment variable without displaying the full key
os.environ['ANTHROPIC_API_KEY'] = key

Enter your ANTHROPIC API key:  ········


In [2]:
%load_ext jupyter_ai

In [2]:
%load_ext jupyter_distributed

Jupyter Distributed extension with persistent processes loaded.


### Load data

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [5]:
data_path = "../data-ops/datasets/downloads/livecodebench-code_generation_lite--main-test6.jsonl"
dset = load_dataset("json", data_files=data_path)["train"]

In [16]:
system = "You are a helpful and harmless assistant."
prompt_template = """Write python code to solve the following problem:

{question_content}

Your solution should follow the format

```python
{starter_code}...
```"""

### Load model

In [3]:
%%distribute 8
import torch.distributed as dist
dist.destroy_process_group()

Distributing execution across 8 persistent processes...
Successfully executed in all 8 processes
Execution time: 1.39 seconds


In [4]:
%%distribute 8
WORLD_SIZE = 8
RANK = __process_id__
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch.distributed as dist
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'
os.environ['LOCAL_RANK'] = str(RANK)
dist.init_process_group('nccl', rank=RANK, world_size=WORLD_SIZE)

Distributing execution across 8 persistent processes...
Successfully executed in all 8 processes
Execution time: 2.96 seconds


In [9]:
%%distribute 8
model_name = "Qwen/Qwen3-32B"
tp_plan = {
    "model.layers.*.self_attn.q_proj": "colwise",
    "model.layers.*.self_attn.k_proj": "colwise",
    "model.layers.*.self_attn.v_proj": "colwise",
    "model.layers.*.self_attn.o_proj": "rowwise",
    "model.layers.*.mlp.up_proj": "colwise",
    "model.layers.*.mlp.gate_proj": "colwise",
    "model.layers.*.mlp.down_proj": "rowwise",
    "lm_head": "colwise_rep",
}
model = AutoModelForCausalLM.from_pretrained(model_name, tp_plan=tp_plan)

Distributing execution across 8 persistent processes...
[Process 6] Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]
[Process 0] Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]
[Process 4] Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]
[Process 0] Loading checkpoint shards:   6%|5         | 1/17 [00:00<00:05,  3.11it/s]
[Process 1] Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]
[Process 3] Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]
[Process 6] Loading checkpoint shards:   6%|5         | 1/17 [00:00<00:05,  3.14it/s]
[Process 5] Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]
[Process 7] Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]
[Process 2] Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]
[Process 6] Loading checkpoint shards:  12%|#1        | 2/17 [00:00<00:03,  4.64it/s]
[Process 0] Loading checkpoint shards:  12%|#1        | 2/17 [

In [10]:
%%distribute 8
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt = "Can I help"
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=model.device)

Distributing execution across 8 persistent processes...
[Process 6] Device set to use cuda:6

[Process 7] Device set to use cuda:7

[Process 0] Device set to use cuda:0

[Process 2] Device set to use cuda:2

[Process 3] Device set to use cuda:3

[Process 4] Device set to use cuda:4

[Process 5] Device set to use cuda:5

[Process 1] Device set to use cuda:1

Successfully executed in all 8 processes
Execution time: 0.58 seconds


In [11]:
%%distribute 8
output = pipe(prompt, max_length=10, max_new_tokens=10)
print(output)

Distributing execution across 8 persistent processes...
[Process 0] Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

[Process 1] Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

[Process 2] Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max lengt

In [6]:
%%distribute 8
print('hello')

Distributing execution across 8 persistent processes...
[Process 0] hello
[Process 1] hello
[Process 2] hello
[Process 3] hello
[Process 4] hello
[Process 5] hello
[Process 6] hello
[Process 7] hello
[Process 0] 

[Process 1] 

[Process 2] 

[Process 3] 

[Process 4] 

[Process 5] 

[Process 6] 

[Process 7] 

Successfully executed in all 8 processes
Execution time: 0.01 seconds


In [3]:
%%distribute 8 --debug
print('hello')

Distributing execution across 8 persistent processes...
[MAIN] at 0:0.00000: Starting to send tasks to 8 workers

[MAIN] at 0:0.00011: Task sent to worker 0

[MAIN] at 0:0.00016: Task sent to worker 1

[MAIN] at 0:0.00020: Task sent to worker 2

[MAIN] at 0:0.00025: Task sent to worker 3

[MAIN] at 0:0.00030: Task sent to worker 4

[MAIN] at 0:0.00035: Task sent to worker 5

[MAIN] at 0:0.00039: Task sent to worker 6

[MAIN] at 0:0.00044: Task sent to worker 7

[MAIN] at 0:0.00047: Starting result collection

[Process 0] [DIAGNOSTIC] Process 0 at 0:4.61248: Received task, starting execution

[Process 1] [DIAGNOSTIC] Process 1 at 0:4.60700: Received task, starting execution

[Process 2] [DIAGNOSTIC] Process 2 at 0:4.60466: Received task, starting execution

[Process 3] [DIAGNOSTIC] Process 3 at 0:4.60637: Received task, starting execution

[Process 4] [DIAGNOSTIC] Process 4 at 0:4.59924: Received task, starting execution

[Process 5] [DIAGNOSTIC] Process 5 at 0:4.59880: Received task, s

In [1]:
%load_ext jupyter_distributed

Jupyter Distributed extension with persistent processes loaded.


In [2]:
%%distribute 2
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

Distributing execution across 2 persistent processes...
[Process 0] Worker process 0 started with PID 2914640

[Process 1] Worker process 1 started with PID 2914641

Successfully executed in all 2 processes
Execution time: 3.45 seconds


In [5]:
%%distribute 2
print(pipeline)

Distributing execution across 2 persistent processes...
[Process 0] <function pipeline at 0x70ea592858a0>
[Process 1] <function pipeline at 0x720bf0fa58a0>
[Process 0] 

[Process 1] 

Successfully executed in all 2 processes
Execution time: 0.00 seconds


In [4]:
%%distribute 2
import time
time.sleep(5.0)

Distributing execution across 2 persistent processes...

Execution interrupted by user. Sending interrupt to workers...
Process 0 interrupted successfully.
Process 1 interrupted successfully.
Execution time: 1.12 seconds


In [7]:
%%distribute 2
model2 = AutoModelForCausalLM.from_pretrained('Qwen/Qwen3-32B-AWQ', device_map=f'cuda:{__process_id__}')

Distributing execution across 2 persistent processes...
I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/


I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to th

In [8]:
%%distribute 2
print(model2)

Distributing execution across 2 persistent processes...
[Process 0] Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 5120)
    (layers): ModuleList(
      (0-63): 64 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): WQLinear_GEMM(in_features=5120, out_features=8192, bias=False, w_bit=4, group_size=128)
          (k_proj): WQLinear_GEMM(in_features=5120, out_features=1024, bias=False, w_bit=4, group_size=128)
          (v_proj): WQLinear_GEMM(in_features=5120, out_features=1024, bias=False, w_bit=4, group_size=128)
          (o_proj): WQLinear_GEMM(in_features=8192, out_features=5120, bias=False, w_bit=4, group_size=128)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): WQLinear_GEMM(in_features=5120, out_features=25600, bias=False, w_bit=4, group_size=128)
          (up_proj): WQLinear_GEMM(in_features=5120, out_featur