# Qwen

In [1]:
import time
import textworld
import numpy as np
import re

In [2]:
import torch
import accelerate
torch.set_default_device('cuda')
torch.cuda.device("cuda")
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision('high')

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-1.7B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [4]:
print(model.hf_device_map)

{'model.embed_tokens': 0, 'lm_head': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 0, 'model.layers.19': 0, 'model.layers.20': 0, 'model.layers.21': 0, 'model.layers.22': 0, 'model.layers.23': 0, 'model.layers.24': 'cpu', 'model.layers.25': 'cpu', 'model.layers.26': 'cpu', 'model.layers.27': 'cpu', 'model.norm': 'cpu', 'model.rotary_emb': 'cpu'}


In [5]:
think_command_id = 151668

In [7]:
# prepare the model input
prompt = "Explain what the optimal strategy for winning Space Invaders is in one sentence."
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
start = time.time()
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32000
)
end = time.time()
print(f"(inference took {(end - start):.3f} seconds)")

output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

# parsing thinking content
try:
    # index finding </think>
    index = len(output_ids) - output_ids[::-1].index(think_command_id)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

(inference took 5.292 seconds)
thinking content: 
content: The optimal strategy for winning Space Invaders is to shoot the invaders when they are in the top row, avoid being hit by their fire, and shoot them when they are in the middle of their formation to maximize damage while minimizing risk.


In [8]:
print(tokenizer.decode(generated_ids[0], skip_special_tokens = False))

<|im_start|>user
Explain what the optimal strategy for winning Space Invaders is in one sentence.<|im_end|>
<|im_start|>assistant
<think>

</think>

The optimal strategy for winning Space Invaders is to shoot the invaders when they are in the top row, avoid being hit by their fire, and shoot them when they are in the middle of their formation to maximize damage while minimizing risk.<|im_end|>


In [None]:
print(tokenizer.chat_template)

## Context size and shifting window

In [9]:
token_system = "<|im_start|>system\n"
token_endofturn = "<|im_end|>"
token_user = "<|im_start|>user\n"
token_assistant = "<|im_start|>assistant\n"

In [45]:
!tw-make custom --world-size 2 --quest-length 3 --nb-objects 10 --output tw_games/game.ulx -f -v --seed 123

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Global seed: 123
Game generated: /Main/LLM-PTG/Code/Playground/tw_games/game.ulx


In [46]:
import textworld.gym
env_id = textworld.gym.register_game('tw_games/game.ulx')
print(env_id)

tw-v15


In [47]:
env = textworld.gym.make(env_id)

In [15]:
system_prompt = """
You are an assistant playing a textual game.
You analyze the information given carefully and reply in the form \"verb noun\", e.g. \"open box\" or \"take key\".
/no_think
"""
context = token_system + system_prompt + token_endofturn

try:
    done = False
    env.reset()
    while not done:
        game_status = env.render(mode="text")
        print(game_status)
        context += token_user + game_status + token_endofturn + token_assistant
        
        start = time.time()    
        input_ids = tokenizer.encode(
            context,
            return_tensors="pt")
        
        output = model.generate(
            input_ids.to("cuda"),
            max_new_tokens=100,
            eos_token_id=tokenizer.eos_token_id
            )

        response = tokenizer.decode(output[0, input_ids.shape[1]:], skip_special_tokens=True).replace("<think>", "").replace("</think>", "").strip("\n")
        context += response + token_endofturn
        print(response)
        
        end = time.time()
        print(f"Inference took {(end - start):.3f} seconds")
        
        command = response if len(response.split()) <= 5 else "look around"
        game_state, score, done, infos = env.step(command)

    env.render()  # Final message.
except KeyboardInterrupt:
    pass  # Quit the game.

print("Played {} steps, scoring {} points.".format(game_state.moves, game_state.score))




                    ________  ________  __    __  ________        
                   |        \|        \|  \  |  \|        \       
                    \$$$$$$$$| $$$$$$$$| $$  | $$ \$$$$$$$$       
                      | $$   | $$__     \$$\/  $$   | $$          
                      | $$   | $$  \     >$$  $$    | $$          
                      | $$   | $$$$$    /  $$$$\    | $$          
                      | $$   | $$_____ |  $$ \$$\   | $$          
                      | $$   | $$     \| $$  | $$   | $$          
                       \$$    \$$$$$$$$ \$$   \$$    \$$          
              __       __   ______   _______   __        _______  
             |  \  _  |  \ /      \ |       \ |  \      |       \ 
             | $$ / \ | $$|  $$$$$$\| $$$$$$$\| $$      | $$$$$$$\
             | $$/  $\| $$| $$  | $$| $$__| $$| $$      | $$  | $$
             | $$  $$$\ $$| $$  | $$| $$    $$| $$      | $$  | $$
             | $$ $$\$$\$$| $$  | $$| $$$$$$$\| $$      | $

AttributeError: 'str' object has no attribute 'moves'

# Test with cumulative score

In [60]:
import textworld

# create a game
!tw-make tw-simple --rewards dense --goal detailed --seed 18 --test --silent -f --output games/test-game.z8

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [61]:
# create a play function for playing + recording scores

import os
from glob import glob

import textworld.gym

import torch


def play(agent, path, max_step=100, nb_episodes=10, verbose=True):
    torch.manual_seed(46)  # For reproducibility when using action sampling.

    infos_to_request = agent.infos_to_request
    infos_to_request.max_score = True  # Needed to normalize the scores.

    gamefiles = [path]
    if os.path.isdir(path):
        gamefiles = glob(os.path.join(path, "*.z8"))

    env_id = textworld.gym.register_games(gamefiles,
                                          request_infos=infos_to_request,
                                          max_episode_steps=max_step)
    env = textworld.gym.make(env_id)  # Create a Gym environment to play the text game.
    if verbose:
        if os.path.isdir(path):
            print(os.path.dirname(path), end="")
        else:
            print(os.path.basename(path), end="")

    # Collect some statistics: nb_steps, final reward.
    avg_moves, avg_scores, avg_norm_scores = [], [], []
    for no_episode in range(nb_episodes):
        obs, infos = env.reset()  # Start new episode.

        score = 0
        done = False
        nb_moves = 0
        while not done:
            command = agent.act(obs, score, done, infos)
            obs, score, done, infos = env.step(command)
            nb_moves += 1

        agent.act(obs, score, done, infos)  # Let the agent know the game is done.

        if verbose:
            print(".", end="")
        avg_moves.append(nb_moves)
        avg_scores.append(score)
        avg_norm_scores.append(score / infos["max_score"])

    env.close()
    if verbose:
        if os.path.isdir(path):
            msg = "  \tavg. steps: {:5.1f}; avg. normalized score: {:4.1f} / {}."
            print(msg.format(np.mean(avg_moves), np.mean(avg_norm_scores), 1))
        else:
            msg = "  \tavg. steps: {:5.1f}; avg. score: {:4.1f} / {}."
            print(msg.format(np.mean(avg_moves), np.mean(avg_scores), infos["max_score"]))

In [66]:
# create agents

from typing import Mapping, Any
import numpy as np
import textworld.gym

class RandomAgent(textworld.gym.Agent):
    """ Agent that randomly selects a command from the admissible ones. """
    def __init__(self, seed=1234):
        self.seed = seed
        self.rng = np.random.RandomState(self.seed)

    @property
    def infos_to_request(self) -> textworld.EnvInfos:
        return textworld.EnvInfos(admissible_commands=True)

    def act(self, obs: str, score: int, done: bool, infos: Mapping[str, Any]) -> str:
        return self.rng.choice(infos["admissible_commands"])

class HFAgent(textworld.gym.Agent):
    """LLM from HuggingFace that acts as an agent."""
    model = None
    tokenizer = None
    context = ""
    token_system = "<|im_start|>system\n"
    token_endofturn = "<|im_end|>\n"
    token_user = "<|im_start|>user\n"
    token_assistant = "<|im_start|>assistant\n"
    system_prompt = """
You are an assistant playing a textual game.
The user gives you information on the environment and you reply exclusively in the form \"verb noun\", like \"open box\" or \"take key\".
/no_think
"""
    first_move = False
    
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.initialize_context()

    def initialize_context(self):
        self.context = self.token_system + self.system_prompt + self.token_endofturn
        self.first_move = True

    @property
    def infos_to_request(self) -> textworld.EnvInfos:
        return textworld.EnvInfos(admissible_commands=True)

    def act(self, obs: str, score: int, done: bool, infos: Mapping[str, Any]) -> str:

        if done:
            self.initialize_context() # resets context
        if first_move:
            first_move = False
            return "help"
        
        try:
            self.context += self.token_user + obs + self.token_endofturn
            self.context += self.token_assistant # induces model to generate answer
            
            input_ids = self.tokenizer.encode(
                self.context,
                return_tensors = "pt")
            
            generated_ids = self.model.generate(
                input_ids.to("cuda"),
                max_new_tokens = 100,
                eos_token_id = self.tokenizer.eos_token_id
                )
            output_ids = generated_ids[0][len(input_ids[0]):].tolist() 
            
            # parsing thinking content
            try:
                # index finding </think>
                index = len(output_ids) - output_ids[::-1].index(think_command_id)
            except ValueError:
                index = 0
            response = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
            
            self.context += response + self.token_endofturn

            if len(response.split()) <= 6:
                command = response
            else:
                command = "look"
                print("looking around...")
            return command
            
        except KeyboardInterrupt:
            pass  # Try stopping the game prematurely.

In [67]:
# make the random agent play
nb_episodes = 10
max_steps = 50

play(RandomAgent(), "./games/test-game.z8", max_steps = 50)

test-game.z8..........  	avg. steps: 100.0; avg. score:  4.2 / 10.


In [68]:
# make Qwen play

start = time.time()
play(HFAgent(model, tokenizer), "./games/test-game.z8", max_steps=max_steps, nb_episodes=nb_episodes)
end = time.time()
print(f"Model took {((end - start)/60):.3f} min to play {nb_episodes} games")

test-game.z8..........  	avg. steps: 100.0; avg. score:  1.2 / 10.
--- Model took 35.755 min to play 10 games
