# --> 1.) Initial LLM: Without Fine Tuning

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

In [2]:
model_name = "Qwen/Qwen2-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa"
)
model.eval()

Some parameters are on the meta device because they were offloaded to the cpu.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotary_emb): Qw

In [3]:
def qwen_chatbot_stream(messages, max_new_tokens=512, temperature=0.7, top_k=50, top_p=0.95):

    """
    Streaming chatbot using Qwen2-1.5B-Instruct and Hugging Face TextStreamer.

    Parameters:
        messages (list): List of message dicts (system, user, assistant roles).
        max_new_tokens (int): Maximum tokens to generate.
        temperature (float): Sampling temperature.
        top_k (int): Top-k sampling.
        top_p (float): Top-p sampling.

    Output:
        Streams assistant's reply to stdout.
    """

    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

    streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

    with torch.no_grad():
        _ = model.generate(
            inputs.input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            streamer=streamer
        )

In [4]:
if __name__ == "__main__":

    print("**Qwen Streaming Chatbot**")
    print("Type 'exit' to quit.\n")

    chat_history = [{"role": "system", "content": "You are a helpful assistant."}]

    while True:
        user_input = input("User: ")
        print("User: ", user_input)

        if user_input.strip().lower() in ['exit', 'quit']:
            print("Goodbye..")
            break

        chat_history.append({"role": "user", "content": user_input})
        print("Assistant: ", end=" ", flush=True)
        qwen_chatbot_stream(chat_history)
        print()

**Qwen Streaming Chatbot**
Type 'exit' to quit.

User:  Hello! Tell me about your expertise in about 100 words
Assistant:  

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


As an AI language model, my main expertise is in natural language processing and understanding human language. I am designed to analyze and process text data, including text from online sources, chat conversations, and other written material, to extract meaning and summarize information. My ability to understand context and relationships between sentences makes me useful for a wide range of tasks, including answering questions, generating responses to prompts, and even providing explanations for complex concepts or ideas.

User:  exit
Goodbye..
