# CafChem Teaching - A chatbot using OpenAI's open-weights model, GPT-OSS-20B.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MauricioCafiero/CafChemTeach/blob/main/notebooks/OpenAI_Chatbot_CafChem.ipynb)

## This notebook allows you to:
- Download the weights for the OpenAI GPT-OSS-20B model.
- Interact with the model via a chatbot.

## Requirements:
- If using on Colab, it will (un)install all needed libraries.
- Access to your HF token
- Needs a GPU, L4 or higher preferably (will be quite slow on T4).

## Set-up

The OpenAI open models have quite complicated dependencies, so several packages have to be installed or uninstalled.

In [None]:
 !pip install -q --upgrade torch
 !pip install openai-harmony

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m888.1/888.1 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m123.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m706.8/706.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install -q git+https://github.com/huggingface/transformers triton==3.4 git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Building wheel for triton_kernels (pyproject.toml) ... [?25l[?25hdone


In [None]:
!pip uninstall -q torchvision torchaudio -y

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.1


In [None]:
pip install kernels

Collecting kernels
  Downloading kernels-0.9.0-py3-none-any.whl.metadata (3.3 kB)
Downloading kernels-0.9.0-py3-none-any.whl (37 kB)
Installing collected packages: kernels
Successfully installed kernels-0.9.0


In [None]:
import os
import re
import transformers
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Mxfp4Config
import triton
import kernels

## Functions
- These functions set up the model and power the chatbot.

In [None]:
# Conversation state shared by chat() and clear_history().
# chat_history holds the message dicts that are fed to the chat template.
chat_history = []
# last_id is only ever reset by clear_history(); nothing visible here reads it.
last_id = None

def clear_history():
  '''
    Reset the module-level conversation state.

    Rebinds chat_history to a new empty list and last_id to None, so the
    next call to chat() starts a fresh conversation.
  '''
  global chat_history, last_id
  chat_history, last_id = [], None

def setup_free_openai():
  '''
    Download (or load from cache) the GPT-OSS-20B tokenizer and model.

    The model is loaded onto the CUDA device with torch_dtype="auto" and an
    Mxfp4Config created with dequantize=False.

      Returns:
        tokenizer: the tokenizer for "openai/gpt-oss-20b"
        model: the causal language model for "openai/gpt-oss-20b"
  '''
  repo_name = "openai/gpt-oss-20b"

  mxfp4_settings = Mxfp4Config(dequantize=False)
  loaded_tokenizer = AutoTokenizer.from_pretrained(repo_name)

  loaded_model = AutoModelForCausalLM.from_pretrained(
          repo_name,
          quantization_config=mxfp4_settings,
          torch_dtype="auto",
          device_map="cuda")

  return loaded_tokenizer, loaded_model

def _strip_harmony_tokens(text):
  '''
    Cut a decoded message body at the first control token that terminates it.
  '''
  return re.split(r"<\|(?:end|return|call|start)\|>", text, maxsplit=1)[0]


def _split_harmony_response(response):
  '''
    Split decoded model output into (answer, reasoning).

    The body after the last <|message|> marker is the answer and the body
    before it (if any) is the reasoning. Output without any marker is
    returned as the answer with all <|...|> tokens removed.
  '''
  segments = response.split("<|message|>")
  # segments[0] is the header before the first message, never a body.
  bodies = [_strip_harmony_tokens(segment) for segment in segments[1:]]

  if not bodies:
    return re.sub(r"<\|[^|>]*\|>", "", response), ""

  answer = bodies[-1]
  reasoning = bodies[-2] if len(bodies) > 1 else ""
  return answer, reasoning


def chat(prompt, reason, max_new_tokens=1000):
  '''
    Send one user message to the model and record the exchange.

    Uses the module-level tokenizer and model, and appends the user and
    assistant messages to the module-level chat_history.

      Args:
        prompt: the user's message text
        reason: reasoning effort for this turn ("low", "medium" or "high")
        max_new_tokens: upper limit on the number of generated tokens
      Returns:
        an empty string (used to clear the input box), the updated
        chat_history, and the reasoning text for this turn
  '''
  global chat_history

  # Nothing to send: leave the conversation untouched instead of generating.
  if not prompt or not prompt.strip():
    return "", chat_history, ""

  # The effort stays on the message as a record of what was requested; the
  # value that takes effect is the keyword argument passed to the template.
  chat_history.append({"role": "user", "content": prompt, "reasoning_effort": reason})

  inputs = tokenizer.apply_chat_template(
                      chat_history,
                      add_generation_prompt=True,
                      return_tensors="pt",
                      return_dict=True,
                      reasoning_effort=reason).to(model.device)

  generated = model.generate(**inputs, max_new_tokens=max_new_tokens)
  # Decode only the newly generated tokens, keeping the special tokens that
  # are needed to separate reasoning from answer.
  response = tokenizer.decode(generated[0][inputs["input_ids"].shape[-1]:])

  answer, reasoning = _split_harmony_response(response)

  chat_history.append(
              {"role": "assistant", "content": answer, "reasoning": reasoning}
  )
  print(answer)

  return "", chat_history, reasoning


def chatbot():
  '''
    Build the Gradio chat interface and launch it with share=True.

    The interface has a reasoning-level selector, the chat window, a message
    box, a read-only style reasoning box, a Send button and a Clear button.
    Both the Send button and pressing enter in the message box call chat();
    the Clear button empties the widgets and calls clear_history().
  '''
  with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Chat with OpenAI GPT-OSS-20B.
        - If using high reasoning, responses can take a minute or longer!
        ### Enter your messages below.
        """)

    # Components are created in display order.
    effort_picker = gr.Radio(label="Reasoning Level",
                             choices=["low", "medium", "high"],
                             value="medium",
                             interactive=True)
    conversation = gr.Chatbot(type="messages")
    user_box = gr.Textbox(label="Type your messages here and hit enter.")
    reasoning_box = gr.Textbox(label="Reasoning", value="")
    send_button = gr.Button(value="Send")

    reset_button = gr.ClearButton([user_box, conversation, reasoning_box])
    reset_button.click(clear_history)

    # Same handler for the Send button and for enter in the message box.
    for register in (send_button.click, user_box.submit):
      register(chat,
               [user_box, effort_picker],
               [user_box, conversation, reasoning_box])

  demo.launch(share=True)

## Download the model weights and tokenizer from HuggingFace

In [None]:
# Load the tokenizer and model once; chat() reads these two module-level names.
tokenizer, model = setup_free_openai()

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/27.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00000-of-00002.safetensors:   0%|          | 0.00/4.79G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.80G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.17G [00:00<?, ?B/s]

Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]

__init__.py:   0%|          | 0.00/179 [00:00<?, ?B/s]

_finalize_matmul.py: 0.00B [00:00, ?B/s]

_masked_compaction.py:   0%|          | 0.00/814 [00:00<?, ?B/s]

_common.py: 0.00B [00:00, ?B/s]

matmul_ogs.py: 0.00B [00:00, ?B/s]

__init__.cpython-312.pyc:   0%|          | 0.00/220 [00:00<?, ?B/s]

_ops.py:   0%|          | 0.00/201 [00:00<?, ?B/s]

compaction.py: 0.00B [00:00, ?B/s]

_matmul_ogs.py: 0.00B [00:00, ?B/s]

_p_matmul_ogs.py: 0.00B [00:00, ?B/s]

opt_flags.py: 0.00B [00:00, ?B/s]

opt_flags_amd.py: 0.00B [00:00, ?B/s]

opt_flags_nvidia.py: 0.00B [00:00, ?B/s]

numerics.py: 0.00B [00:00, ?B/s]

flexpoint.py: 0.00B [00:00, ?B/s]

mxfp.py: 0.00B [00:00, ?B/s]

_downcast_to_mxfp.py: 0.00B [00:00, ?B/s]

_upcast_from_mxfp.py: 0.00B [00:00, ?B/s]

reduce_bitmatrix.py: 0.00B [00:00, ?B/s]

routing.py: 0.00B [00:00, ?B/s]

_expt_data.py: 0.00B [00:00, ?B/s]

_routing_compute.py: 0.00B [00:00, ?B/s]

proton_opts.py:   0%|          | 0.00/456 [00:00<?, ?B/s]

specialize.py: 0.00B [00:00, ?B/s]

swiglu.py: 0.00B [00:00, ?B/s]

_swiglu.py: 0.00B [00:00, ?B/s]

target_info.py: 0.00B [00:00, ?B/s]

layout.py: 0.00B [00:00, ?B/s]

tensor.py: 0.00B [00:00, ?B/s]

base.py:   0%|          | 0.00/352 [00:00<?, ?B/s]

hopper_scale.py: 0.00B [00:00, ?B/s]

hopper_value.py: 0.00B [00:00, ?B/s]

strided.py:   0%|          | 0.00/337 [00:00<?, ?B/s]

testing.py: 0.00B [00:00, ?B/s]

_topk_backward.py: 0.00B [00:00, ?B/s]

blackwell_scale.py: 0.00B [00:00, ?B/s]

topk.py: 0.00B [00:00, ?B/s]

_topk_forward.py: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/165 [00:00<?, ?B/s]

## App
- To run fullscreen, click on the link below (after `* Running on public URL:`).
- You can also share this link with others; it will work as long as your notebook is running.

In [None]:
# Build the Gradio interface and launch it; launch uses share=True, so a public URL is printed.
chatbot()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9a445b0e0782b74a5c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
