<a href="https://colab.research.google.com/github/Manoj-11-Dahal/oxen-website/blob/master/Welcome_to_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ============================================
# CELL 1: Check GPU & System Info
# ============================================
# Print the NVIDIA driver/GPU table. The shell reports "command not found"
# when the tool is absent (presumably a runtime without a GPU - TODO confirm).
!nvidia-smi
print("\n" + "="*50)
# Total system RAM as reported by the kernel (value is in kB).
!cat /proc/meminfo | grep MemTotal
# Size and usage of the /dev/sda1 filesystem; prints nothing if no such device is mounted.
!df -h | grep /dev/sda1

/bin/bash: line 1: nvidia-smi: command not found

MemTotal:       13286956 kB
/dev/sda1        57G   23G   35G  39% /kaggle/input


In [3]:
# ============================================
# CELL 2: Install Dependencies (FIXED)
# ============================================

# Step 1: Install build tools first
!apt-get update
!apt-get install -y build-essential cmake

# Step 2: Install basic packages
!pip install modelscope gradio huggingface_hub -q

# Step 3: Install llama-cpp-python with CUDA (FIXED METHOD)
!pip uninstall llama-cpp-python -y 2>/dev/null

# Method A: Pre-built wheel (FASTEST - Try this first)
!pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121

print("‚úÖ Installation complete!")

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 https://cli.github.com/packages stable/main amd64 Packages [356 B]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [83.8 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [61.5 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,639 kB]
Get:14 https://

In [None]:
# ============================================
# CELL 3: Download Model from ModelScope
# ============================================
from modelscope import snapshot_download
import os

print("‚¨áÔ∏è Downloading model... This may take 10-30 minutes")

model_dir = snapshot_download(
    'TeichAI/Qwen3-14B-Claude-4.5-Opus-High-Reasoning-Distill-GGUF',
    local_dir='./model',
    revision='master'
)

print(f"\n‚úÖ Model downloaded to: {model_dir}")
print("\nüìÅ Files:")
!ls -lh ./model/

‚¨áÔ∏è Downloading model... This may take 10-30 minutes
Downloading Model from https://www.modelscope.cn to directory: /content/model


2026-01-22 12:38:37,924 - modelscope - INFO - Got 9 files, start to download ...


Processing 9 items:   0%|          | 0.00/9.00 [00:00<?, ?it/s]

Downloading [Qwen3-14B-Claude-4.5-Opus-Distill.bf16.gguf]:   0%|          | 0.00/27.5G [00:00<?, ?B/s]

Downloading [Qwen3-14B-Claude-4.5-Opus-Distill.q3_k_s.gguf]:   0%|          | 0.00/6.20G [00:00<?, ?B/s]

Downloading [Qwen3-14B-Claude-4.5-Opus-Distill.iq4_nl.gguf]:   0%|          | 0.00/8.01G [00:00<?, ?B/s]

Downloading [Qwen3-14B-Claude-4.5-Opus-Distill.q3_k_m.gguf]:   0%|          | 0.00/6.82G [00:00<?, ?B/s]

Downloading [Qwen3-14B-Claude-4.5-Opus-Distill.q4_k_m.gguf]:   0%|          | 0.00/8.38G [00:00<?, ?B/s]

Downloading [Qwen3-14B-Claude-4.5-Opus-Distill.q8_0.gguf]:   0%|          | 0.00/14.6G [00:00<?, ?B/s]

Downloading [qwen3-14b.F16.gguf]:   0%|          | 0.00/27.5G [00:00<?, ?B/s]

In [None]:
# ============================================
# CELL 4: Find and Select GGUF File
# ============================================
import os
import glob

# Look in the download directory itself first, then fall back to sub-folders.
# sorted() keeps the printed indices stable between runs (glob order is arbitrary).
gguf_files = sorted(glob.glob('./model/*.gguf'))

if not gguf_files:
    gguf_files = sorted(glob.glob('./model/**/*.gguf', recursive=True))

# Fail with an actionable message instead of an IndexError further down.
if not gguf_files:
    raise FileNotFoundError(
        "No .gguf files found under ./model - run the download cell first."
    )

print("üì¶ Available GGUF files:\n")
for i, f in enumerate(gguf_files):
    size = os.path.getsize(f) / (1024**3)
    print(f"  [{i}] {os.path.basename(f)} ({size:.2f} GB)")

# Select model (choose smaller quantization for free Colab)
# Q4_K_M recommended for T4 GPU, so prefer it when present and only fall back
# to the first file otherwise. To override, assign MODEL_PATH = gguf_files[<index>].
PREFERRED_QUANT = 'q4_k_m'
preferred = [f for f in gguf_files if PREFERRED_QUANT in os.path.basename(f).lower()]
MODEL_PATH = preferred[0] if preferred else gguf_files[0]

print(f"\n‚úÖ Selected: {MODEL_PATH}")

In [None]:
# ============================================
# CELL 5: Load Model into Memory
# ============================================
from llama_cpp import Llama

print("üîÑ Loading model... Please wait (2-5 minutes)")

# Create the llama.cpp model object from the GGUF file selected earlier.
llm = Llama(
    verbose=False,
    n_batch=512,             # batch size
    n_threads=4,             # CPU threads
    n_gpu_layers=-1,         # -1 = use all GPU layers
    n_ctx=4096,              # context window
    model_path=MODEL_PATH,
)

print("‚úÖ Model loaded successfully!")
print("üìä Context size: 4096 tokens")

In [None]:
# ============================================
# CELL 6: Quick Test
# ============================================
print("üß™ Testing model...\n")

# A single short user turn is enough to check that the model answers.
test_response = llm.create_chat_completion(
    temperature=0.7,
    max_tokens=100,
    messages=[{"role": "user", "content": "Hello! Who are you?"}],
)

# The reply text is read from the message of the first returned choice.
print("Response:", test_response['choices'][0]['message']['content'])
print("\n‚úÖ Model is working!")

In [None]:
# ============================================
# CELL 7: Full Gradio Web Interface
# ============================================
import gradio as gr

def generate_response(message, history, system_prompt, temperature, max_tokens, top_p):
    """Stream the assistant's reply to ``message`` as a growing string.

    Args:
        message: Text of the new user turn.
        history: Earlier turns. Each item is either a ``[user, assistant]``
            pair or a ``{"role": ..., "content": ...}`` dict; empty or
            ``None`` parts are skipped. ``None`` is treated as no history.
        system_prompt: Optional system instruction; ignored when blank.
        temperature: Sampling temperature, passed through to the model.
        max_tokens: Maximum number of tokens to generate; coerced to ``int``.
        top_p: Nucleus-sampling threshold, passed through to the model.

    Yields:
        The reply accumulated so far, once per streamed chunk that carries text.
    """
    messages = []

    # Add system prompt
    if system_prompt and system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})

    # Add conversation history (pair format or role/content dict format)
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            # Only plain-text entries can be forwarded to the model as-is.
            if role and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
            continue
        human, assistant = turn
        if human:
            messages.append({"role": "user", "content": human})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})

    # Add current message
    messages.append({"role": "user", "content": message})

    # Generate with streaming
    response = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=int(max_tokens),
        top_p=top_p,
        stream=True
    )

    partial_message = ""
    for chunk in response:
        choices = chunk.get('choices') or []
        if not choices:
            continue
        content = choices[0].get('delta', {}).get('content')
        # Chunks without text (role-only, final, or content=None) must not be
        # concatenated: adding None to a str raises TypeError.
        if content is None:
            continue
        partial_message += content
        yield partial_message


def clear_conversation():
    """Return ``(None, "")``: an empty chat value paired with an empty text value."""
    emptied_chat, emptied_text = None, ""
    return emptied_chat, emptied_text


# Build UI
with gr.Blocks(
    title="Qwen3-14B Chat",
    theme=gr.themes.Soft(),
    css="""
    .container { max-width: 900px; margin: auto; }
    footer { display: none !important; }
    """
) as demo:

    gr.Markdown("""
    # ü§ñ Qwen3-14B Chat
    ### Running on Google Colab with GPU Acceleration
    ---
    """)

    with gr.Row():
        # Main chat area
        with gr.Column(scale=4):
            # avatar_images is deliberately not passed: Gradio expects image
            # file paths or URLs there, and the emoji strings used before are
            # neither, so they cannot be resolved to an image.
            chatbot = gr.Chatbot(
                height=500,
                show_label=False,
                bubble_full_width=False
            )

            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here... (Press Enter to send)",
                    lines=2,
                    scale=4
                )
                submit_btn = gr.Button("Send üì§", variant="primary", scale=1)

            with gr.Row():
                clear_btn = gr.Button("üóëÔ∏è Clear Chat")
                regenerate_btn = gr.Button("üîÑ Regenerate")

        # Settings panel
        with gr.Column(scale=1):
            gr.Markdown("### ‚öôÔ∏è Settings")

            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful, harmless, and honest AI assistant. Provide detailed and accurate responses.",
                lines=4,
                placeholder="Enter system instructions..."
            )

            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="üå°Ô∏è Temperature",
                info="Higher = more creative"
            )

            max_tokens = gr.Slider(
                minimum=64,
                maximum=4096,
                value=1024,
                step=64,
                label="üìè Max Tokens",
                info="Maximum response length"
            )

            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="üéØ Top P",
                info="Nucleus sampling"
            )

            gr.Markdown("""
            ---
            ### üìä Model Info
            - **Model:** Qwen3-14B
            - **Format:** GGUF
            - **Backend:** llama.cpp
            - **GPU:** CUDA
            """)

    # Example prompts
    gr.Markdown("### üí° Example Prompts")
    with gr.Row():
        ex1 = gr.Button("Explain quantum computing", size="sm")
        ex2 = gr.Button("Write a Python function", size="sm")
        ex3 = gr.Button("Creative story idea", size="sm")
        ex4 = gr.Button("Debug my code", size="sm")

    # Event handlers
    def user_message(message, history):
        """Queue the typed text as a pending turn and clear the input box.

        Returns ``("", history)``. Blank input adds no turn; otherwise a
        ``[message, None]`` pair is appended, where ``None`` marks the reply
        as not generated yet.
        """
        history = history or []
        if not message or not message.strip():
            return "", history
        return "", history + [[message, None]]

    def _stream_last_turn(history, system_prompt, temperature, max_tokens, top_p):
        """Stream a fresh reply into the last turn of ``history``, yielding after each chunk."""
        history[-1][1] = ""
        for chunk in generate_response(history[-1][0], history[:-1], system_prompt, temperature, max_tokens, top_p):
            history[-1][1] = chunk
            yield history

    def bot_response(history, system_prompt, temperature, max_tokens, top_p):
        """Answer the pending turn queued by ``user_message``.

        This handler is chained after ``user_message`` even when the input was
        blank. In that case the last turn already has a reply, and it must be
        left alone rather than silently regenerated.
        """
        if not history or history[-1][1]:
            # Generators must yield (not return) a value for it to reach the UI.
            yield history
            return
        yield from _stream_last_turn(history, system_prompt, temperature, max_tokens, top_p)

    def regenerate(history, system_prompt, temperature, max_tokens, top_p):
        """Discard the last reply and stream a new one for the same user message."""
        if not history:
            yield history
            return
        yield from _stream_last_turn(history, system_prompt, temperature, max_tokens, top_p)

    def set_example(example_text):
        """Return ``example_text`` unchanged (not referenced by the wiring below)."""
        return example_text

    # Wire up events: Enter in the textbox and the Send button share one chain.
    chat_inputs = [chatbot, system_prompt, temperature, max_tokens, top_p]
    for trigger in (msg.submit, submit_btn.click):
        trigger(
            user_message, [msg, chatbot], [msg, chatbot]
        ).then(
            bot_response, chat_inputs, chatbot
        )

    # clear_conversation() returns (None, ""), i.e. empty chat + empty textbox.
    clear_btn.click(clear_conversation, outputs=[chatbot, msg])

    regenerate_btn.click(regenerate, chat_inputs, chatbot)

    # Example buttons
    ex1.click(lambda: "Explain quantum computing in simple terms", outputs=msg)
    ex2.click(lambda: "Write a Python function to find prime numbers", outputs=msg)
    ex3.click(lambda: "Give me a creative story idea about time travel", outputs=msg)
    ex4.click(lambda: "Help me debug this code: def add(a, b): return a - b", outputs=msg)

print("üöÄ Launching web interface...")
print("=" * 50)

# Start the app built above; share=True creates a public URL.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,
    show_error=True,
    debug=True,
)

In [None]:
# ============================================
# CELL 8: Simple Chat (Alternative)
# ============================================
import gradio as gr

def simple_chat(message, history):
    """Stream a reply to ``message`` given the earlier conversation.

    Args:
        message: Text of the new user turn.
        history: Earlier turns, either ``[user, assistant]`` pairs or
            ``{"role": ..., "content": ...}`` dicts. Empty or ``None`` parts
            are skipped instead of being sent to the model as empty content.

    Yields:
        The reply accumulated so far, once per streamed chunk that carries text.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            # Only plain-text entries can be forwarded to the model as-is.
            if role and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
            continue
        user_text, assistant_text = turn[0], turn[1]
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})

    response = llm.create_chat_completion(
        messages=messages,
        temperature=0.7,
        max_tokens=1024,
        stream=True
    )

    partial = ""
    for chunk in response:
        # Guard against chunks with an empty choices list before indexing.
        choices = chunk.get('choices') or []
        if not choices:
            continue
        content = choices[0].get('delta', {}).get('content')
        if content:
            partial += content
            yield partial

# retry_btn / undo_btn / clear_btn are intentionally not passed: those
# ChatInterface parameters were removed in Gradio 5, so supplying them raises
# a TypeError on current releases (this notebook installs gradio unpinned).
# On Gradio 4 the built-in defaults already provide the same three buttons.
demo = gr.ChatInterface(
    simple_chat,
    title="ü§ñ Qwen3-14B Chat",
    description="Simple chat interface",
    theme="soft",
    examples=[
        "Hello, who are you?",
        "Explain machine learning",
        "Write a haiku about coding"
    ],
)

demo.launch(share=True)

In [None]:
# ============================================
# CELL 9: API Mode with FastAPI + Ngrok
# ============================================
# Install the API-server dependencies; -q keeps pip's output short.
!pip install fastapi uvicorn pyngrok nest_asyncio -q

import nest_asyncio
# Patch asyncio to allow nested event loops (presumably so the uvicorn.run()
# call at the end of this cell can start inside the notebook kernel - TODO confirm).
nest_asyncio.apply()

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uvicorn
from pyngrok import ngrok
import threading  # NOTE(review): not referenced in the visible cells - confirm before removing

# Application object that the route decorators below register their handlers on.
app = FastAPI(title="Qwen3-14B API")

# One chat turn inside a request body. (Documented with comments rather than a
# docstring so the generated API schema stays unchanged.)
class Message(BaseModel):
    role: str     # speaker of the turn; forwarded to the model unchanged
    content: str  # text of the turn

# Request body of POST /v1/chat/completions: the conversation plus optional
# sampling settings that the handler forwards to llm.create_chat_completion().
class ChatRequest(BaseModel):
    messages: List[Message]              # conversation so far, in order
    temperature: Optional[float] = 0.7   # sampling temperature; 0.7 when omitted
    max_tokens: Optional[int] = 1024     # generation limit; 1024 when omitted
    top_p: Optional[float] = 0.95        # nucleus-sampling threshold; 0.95 when omitted

# Response body of POST /v1/chat/completions.
class ChatResponse(BaseModel):
    response: str  # text of the first choice returned by the model
    usage: dict    # the completion's 'usage' entry, or {} when it is absent

@app.get("/")
def root():
    # Landing endpoint: a fixed status payload naming the served model.
    payload = {"status": "online", "model": "Qwen3-14B-GGUF"}
    return payload

@app.get("/health")
def health():
    # Liveness probe: always answers with the same fixed payload.
    payload = {"status": "healthy"}
    return payload

@app.post("/v1/chat/completions", response_model=ChatResponse)
def chat(request: ChatRequest):
    # Runs one non-streaming completion for the posted conversation and returns
    # the first choice's text together with the usage data. Any failure is
    # reported as HTTP 500 carrying the exception text.
    try:
        messages = []
        for m in request.messages:
            messages.append({"role": m.role, "content": m.content})

        completion = llm.create_chat_completion(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p,
        )

        first_choice = completion['choices'][0]
        return ChatResponse(
            response=first_choice['message']['content'],
            usage=completion.get('usage', {}),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Start ngrok tunnel
API_PORT = 8000  # single source of truth for the tunnel target and the server port

# ngrok.connect() returns an NgrokTunnel object whose printable form is a
# description of the tunnel, not a URL. Read .public_url so the links and the
# curl example printed below can be copied and used directly.
tunnel = ngrok.connect(API_PORT)
public_url = tunnel.public_url
print(f"\nüåê Public API URL: {public_url}")
print(f"\nüìñ API Docs: {public_url}/docs")
print("\n" + "="*50)
print("Example usage:")
print(f"""
curl -X POST {public_url}/v1/chat/completions \\
  -H "Content-Type: application/json" \\
  -d '{{"messages": [{{"role": "user", "content": "Hello!"}}]}}'
""")

# Run server (blocks this cell until it is interrupted)
uvicorn.run(app, host="0.0.0.0", port=API_PORT)

In [None]:
# ============================================
# CELL 10: Monitor GPU/CPU Resources
# ============================================
!pip install gputil psutil -q

import GPUtil
import psutil
import time
from IPython.display import clear_output

def monitor(interval=5, iterations=None):
    """Print a refreshing GPU / CPU / RAM status panel.

    Args:
        interval: Seconds to wait between refreshes (default 5).
        iterations: Number of refreshes before returning. ``None`` (default)
            keeps refreshing until the cell is interrupted.

    Interrupting the cell ends monitoring with a short message instead of a
    traceback.
    """
    completed = 0
    try:
        while iterations is None or completed < iterations:
            clear_output(wait=True)

            # GPU Info
            gpus = GPUtil.getGPUs()
            print("üñ•Ô∏è GPU Status:")
            print("="*50)
            if not gpus:
                # Say so explicitly rather than leaving the section blank.
                print("  No GPU detected")
            for gpu in gpus:
                print(f"  Name: {gpu.name}")
                print(f"  Memory: {gpu.memoryUsed:.0f}MB / {gpu.memoryTotal:.0f}MB ({gpu.memoryUtil*100:.1f}%)")
                print(f"  GPU Load: {gpu.load*100:.1f}%")
                print(f"  Temperature: {gpu.temperature}¬∞C")

            # CPU/RAM Info
            # Take one memory snapshot so used / total / percent agree with each other.
            memory = psutil.virtual_memory()
            print("\nüíª System Status:")
            print("="*50)
            print(f"  CPU Usage: {psutil.cpu_percent()}%")
            print(f"  RAM: {memory.used/1024**3:.1f}GB / {memory.total/1024**3:.1f}GB ({memory.percent}%)")

            print("\n‚è∞ Last updated:", time.strftime("%H:%M:%S"))
            print("\n[Press STOP to exit monitoring]")

            completed += 1
            if iterations is not None and completed >= iterations:
                break  # no point sleeping after the final refresh
            time.sleep(interval)
    except KeyboardInterrupt:
        print("\nMonitoring stopped.")

# Uncomment to run monitoring
# monitor()

In [None]:
# ============================================
# CELL 11: Save/Load Chat History
# ============================================
import json
from datetime import datetime
from google.colab import files

def save_chat(history, filename=None):
    """Write the chat to a JSON file and hand the file to ``files.download``.

    Args:
        history: Iterable of ``(user, assistant)`` pairs; each pair is stored
            as a user message followed by an assistant message.
        filename: Target path. When omitted, a name of the form
            ``chat_history_<YYYYmmdd_HHMMSS>.json`` is generated.
    """
    if filename is None:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"chat_history_{stamp}.json"

    # Flatten the pairs into one ordered list of role/content records.
    flattened = []
    for human, assistant in history:
        flattened.extend((
            {"role": "user", "content": human},
            {"role": "assistant", "content": assistant},
        ))

    data = {
        "timestamp": datetime.now().isoformat(),
        "model": "Qwen3-14B-GGUF",
        "messages": flattened,
    }

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"‚úÖ Saved to {filename}")
    files.download(filename)

def load_chat(filename):
    """Load chat history from a JSON file written by ``save_chat``.

    Messages are paired by role rather than by position: a user message opens
    a new ``[user, None]`` turn and the next assistant message fills it in.
    A trailing user message without a reply is therefore kept (as
    ``[user, None]``) instead of being dropped, and messages with any other
    role are skipped so they cannot shift the pairing.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list of ``[user, assistant]`` pairs.

    Raises:
        KeyError: If the file has no ``"messages"`` entry.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    history = []
    for entry in data["messages"]:
        role = entry.get("role")
        content = entry.get("content")
        if role == "user":
            history.append([content, None])
        elif role == "assistant":
            if history and history[-1][1] is None:
                history[-1][1] = content
            else:
                # Assistant message with no open user turn: keep it anyway.
                history.append([None, content])

    return history

# Example usage:
# save_chat(chatbot.value)  # After running Gradio

In [None]:
# ============================================
# QUICK START: All-in-One Cell
# ============================================

# 1. Install
!pip install modelscope gradio llama-cpp-python huggingface_hub -q
!CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --force-reinstall --no-cache-dir -q

# 2. Download
from modelscope import snapshot_download
model_dir = snapshot_download('TeichAI/Qwen3-14B-Claude-4.5-Opus-High-Reasoning-Distill-GGUF', local_dir='./model')

# 3. Load
import glob
from llama_cpp import Llama
MODEL_PATH = glob.glob('./model/*.gguf')[0]
llm = Llama(model_path=MODEL_PATH, n_ctx=4096, n_gpu_layers=-1)

# 4. Launch
import gradio as gr
def chat(message, history):
    """Stream a reply to ``message`` given the earlier conversation.

    Args:
        message: Text of the new user turn.
        history: Earlier turns, either ``[user, assistant]`` pairs or
            ``{"role": ..., "content": ...}`` dicts; empty parts are skipped.

    Yields:
        The reply accumulated so far, once per streamed chunk that carries text.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            # Only plain-text entries can be forwarded to the model as-is.
            if role and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
            continue
        # Pair format: first element is the user text, second the reply.
        for role, text in zip(("user", "assistant"), (turn[0], turn[1])):
            if text:
                messages.append({"role": role, "content": text})
    messages.append({"role": "user", "content": message})

    r = llm.create_chat_completion(messages=messages, max_tokens=1024, stream=True)
    partial = ""
    for chunk in r:
        # Guard against chunks with an empty choices list before indexing.
        choices = chunk.get('choices') or []
        if not choices:
            continue
        content = choices[0].get('delta', {}).get('content')
        if content:
            partial += content
            yield partial

# Wrap `chat` in Gradio's ChatInterface and launch it with share=True.
gr.ChatInterface(chat, title="ü§ñ Qwen3-14B").launch(share=True)