In [None]:
# Model selection
# MODEL_NAME is the Ollama model tag used by the later cells: it is pulled with
# `ollama pull` and sent as the "model" field of every /api/generate request.
MODEL_NAME = "maryasov/qwen2.5-coder-cline:7b-instruct-q8_0"
# Ollama settings. `%env` writes them into this kernel's environment, so this
# cell has to run before the cell that starts `ollama serve`.
# NOTE: keep comments on their own lines -- text after the value on a `%env`
# line would likely be taken as part of the value.
# Presumably the default context window in tokens -- confirm against the Ollama docs.
%env OLLAMA_CONTEXT_LENGTH=16384
# Presumably the address the server binds to (0.0.0.0 = every interface) -- confirm.
%env OLLAMA_HOST=0.0.0.0
# Presumably -1 keeps a loaded model in memory indefinitely -- confirm.
%env OLLAMA_KEEP_ALIVE=-1


In [None]:
!apt-get install -y lshw pciutils
!nvcc --version
!nvidia-smi

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print(f"\n🧠 Available RAM: {ram_gb:.1f} GB")
print("✅ High-RAM runtime!" if ram_gb >= 20 else "❌ Not a high-RAM runtime.")


In [None]:
# Install Ollama: download the installer script from ollama.com and pipe it to `sh`.
!curl -fsSL https://ollama.com/install.sh | sh


In [None]:
import subprocess
import time
import requests
import threading

# Start ollama serve in a background thread
def start_ollama():
    subprocess.call(['ollama', 'serve'])

ollama_thread = threading.Thread(target=start_ollama)
ollama_thread.daemon = True
ollama_thread.start()

# Pull model (this also verifies Ollama CLI is ready)
!ollama pull {MODEL_NAME}

# Wait for Ollama HTTP API to be ready
def wait_for_ollama(timeout=60):
    for i in range(timeout):
        try:
            r = requests.get("http://localhost:11434")
            if r.status_code in [200, 404]:
                print(f"✅ Ollama is up (after {i+1}s).")
                return
        except requests.exceptions.ConnectionError:
            pass
        print(f"⏳ Waiting for Ollama to start... {i+1}s")
        time.sleep(1)
    raise RuntimeError("❌ Ollama did not start in time.")

wait_for_ollama()


In [None]:
# Download the latest Linux amd64 release of cloudflared from GitHub into the
# working directory as ./cloudflared, then mark it executable.
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared


In [None]:
import re

import threading

# Run cloudflared tunnel in background and get the public URL
cloudflared_proc = subprocess.Popen(
    ['./cloudflared', 'tunnel', '--url', 'http://localhost:11434', '--no-autoupdate'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True
)

public_url = None
for line in cloudflared_proc.stdout:
    print(line.strip())
    # Match hostname characters only, so other text on the line is never captured.
    match = re.search(r'(https://[A-Za-z0-9.-]+\.trycloudflare\.com)', line)
    if match:
        public_url = match.group(1)
        break

if public_url:
    # cloudflared keeps logging to the pipe for as long as it runs. If nobody
    # reads it, the pipe buffer fills up and the process blocks on write, which
    # stalls the tunnel. Discard the remaining output in a daemon thread.
    def _drain_cloudflared_output(stream):
        """Read and discard lines from `stream` until it is closed."""
        for _ in stream:
            pass

    threading.Thread(
        target=_drain_cloudflared_output,
        args=(cloudflared_proc.stdout,),
        daemon=True,
    ).start()

    print(f"\n✅ Public URL for Ollama:\n{public_url}")
else:
    raise RuntimeError("❌ Could not find public Cloudflare URL.")


In [None]:
import json
import subprocess # Import subprocess if not already imported in this cell

# One-off request through the public URL to confirm the tunnel reaches Ollama.
data = {
    "model": MODEL_NAME,
    "prompt": "Question: What is the capital of Japan?\nAnswer:",
    "stream": False
}

# Check if the cloudflared process is still running
if cloudflared_proc.poll() is None:
    try:
        # Bound the wait so a stalled tunnel cannot hang the cell forever.
        response = requests.post(f"{public_url}/api/generate", json=data, timeout=120)
        # Turn HTTP error statuses into an exception instead of parsing an error page.
        response.raise_for_status()
        # Prettify the JSON output before printing
        print(json.dumps(response.json()['response'], indent=4))
    except requests.exceptions.ConnectionError as e:
        print(f"❌ Connection Error: Could not connect to {public_url}. The Cloudflare tunnel might be unstable or the Ollama service is not reachable via the tunnel.")
        print(e)
    except requests.exceptions.RequestException as e:
        # Timeouts, HTTP error statuses and bodies that are not valid JSON.
        print(f"❌ Request failed: {e}")
    except (ValueError, KeyError) as e:
        # The body was not JSON, or it carried no 'response' field.
        print(f"❌ Unexpected response from Ollama: {e!r}")
else:
    print("❌ Cloudflared tunnel process is not running. The public URL is likely invalid.")
    # You might want to add logic here to attempt restarting the tunnel or raise an error.

In [None]:
# Install Gradio for creating the web UI
!pip install gradio

## 🎨 Gradio Web UI for Chat Interface

This section implements a **Gradio-based chat interface** that provides:

### ✨ Features:
- 💬 **Interactive Chat**: Real-time conversation with your local LLM
- 🎛️ **Adjustable Parameters**: Control temperature, top-p, and max tokens
- 🔄 **Retry Functionality**: Retry the last message with different settings
- 🧹 **Clear Chat**: Start fresh conversations anytime
- 📊 **Model Information**: View current model details and status
- 🌐 **Public Access**: Shareable link for external access
- 🎨 **Modern UI**: Clean, responsive design with proper styling

### 🚀 Usage:
1. Run the install cell above, then run the cells below to build and launch the interface
2. The interface will be accessible via a public Gradio link
3. Use the chat interface to interact with your LLM
4. Adjust settings in the sidebar as needed

### 🔗 Integration:
The interface connects to your existing **Ollama server** through the **Cloudflare tunnel**, providing seamless access to your local LLM.

In [None]:
import gradio as gr
import json
import requests
from typing import List, Tuple

# Chat history storage
# NOTE(review): nothing in the visible notebook reads or writes this list; the
# chat callbacks receive the history as an argument and return it.
chat_history = []

def chat_with_llm(
    message: str,
    history: List[List[str]],
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_tokens: int = 2048,
) -> Tuple[str, List[List[str]]]:
    """
    Send a message to the LLM and return the response along with updated history.

    Each call sends only `message` as the prompt; earlier turns in `history`
    are not forwarded to the model.

    Args:
        message: Text entered by the user. Blank or missing text is ignored.
        history: Previous [user, bot] pairs. The list passed in is never
            modified; `None` is treated as an empty history.
        temperature: Sampling temperature sent in the request options.
        top_p: Nucleus sampling value sent in the request options.
        max_tokens: Upper bound on generated tokens, sent as `num_predict`.

    Returns:
        A tuple of an empty string and a new history list. When the request
        fails, the bot side of the appended pair holds the error description.
    """
    # Work on a copy so the caller's list is left untouched.
    history = list(history or [])

    if not message or not message.strip():
        return "", history

    try:
        # Check if cloudflared process is still running
        if cloudflared_proc.poll() is not None:
            bot_response = "❌ Cloudflared tunnel is not running. Please restart the tunnel."
        else:
            # Prepare the data for Ollama API
            data = {
                "model": MODEL_NAME,
                "prompt": message,
                "stream": False,
                "options": {
                    "temperature": temperature,
                    "top_p": top_p,
                    # Ollama's option for the token limit is `num_predict`;
                    # a `max_tokens` key is not one of its options.
                    "num_predict": int(max_tokens)
                }
            }

            # Send request to Ollama API
            response = requests.post(f"{public_url}/api/generate", json=data, timeout=120)

            if response.status_code == 200:
                bot_response = response.json().get('response', 'No response generated.')
            else:
                bot_response = f"❌ API Error: {response.status_code} - {response.text}"

    except requests.exceptions.Timeout:
        bot_response = "⏱️ Request timed out. The model might be taking too long to respond."
    except requests.exceptions.ConnectionError:
        bot_response = "🔌 Connection error. Please check if the tunnel is still active."
    except Exception as e:
        bot_response = f"❌ Unexpected error: {str(e)}"

    # Update history
    history.append([message, bot_response])
    return "", history

def clear_chat():
    """
    Produce the reset values for clearing the chat: a pair of empty lists.
    """
    return tuple([] for _ in range(2))

def get_model_info():
    """
    Get information about the current model.

    Asks the Ollama server, through the tunnel, for its model list and looks
    for the entry whose name equals MODEL_NAME.

    Returns:
        A Markdown string. It contains the size and modification time when the
        model is listed, an "unavailable" status when the tunnel is down, the
        request is rejected or the model is not listed, and a "cannot retrieve"
        status when the lookup raises an error.
    """
    try:
        if cloudflared_proc.poll() is None:
            response = requests.get(f"{public_url}/api/tags", timeout=10)
            if response.status_code == 200:
                models = response.json().get('models') or []
                # `.get` keeps one entry without a name from aborting the whole lookup.
                current_model = next((m for m in models if m.get('name') == MODEL_NAME), None)
                if current_model:
                    return f"📋 **Current Model:** {MODEL_NAME}\n📏 **Size:** {current_model.get('size', 'Unknown')} bytes\n🏷️ **Modified:** {current_model.get('modified_at', 'Unknown')}"
        return f"📋 **Current Model:** {MODEL_NAME}\n⚠️ **Status:** Model info unavailable"
    except Exception:
        # Best effort for the UI, but let KeyboardInterrupt/SystemExit through.
        return f"📋 **Current Model:** {MODEL_NAME}\n❌ **Status:** Cannot retrieve model info"

# Custom CSS for better styling
# Passed to gr.Blocks(css=...) when the interface is built.
# NOTE(review): no component in the visible notebook is given `elem_classes`,
# so the .chat-message / .user-message / .bot-message rules only apply if
# Gradio itself emits those class names -- TODO confirm, otherwise they are unused.
custom_css = """
.gradio-container {
    max-width: 1200px !important;
    margin: auto;
}

.chat-message {
    padding: 10px;
    margin: 5px 0;
    border-radius: 10px;
}

.user-message {
    background-color: #e3f2fd;
    margin-left: 20%;
}

.bot-message {
    background-color: #f5f5f5;
    margin-right: 20%;
}
"""

# Progress message only; the interface itself is built in the next cell.
print("🎨 Setting up Gradio interface...")

In [None]:
# Shut down the app left over from an earlier run of this cell. Otherwise the
# fixed port requested at launch is still taken and the relaunch fails.
_stale_demo = globals().get("demo")
if _stale_demo is not None and hasattr(_stale_demo, "close"):
    _stale_demo.close()

# Create the Gradio interface
with gr.Blocks(css=custom_css, title="🤖 Local LLM Chat", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🤖 Local LLM Chat Interface
        
        Chat with your locally running LLM via Ollama. The model is accessible through a Cloudflare tunnel.
        """
    )
    
    with gr.Row():
        with gr.Column(scale=3):
            # Main chat interface
            chatbot = gr.Chatbot(
                [],
                elem_id="chatbot",
                bubble_full_width=False,
                height=500,
                show_label=False
            )
            
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Type your message here...",
                    container=False,
                    scale=4,
                    show_label=False
                )
                send_btn = gr.Button("Send 📤", scale=1, variant="primary")
            
            with gr.Row():
                clear_btn = gr.Button("Clear Chat 🗑️", scale=1)
                retry_btn = gr.Button("Retry Last 🔄", scale=1)
        
        with gr.Column(scale=1):
            # Sidebar with model info and controls
            gr.Markdown("### 🔧 Model Information")
            model_info = gr.Markdown(get_model_info())
            
            gr.Markdown("### ⚙️ Settings")
            
            with gr.Accordion("Advanced Options", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Controls randomness (lower = more focused)"
                )
                
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                    info="Controls diversity (lower = more focused)"
                )
                
                max_tokens = gr.Slider(
                    minimum=100,
                    maximum=4096,
                    value=2048,
                    step=100,
                    label="Max Tokens",
                    info="Maximum response length"
                )
            
            refresh_info_btn = gr.Button("Refresh Model Info 🔄", size="sm")
    
    def _generate_reply(message: str, temp: float, top_p_val: float, max_tok: int) -> str:
        """Request one completion from Ollama; return the reply text or an error note."""
        try:
            if cloudflared_proc.poll() is not None:
                return "❌ Cloudflared tunnel is not running."
            
            data = {
                "model": MODEL_NAME,
                "prompt": message,
                "stream": False,
                "options": {
                    "temperature": temp,
                    "top_p": top_p_val,
                    # The slider value is forwarded as a whole number of tokens.
                    "num_predict": int(max_tok)
                }
            }
            response = requests.post(f"{public_url}/api/generate", json=data, timeout=120)
            
            if response.status_code == 200:
                return response.json().get('response', 'No response generated.')
            # Include the body: the status code alone rarely explains the failure.
            return f"❌ API Error: {response.status_code} - {response.text}"
        
        except Exception as e:
            return f"❌ Error: {str(e)}"
    
    # Define the enhanced chat function with settings
    def chat_with_settings(message: str, history: List[List[str]], temp: float, top_p_val: float, max_tok: int):
        """Answer `message` using the sidebar settings.

        Returns an empty string (to clear the textbox) and a new history list
        with the [message, reply] pair appended. The list passed in is not
        modified, and a missing history is treated as empty. Only `message`
        is sent as the prompt; earlier turns are not forwarded to the model.
        """
        history = list(history or [])
        if not message or not message.strip():
            return "", history
        
        history.append([message, _generate_reply(message, temp, top_p_val, max_tok)])
        return "", history
    
    def retry_last_message(history: List[List[str]], temp: float, top_p_val: float, max_tok: int):
        """Replace the last exchange with a fresh reply to the same user message."""
        if not history:
            return history
        
        last_user_msg = history[-1][0]
        if not last_user_msg or not last_user_msg.strip():
            # Nothing to resend; keep the history instead of dropping the entry.
            return history
        
        # Resend the last user message without the exchange being replaced
        _, updated_history = chat_with_settings(last_user_msg, history[:-1], temp, top_p_val, max_tok)
        return updated_history
    
    # Event handlers
    msg.submit(
        chat_with_settings,
        inputs=[msg, chatbot, temperature, top_p, max_tokens],
        outputs=[msg, chatbot]
    )
    
    send_btn.click(
        chat_with_settings,
        inputs=[msg, chatbot, temperature, top_p, max_tokens],
        outputs=[msg, chatbot]
    )
    
    # clear_chat returns two empty lists, hence the chatbot is listed twice.
    clear_btn.click(
        clear_chat,
        outputs=[chatbot, chatbot]
    )
    
    retry_btn.click(
        retry_last_message,
        inputs=[chatbot, temperature, top_p, max_tokens],
        outputs=[chatbot]
    )
    
    refresh_info_btn.click(
        get_model_info,
        outputs=[model_info]
    )

# Status lines printed right before the server starts.
for status_line in (
    "✅ Gradio interface created successfully!",
    "🚀 Launching the web interface...",
):
    print(status_line)

# Launch the interface
launch_options = {
    "share": True,             # Create a public link
    "server_name": "0.0.0.0",  # Allow external connections
    "server_port": 7860,       # Use a specific port
    "show_error": True,
    "debug": False,
}
demo.launch(**launch_options)

## 🎉 Gradio Interface Successfully Launched!

### 📱 How to Use:
1. **Click the public Gradio link** that appears above
2. **Type your questions** in the chat input field
3. **Adjust settings** in the right sidebar if needed:
   - **Temperature**: Controls creativity (0.1 = focused, 2.0 = creative)
   - **Top P**: Controls diversity (0.1 = narrow, 1.0 = diverse)
   - **Max Tokens**: Controls response length (100-4096)
4. **Use the buttons**:
   - 📤 **Send**: Submit your message
   - 🗑️ **Clear Chat**: Reset conversation
   - 🔄 **Retry Last**: Regenerate the last response
   - 🔄 **Refresh Model Info**: Update model status

### 💡 Tips:
- The interface shows the **chat history** during your session, but each message is sent to the model on its own, so the model does not see earlier turns
- **Lower temperature** = more consistent, focused responses
- **Higher temperature** = more creative, varied responses
- If you get errors, check that both **Ollama** and **Cloudflare tunnel** are still running

### 🔧 Troubleshooting:
- If the interface doesn't respond, scroll up to verify the **Cloudflare tunnel URL** is still active
- Check the **Model Information** panel to confirm that the Ollama server lists your model
- Use **Retry Last** if a response seems incomplete or unsatisfactory