In [None]:

import os
import sys
from google.colab import userdata




from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
from huggingface_hub import hf_hub_download
from llama_cpp import Llama


# --- Configuration -----------------------------------------------------------
# ngrok credential, read from the Colab `userdata` secret store.
NGROK_AUTH_TOKEN = userdata.get("NGROK_AUTH_TOKEN")

# Hugging Face repository and the GGUF file inside it that will be served.
MODEL_REPO = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
MODEL_FILE = "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"

# Register the token with pyngrok before any tunnel is opened.
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# --- Fetch the model weights -------------------------------------------------
print(f"Downloading {MODEL_FILE}...")
try:
    model_path = hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO)
except Exception as e:
    # Nothing below can work without the weights, so stop here.
    print(f"Error downloading model: {e}")
    sys.exit(1)
else:
    print(f"Model downloaded to: {model_path}")

# --- Load the model ----------------------------------------------------------
print("Loading Model into GPU memory...")
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=4096,        # context window used for inference
        n_threads=8,
        n_gpu_layers=-1,   # -1: offload all layers (per llama-cpp-python docs)
        verbose=False,
    )
except Exception as e:
    print(f"Error loading model: {e}")
    sys.exit(1)
else:
    print("Model loaded successfully!")

# --- Web application ---------------------------------------------------------
# The Flask app that exposes the model; CORS is enabled so that clients on
# other origins can call it.
app = Flask(__name__)
CORS(app)

@app.route("/health", methods=["GET"])
def health():
    """Report that the server is up and which model file it is serving."""
    payload = {
        "status": "Model is ready",
        "model": MODEL_FILE,
    }
    return jsonify(payload)

@app.route('/generate', methods=['POST'])
def generate():
    """Run a chat completion on the loaded model and return it as JSON.

    Expects a JSON object body with:
      * ``messages``    -- non-empty list of chat messages, passed unchanged
                           to ``llm.create_chat_completion`` (required).
      * ``max_tokens``  -- generation limit (default 256).
      * ``temperature`` -- sampling temperature (default 0.1).

    Returns:
        200 with the object returned by ``llm.create_chat_completion``;
        400 with ``{"error": ...}`` when the body is not a JSON object or
        ``messages`` is missing, empty, or not a list;
        500 with ``{"error": ...}`` when generation raises.
    """
    import time

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()

    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so bad input gets a JSON 400 rather than an HTML error page
    # or an unhandled AttributeError on `.get`.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    messages = data.get('messages')
    if not isinstance(messages, list) or not messages:
        return jsonify({"error": "'messages' must be a non-empty list"}), 400

    max_tokens = data.get('max_tokens', 256)
    temperature = data.get('temperature', 0.1)

    print("\nüì• /generate called")
    print(f"   ‚Ä¢ messages: {len(messages)}")
    print(f"   ‚Ä¢ max_tokens: {max_tokens}")

    try:
        print("üß† Starting generation...")
        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            # Stop strings passed to the model call to end generation.
            stop=["<|eot_id|>", "<|end_of_text|>"]
        )
    except Exception as e:
        # Boundary handler: report any model failure to the client as JSON.
        print(f"‚ùå Generation Error: {e}")
        return jsonify({"error": str(e)}), 500

    print(f"‚úÖ Generation done in {time.perf_counter() - start:.2f} seconds")
    return jsonify(response)



# Terminate any running ngrok process before opening a new tunnel
# (presumably to clear a tunnel left over from an earlier run of this cell).
ngrok.kill()

# One port shared by the tunnel and the Flask server so they cannot drift apart.
SERVER_PORT = 5000

try:
    public_url = ngrok.connect(SERVER_PORT).public_url
except Exception as e:
    # Without a tunnel the API is unreachable from outside, so the server is
    # not started.
    print(f"Error starting ngrok: {e}")
else:
    print(f"\nüöÄ SERVER RUNNING! Your Colab API URL is: {public_url}")
    print("üëâ Copy this URL and paste it into 'local_rag_app.py' on your machine.\n")
    # NOTE(review): Flask's development server handles requests on multiple
    # threads by default -- confirm that concurrent calls into `llm` are safe,
    # or pass threaded=False here.
    try:
        # Blocks until the server is stopped.
        app.run(port=SERVER_PORT)
    except Exception as e:
        # Reported separately so a server failure is not mislabelled as an
        # ngrok failure.
        print(f"Error running Flask server: {e}")

Downloading Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf...
Model downloaded to: /root/.cache/huggingface/hub/models--bartowski--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/bf5b95e96dac0462e2a09145ec66cae9a3f12067/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
Loading Model into GPU memory...


llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model loaded successfully!

üöÄ SERVER RUNNING! Your Colab API URL is: https://phrenic-patently-mac.ngrok-free.dev
üëâ Copy this URL and paste it into 'local_rag_app.py' on your machine.

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m



üì• /generate called
   ‚Ä¢ messages: 2
   ‚Ä¢ max_tokens: 256
üß† Starting generation...


INFO:werkzeug:127.0.0.1 - - [26/Nov/2025 16:27:07] "POST /generate HTTP/1.1" 200 -


‚úÖ Generation done in 1.38 seconds
