In [None]:
# --- Install Unsloth and dependencies (Colab) ---
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
# --- Load Model and Tokenizer ---
# This cell defines the module-level `model` and `tokenizer` that
# get_llm_response (next cell) reads as globals, so it must run first.
from unsloth import FastLanguageModel
import torch

# Settings forwarded unchanged to FastLanguageModel.from_pretrained below.
max_seq_length = 2048
dtype = None  # NOTE(review): presumably lets Unsloth choose the dtype itself — confirm
load_in_4bit = True  # NOTE(review): presumably loads 4-bit quantized weights — confirm

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



# --- Prepare for Inference ---
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to the one returned for the "phi-3" chat template.
# NOTE(review): the mapping presumably lets messages use "from"/"value" keys
# with "human"/"gpt" roles, which is the message shape get_llm_response
# builds — confirm against the Unsloth documentation.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3",
    mapping = {"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
)

FastLanguageModel.for_inference(model)  # Enable fast inference

# get_llm_response is defined in the next cell.

In [None]:
def get_llm_response(prompt, max_new_tokens=128):
    """Generate the assistant's reply to a single user prompt.

    Relies on the module-level ``tokenizer`` and ``model`` created by the
    model-loading cell.

    Args:
        prompt: Text of the user's message; sent as one "human" turn.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The newly generated text only, with special tokens removed and
        surrounding whitespace stripped.
    """
    messages = [{"from": "human", "value": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's device instead of hard-coding "cuda"

    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )

    # generate() returns prompt + completion. Drop the prompt by token count
    # rather than by searching the decoded text for "<|assistant|>", which
    # breaks when the prompt or the completion contains that literal marker.
    prompt_length = input_ids.shape[-1]
    generated_ids = outputs[0][prompt_length:]

    # skip_special_tokens removes every special token (e.g. a trailing
    # "<|endoftext|>"), not just "<|end|>".
    reply = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Fallback: still cut at "<|end|>" in case the tokenizer does not flag it
    # as a special token.
    return reply.split("<|end|>", 1)[0].strip()

In [None]:
# --- Example Usage ---
# Smoke test: send one sample question through the model and print the reply.
sample_question = "Whats are some popular dog breeds ?"
print(get_llm_response(sample_question))

In [None]:
!pip install flask pyngrok flask-cors

from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
from threading import Thread

# Set your ngrok authtoken (replace with your actual token)
ngrok.set_auth_token("2tpeLhQarUCoqaTW6vb2nGNCtza_4RTPRqBnKy4V74pQ9v53r")

app = Flask(__name__)
CORS(app)  # Enable CORS for all origins

@app.route("/predict", methods=["POST"])
def predict():
    data = request.get_json(force=True)
    prompt = data.get("prompt", "")
    response = get_llm_response(prompt)
    return jsonify({"response": response})

def run():
    app.run(port=7070)
Thread(target=run).start()

public_url = ngrok.connect(7070, bind_tls=True, domain="tightly-fit-tetra.ngrok-free.app")
print(" * ngrok tunnel:", public_url)

In [None]:
# --- Cleanup: tear down the tunnel and any stray Flask processes ---
from pyngrok import ngrok
import os

# Kill all ngrok tunnels
ngrok.kill()

# Kill any Flask servers (optional, but helps)
# `pkill -f` matches the pattern against full process command lines.
# NOTE(review): the server cell starts Flask with Thread(target=run) inside the
# kernel process, so there may be no separate process for this pattern to
# match — confirm this line has any effect here.
# As the last expression in the cell, the exit status returned by os.system is
# shown as the cell's output.
os.system("pkill -f flask")