## Requirements

In [1]:
!pip install git+https://github.com/huggingface/transformers.git
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install bitsandbytes accelerate
!pip install -q fastapi uvicorn nest-asyncio pyngrok transformers

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-efg0yk4b
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-efg0yk4b
  Resolved https://github.com/huggingface/transformers.git to commit ccb2e0e03b41429eeede933f38c80e36fcee772f
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers==4.55.0.dev0)
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Downloading huggingface_hub-0.34.3-py3-none-any.whl (558 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) 

## For base model

In [26]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Target directory for the local copy of the base (non-quantized) checkpoint.
save_dir = "./Base_Airavata"

# Fetch the Airavata weights in bfloat16 (float16 is a possible alternative)
# together with the matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    "ai4bharat/Airavata",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/Airavata")

# Write the weights first, then the tokenizer files; the tokenizer call is left
# as the final expression so the list of written files is shown as cell output.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Saving checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

('./Base_Airavata/tokenizer_config.json',
 './Base_Airavata/special_tokens_map.json',
 './Base_Airavata/tokenizer.model',
 './Base_Airavata/added_tokens.json',
 './Base_Airavata/tokenizer.json')

## For quantized model

In [25]:
import gc

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Free GPU memory held by a model loaded in an earlier cell BEFORE loading the
# quantized one. Without this, the old `model` stays alive until the new load
# finishes, and `device_map='auto'` can spill modules to CPU/disk, which
# bitsandbytes rejects with "Some modules are dispatched on the CPU or the disk".
# NOTE(review): if the error persists after this, the GPU itself is too small or
# another process (e.g. a running API server) is holding the memory.
globals().pop("model", None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,        # or load_in_8bit=True for 8-bit quant
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16  # or float16 depending on your GPU
)

# Load tokenizer and quantized model
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/Airavata")
model = AutoModelForCausalLM.from_pretrained(
    "ai4bharat/Airavata",
    quantization_config=bnb_config,
    device_map='auto'
)

# Save both the tokenizer and the quantized model
save_dir = "./Quantized_Airavata"

# Save model
model.save_pretrained(save_dir)

# Save tokenizer (kept as the last expression so the written files are displayed)
tokenizer.save_pretrained(save_dir)

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

## App

In [27]:
%%writefile app.py
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch

import os
from pathlib import Path

app = FastAPI()

# Directory of the checkpoint to serve. By default this is the "Base_Airavata"
# folder next to this file, which matches the relative "./Base_Airavata" path the
# notebook saves to. Set AIRAVATA_MODEL_PATH (for example to the
# "Quantized_Airavata" folder) to serve a different checkpoint without editing
# the file.
model_path = os.environ.get(
    "AIRAVATA_MODEL_PATH",
    str(Path(__file__).resolve().parent / "Base_Airavata"),
)

# Load the tokenizer and weights once at import time; every request reuses them.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Landing route: answering GET / avoids the default "Not Found" response there
# and tells the visitor where the interactive docs are.
@app.get("/")
def root():
    status = {"message": "Airavata FastAPI server is running. Use /docs to try it."}
    return status

# Input model: JSON body accepted by the POST /generate endpoint.
class GenerateRequest(BaseModel):
    # Text handed to the text-generation pipeline as the prompt.
    prompt: str
    # Forwarded as the pipeline's max_new_tokens; 100 is used when the field is omitted.
    max_new_tokens: int = 100

import threading

# Only one generation runs at a time: the handler below executes on worker
# threads, and a single shared pipeline/model instance serves every request.
_pipe_lock = threading.Lock()


# POST endpoint for text generation
@app.post("/generate")
def generate(req: GenerateRequest):
    """Generate a sampled continuation of ``req.prompt``.

    Declared with ``def`` rather than ``async def`` on purpose: the pipeline
    call is blocking, and FastAPI runs plain ``def`` handlers in its threadpool,
    so the event loop (and therefore other routes such as ``/``) stays
    responsive while a generation is in progress.

    Returns a dict with the key ``"generated_text"`` holding the text of the
    first sequence returned by the pipeline.
    """
    with _pipe_lock:
        output = pipe(
            req.prompt,
            max_new_tokens=req.max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )
    return {"generated_text": output[0]["generated_text"]}


Overwriting app.py


In [28]:
!ngrok config add-authtoken 30YDTbaj7jktj6lY9j0QhwKZkx5_2D8sbtzf55HNqEScKc8Nf

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [29]:
# Stop any ngrok process left over from a previous run before opening a new
# tunnel (-f matches against the full command line).
!pkill -f ngrok




In [30]:
import nest_asyncio
import uvicorn
from pyngrok import ngrok
import threading

# Allow nested asyncio event loops inside the notebook kernel.
nest_asyncio.apply()

# Open an ngrok tunnel to local port 8000 and print where the API is reachable.
public_url = ngrok.connect(8000)
print(f"Public URL: {public_url.public_url}")
print("Go to this URL and append /docs to test the API:")
print(f"{public_url.public_url}/docs")


def run():
    """Serve the FastAPI app defined in app.py on 0.0.0.0:8000 (blocking call)."""
    uvicorn.run("app:app", host="0.0.0.0", port=8000)


# uvicorn.run blocks, so the server gets its own thread and the notebook stays usable.
thread = threading.Thread(target=run)
thread.start()



Public URL: https://befee853b291.ngrok-free.app
Go to this URL and append /docs to test the API:
https://befee853b291.ngrok-free.app/docs


In [31]:
import requests
import time

# Sequential latency benchmark of POST /generate through the public ngrok URL.
url = public_url.public_url + "/generate"

data = {
    "prompt": "how to manage time effectively ?",
    "max_new_tokens": 20
}

n_requests = 20
request_timeout = 120  # seconds; prevents the cell from hanging forever on a dead tunnel
times = []

# Warm-up (not timed). Failing here means the server or tunnel is not ready, in
# which case the numbers below would be meaningless.
requests.post(url, json=data, timeout=request_timeout).raise_for_status()

for _ in range(n_requests):
    # perf_counter is monotonic, so the interval is immune to wall-clock adjustments.
    start = time.perf_counter()
    response = requests.post(url, json=data, timeout=request_timeout)
    times.append(time.perf_counter() - start)
    # Do not let error responses (e.g. a 502 from the tunnel) count as successful samples.
    response.raise_for_status()

total_time = sum(times)
avg_latency = total_time / n_requests
throughput = n_requests / total_time

print(f"Avg Latency: {avg_latency * 1000:.2f} ms")
print(f"Throughput: {throughput:.2f} requests/sec")

INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     34.91.245.146: