In [1]:
# Install the serving stack (fastapi, uvicorn, pyngrok, streamlit), the
# Hugging Face libraries (transformers, datasets) and supporting packages.
!pip install fastapi uvicorn pyngrok streamlit transformers datasets requests python-multipart nest_asyncio
# faiss-cpu supplies the `faiss` module used below to build the vector index.
!pip install faiss-cpu

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Collecting packaging<25,>=20 (from streamlit)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━

In [None]:
import os

from fastapi import FastAPI
from pydantic import BaseModel
import numpy as np
import faiss
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, pipeline
from pyngrok import ngrok
import uvicorn
import nest_asyncio

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Application object that the startup hook and the routes below register on.
app = FastAPI()

# Patch asyncio so the server can be started from inside the notebook kernel.
nest_asyncio.apply()

# NOTE: FastAPI reports `on_event` as deprecated in favour of lifespan
# handlers; migrating also requires changing how `app` is constructed.
@app.on_event("startup")
async def startup_event():
    """Load the models and build the FAISS index once, at server startup.

    Populates the module-level globals read by the request handlers:

    * ``tokenizer`` / ``model`` -- ``distilbert-base-uncased``; the hidden
      state of the first token is used as the text embedding.
    * ``qa_pipe`` -- question-answering pipeline
      (``huytranduck/distilbert-finetuned-squadv2``).
    * ``contexts`` -- unique passages from the SQuAD v2 validation split,
      in first-seen order.
    * ``index`` -- ``faiss.IndexFlatL2`` in which row ``i`` is the embedding
      of ``contexts[i]``.

    Raises:
        RuntimeError: if no passage is left after filtering, since an index
            cannot be built from an empty corpus.
    """
    global tokenizer, model, qa_pipe, index, contexts

    embed_batch_size = 32

    # Load the models.
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
    qa_pipe = pipeline('question-answering',
                      model='huytranduck/distilbert-finetuned-squadv2',
                      device=0 if torch.cuda.is_available() else -1)

    # Load the dataset and keep only the rows that have at least one answer.
    dataset = load_dataset('squad_v2', split='validation')
    dataset = dataset.filter(
        lambda x: len(x['answers']['text']) > 0
    )

    # SQuAD rows are per question, so the same passage text occurs in many
    # rows. Indexing every row would fill the index with identical vectors and
    # let a top-k search return the same passage k times. dict.fromkeys drops
    # the repeats while preserving first-seen order.
    contexts = list(dict.fromkeys(dataset['context']))
    if not contexts:
        raise RuntimeError("No contexts available to index.")

    # Build the embeddings from the very list the handlers index into, so the
    # row numbers returned by FAISS always line up with `contexts`.
    chunks = []
    for batch in batch_generator(contexts, embed_batch_size):
        inputs = tokenizer(batch, return_tensors='pt',
                           padding=True, truncation=True).to(device)
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
        chunks.append(hidden[:, 0].cpu().numpy())

    embeddings = np.concatenate(chunks).astype(np.float32)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

def batch_generator(data, batch_size):
    """Yield consecutive slices of ``data``, each at most ``batch_size`` long.

    Args:
        data: Sliceable sequence that supports ``len()``.
        batch_size: Step between slice starts, and the maximum slice length.

    Yields:
        ``data[start:start + batch_size]`` for each start offset; the final
        slice may be shorter than ``batch_size``.
    """
    offsets = range(0, len(data), batch_size)
    yield from (data[start:start + batch_size] for start in offsets)

# Request body accepted by the POST /search endpoint.
class Query(BaseModel):
    # Question text; it is embedded for retrieval and also passed to the
    # question-answering pipeline.
    question: str
    # Number of nearest contexts requested from the index (3 when omitted).
    top_k: int = 3

@app.post("/search")
async def semantic_search(query: Query):
    """Retrieve the contexts nearest to the question and answer from each.

    Args:
        query: Request body carrying the question text and ``top_k``.

    Returns:
        ``{"results": [...]}`` with one entry per retrieved context, in the
        order returned by the index. Each entry holds:

        * ``score`` -- distance reported by the FAISS L2 index (lower means
          closer to the question).
        * ``context`` -- the retrieved passage.
        * ``answer`` -- answer text extracted by the QA pipeline.
        * ``confidence`` -- the QA pipeline's score for that answer.

        The list is empty when ``top_k`` is below 1 or the index is empty.
    """
    # Never request more neighbours than the index stores: FAISS pads missing
    # neighbours with label -1, which would silently select contexts[-1].
    k = min(query.top_k, index.ntotal)
    if k < 1:
        return {"results": []}

    # Embed the question. Truncate so an over-long request body cannot exceed
    # the encoder's maximum sequence length and crash the handler.
    inputs = tokenizer(query.question, return_tensors='pt',
                       truncation=True).to(device)
    with torch.no_grad():
        q_embedding = model(**inputs).last_hidden_state[:, 0].cpu().numpy()

    distances, neighbours = index.search(q_embedding.astype(np.float32), k)

    results = []
    for idx, distance in zip(neighbours[0], distances[0]):
        if idx < 0:
            # Defensive: -1 marks "no neighbour found".
            continue
        context = contexts[int(idx)]
        answer = qa_pipe(question=query.question, context=context)
        results.append({
            "score": float(distance),
            "context": context,
            "answer": answer['answer'],
            "confidence": float(answer['score'])
        })

    return {"results": results}

# Port shared by the uvicorn server and the ngrok tunnel.
PORT = 8000

# Read the ngrok token from the environment rather than embedding a
# credential in the notebook source.
_ngrok_token = os.environ.get("NGROK_AUTH_TOKEN")
if not _ngrok_token:
    raise RuntimeError(
        "Set the NGROK_AUTH_TOKEN environment variable before starting the server."
    )
ngrok.set_auth_token(_ngrok_token)

# Open the public tunnel first so the URL is printed before the blocking
# server call below.
public_url = ngrok.connect(PORT).public_url
print(f"Public API URL: {public_url}")

if __name__ == "__main__":
    try:
        uvicorn.run(app, host="0.0.0.0", port=PORT)
    finally:
        # Close the tunnel when the server stops, so re-running this cell
        # does not leave stale tunnels open.
        ngrok.disconnect(public_url)

2025-05-14 17:17:29.261379: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747243049.459767      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747243049.514823      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading ngrok: 3%

        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        
  @app.on_event("startup")


Public API URL: https://096e-34-73-105-169.ngrok-free.app                                           


INFO:     Started server process [35]
INFO:     Waiting for application startup.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11873 [00:00<?, ? examples/s]

INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     113.23.123.13:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
INFO:     2405:4802:1ca2:f0d0:4160:ac35:e0fe:7fd3:0 - "POST /search HTTP/1.1" 200 OK
