<a href="https://colab.research.google.com/github/Hvaa1/Speech_to_text_and_Emotion_recognition/blob/main/UI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the project folder and the model
# directories used by the later cells are all addressed under this mount point.
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
# Switch to the project folder: app.py is written to the current directory and
# uvicorn is later launched from it, so relative paths resolve against this folder.
%cd "/content/drive/MyDrive/ColabNotebooks/BTL_NLP/WhisperFT"

/content/drive/MyDrive/ColabNotebooks/BTL_NLP/WhisperFT


In [None]:
!pip install fastapi uvicorn pyngrok transformers torchaudio soundfile peft accelerate


Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.5.0


In [None]:
%%writefile app.py
import io

import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
from fastapi import FastAPI, UploadFile, File, WebSocket, Request
from fastapi import HTTPException, WebSocketDisconnect
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    AutoTokenizer,
    AutoModelForSequenceClassification
)

# ================= INIT =================
# Web application object and the Jinja2 template loader. The "templates"
# directory is a relative path, so it is resolved against the directory the
# server process is started from.
app = FastAPI(title="Nhận diện cảm xúc qua giọng nói")
templates = Jinja2Templates(directory="templates")

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ===== WHISPER =====
# Speech-to-text model and its processor, both loaded from the same directory.
WHISPER_DIR = "/content/drive/MyDrive/ColabNotebooks/BTL_NLP/WhisperFT/whisper_merged"
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_DIR)
# .to() and .eval() both return the module itself, so the calls can be chained.
whisper_model = (
    WhisperForConditionalGeneration.from_pretrained(WHISPER_DIR).to(device).eval()
)

# ===== PHOBERT =====
# Sequence-classification model and tokenizer used for emotion prediction.
PHOBERT_DIR = "/content/drive/MyDrive/ColabNotebooks/BTL_NLP/EmotionRecognition/Phobert_ft1"
phobert_tokenizer = AutoTokenizer.from_pretrained(PHOBERT_DIR)
phobert_model = (
    AutoModelForSequenceClassification.from_pretrained(PHOBERT_DIR).to(device).eval()
)

# Predicted class index -> Vietnamese display label (indices 0..6, in this order).
labels_map_vi = dict(enumerate((
    "Giận dữ",
    "Ghê tởm",
    "Sợ hãi",
    "Vui vẻ",
    "Buồn bã",
    "Ngạc nhiên",
    "Khác",
)))

# ================= UTILS =================
def whisper_transcribe(waveform, sr):
    """Transcribe an audio tensor with the Whisper model loaded at module level.

    Args:
        waveform: torch.Tensor of audio samples. A tensor with more than one
            dimension is averaged over dim 0 (assumed to be the channel axis,
            as in the tensor returned by torchaudio.load -- confirm for other
            callers).
        sr: Sampling rate of ``waveform`` in Hz. Anything other than 16000 is
            resampled to 16000 before feature extraction.

    Returns:
        str: The decoded transcript with special tokens removed, stripped and
        lower-cased. An input without any samples yields "" and the model is
        not called.
    """
    # Down-mix first so that resampling only has to process a single channel.
    if waveform.ndim > 1:
        waveform = waveform.mean(dim=0)

    # Flatten explicitly instead of squeeze(): squeeze() would turn a
    # one-sample signal into a 0-d tensor.
    waveform = waveform.reshape(-1)

    # Nothing to transcribe; skip the model instead of decoding an empty input.
    if waveform.numel() == 0:
        return ""

    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)

    # detach()/cpu() make the NumPy conversion safe for any input tensor.
    samples = waveform.detach().cpu().numpy()

    inputs = whisper_processor.feature_extractor(
        samples,
        sampling_rate=16000,
        return_tensors="pt"
    ).input_features.to(device)

    with torch.no_grad():
        ids = whisper_model.generate(
            inputs,
            max_length=225,
            no_repeat_ngram_size=3,
            repetition_penalty=1.2,
            length_penalty=1.0,
            early_stopping=True
        )

    # Only the first (and only) sequence of the batch is decoded.
    return whisper_processor.tokenizer.decode(
        ids[0],
        skip_special_tokens=True
    ).strip().lower()


def predict_emotion(text):
    """Classify the emotion expressed by a piece of text.

    Args:
        text: The text to classify (str).

    Returns:
        tuple: ``(label, confidence)`` where ``label`` is the Vietnamese label
        from ``labels_map_vi`` for the highest-probability class and
        ``confidence`` is that softmax probability as a float. Text with fewer
        than two whitespace-separated words is not sent to the model and
        yields ``("Khác", 1.0)``.
    """
    words = text.split()
    if len(words) < 2:
        return "Khác", 1.0

    batch = phobert_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )

    # Move every tokenizer output tensor to the device the model lives on.
    model_inputs = {}
    for name, tensor in batch.items():
        model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        output = phobert_model(**model_inputs)

    probabilities = F.softmax(output.logits, dim=-1)
    best = torch.max(probabilities, dim=-1)  # (values, indices) along the class axis

    label = labels_map_vi[best.indices.item()]
    return label, float(best.values.item())


# ================= ROUTES =================
@app.get("/", response_class=HTMLResponse)
def home(request: Request):
    """Render the landing page from the ``index.html`` template.

    Args:
        request: The incoming request; the template response needs it.

    Returns:
        The template response for ``index.html``.
    """
    # Pass the request explicitly by keyword. The older call form
    # TemplateResponse(name, {"request": request}) is deprecated in current
    # Starlette/FastAPI releases; the request is still exposed to the template.
    return templates.TemplateResponse(request=request, name="index.html")


@app.post("/analyze")
async def analyze_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and classify the emotion of the transcript.

    Args:
        file: The uploaded audio file (form field ``file``).

    Returns:
        dict: ``{"text": transcript, "emotion": label, "confidence": float}``.

    Raises:
        HTTPException: 400 when the upload is empty or cannot be decoded as
            audio (previously both cases surfaced as an unhandled 500).
    """
    audio_bytes = await file.read()
    if not audio_bytes:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    try:
        waveform, sr = torchaudio.load(io.BytesIO(audio_bytes))
    except Exception as exc:
        # The exception type depends on the decoding backend torchaudio uses,
        # so every decoding failure is mapped to a client error here.
        print("Audio decoding failed:", repr(exc))
        raise HTTPException(
            status_code=400,
            detail="Could not decode the uploaded file as audio."
        ) from exc

    text = whisper_transcribe(waveform, sr)
    emotion, confidence = predict_emotion(text)

    return {
        "text": text,
        "emotion": emotion,
        "confidence": confidence
    }


@app.websocket("/ws/stream")
async def websocket_stream(ws: WebSocket):
    """Transcribe and classify audio streamed over a WebSocket.

    Each binary frame is interpreted as raw float32 samples and treated as
    16 kHz audio (presumably mono -- confirm against the client). Whenever at
    least two seconds' worth of samples has accumulated, the buffered audio is
    transcribed with ``whisper_transcribe`` (so streaming uses the same
    decoding settings as the /analyze route), the transcript is classified
    with ``predict_emotion`` and a JSON message with ``text``, ``emotion`` and
    ``confidence`` is sent back. Samples still buffered when the client
    disconnects are discarded.

    Args:
        ws: The WebSocket connection.
    """
    sample_rate = 16000
    window_samples = sample_rate * 2  # flush threshold: two seconds of audio
    bytes_per_sample = np.dtype(np.float32).itemsize

    await ws.accept()
    audio_buffer = []
    buffered_samples = 0  # running total, avoids re-summing the buffer per frame

    try:
        while True:
            data = await ws.receive_bytes()

            # np.frombuffer raises when the byte count is not a multiple of the
            # sample size; drop a trailing partial sample instead of failing.
            usable = len(data) - len(data) % bytes_per_sample
            if usable == 0:
                continue

            chunk = np.frombuffer(data[:usable], dtype=np.float32)
            audio_buffer.append(chunk)
            buffered_samples += len(chunk)

            if buffered_samples < window_samples:
                continue

            # np.concatenate returns a fresh array, which torch can wrap directly.
            waveform = np.concatenate(audio_buffer)
            audio_buffer = []
            buffered_samples = 0

            # NOTE: inference is called synchronously, so this coroutine does
            # not yield to the event loop until both models have returned.
            text = whisper_transcribe(torch.from_numpy(waveform), sample_rate)
            emotion, conf = predict_emotion(text)

            await ws.send_json({
                "text": text,
                "emotion": emotion,
                "confidence": conf
            })

    except WebSocketDisconnect as e:
        # Normal end of the stream: the client went away.
        print("WebSocket closed:", e)
    except Exception as e:
        # Anything else is a real failure: report it as such and try to close
        # the connection with the "internal error" close code.
        print("WebSocket error:", repr(e))
        try:
            await ws.close(code=1011)
        except (RuntimeError, WebSocketDisconnect):
            # The connection is already closed; nothing left to do.
            pass


Overwriting app.py


In [None]:
# Run uvicorn truly in the background; its log is written to server.log instead of
# being printed to the notebook output.
# NOTE: the message below is printed unconditionally, it does not confirm that the
# server actually started -- check server.log for that.
!nohup uvicorn app:app --host 0.0.0.0 --port 8000 > server.log 2>&1 &
print("Server đang chạy ngầm. Bạn có thể chạy cell tiếp theo.")

Server đang chạy ngầm. Bạn có thể chạy cell tiếp theo.


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: the ngrok authtoken must never be written into the notebook.
# It is read from the NGROK_AUTHTOKEN environment variable, falling back to a
# hidden interactive prompt, so it does not end up in the saved file.
# The token that used to be hard-coded in this cell was published together
# with the notebook and should be revoked in the ngrok dashboard.
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")

# Store the token in the ngrok configuration file used by pyngrok.
ngrok.set_auth_token(NGROK_AUTHTOKEN)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import ngrok
# Open an ngrok tunnel to local port 8000, the port uvicorn was started on,
# and print the resulting tunnel so the public URL can be opened in a browser.
public_url = ngrok.connect(8000)
print(public_url)

NgrokTunnel: "https://42cd12b33517.ngrok-free.app" -> "http://localhost:8000"


In [None]:
!pip install torchcodec

Collecting torchcodec
  Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.9.1
