# Voice-to-Insights AI system 

 * An AI system that transforms audio files into structured, actionable insights using **Faster Whisper**, **LLMs**,
    **Sentiment Analysis**, a **FastAPI REST API**, and **Dash**
    
* **You can find the project's GitHub repository at [this link](https://github.com/Nagwam18/Voice-to-Insights-Processing-System/tree/main)**

# Installation

In [None]:
# requirements.txt
# Core LLM dependencies
# transformers==4.45.2
# accelerate==0.34.2
# safetensors==0.4.3
# sentencepiece

# # PyTorch + CUDA 11.8 (for Kaggle & Llama)
# torch==2.3.1+cu118
# torchvision==0.18.1+cu118
# torchaudio==2.3.1+cu118
# --extra-index-url https://download.pytorch.org/whl/cu118

# # # Optional dependencies
# av
# # Core LLM dependencies
# transformers==4.45.2
# accelerate==0.34.2
# safetensors==0.4.3
# sentencepiece

# # PyTorch + CUDA 11.8 (for Kaggle & Llama)
# torch==2.3.1+cu118
# torchvision==0.18.1+cu118
# torchaudio==2.3.1+cu118
# -f https://download.pytorch.org/whl/cu118

# # Optional dependencies
# av
# protobuf==5.26.1

# Install the pinned packages from the requirements file stored under /kaggle/input.
!pip install -r /kaggle/input/install-2/requirements.txt


In [None]:
# Extra packages installed on top of the requirements file:
# pydantic-ai (Agent / structured output), huggingface_hub (login),
# pyngrok (public tunnel), and the Dash UI stack plus requests and pydub.
!pip install pydantic-ai
!pip install huggingface_hub
!pip install pyngrok
!pip install dash dash-bootstrap-components requests pydub

# Imports

In [None]:
# Core utilities
import re
import json
from threading import Thread

# Machine learning & NLP libraries
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Whisper speech-to-text
from faster_whisper import WhisperModel

# FastAPI server
from fastapi import FastAPI
from fastapi.responses import JSONResponse
import uuid
import uvicorn
import nest_asyncio
import requests

#pydantic set-up
from typing import List, Optional
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from typing import List
from transformers import pipeline
from pydantic_ai.models.huggingface import HuggingFaceModel


# Log in to Hugging Face

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Access tokens must never be committed to a notebook. The token is read from
# the HF_TOKEN environment variable, falling back to an interactive prompt.
# NOTE: the token that used to be hard-coded here is exposed and must be revoked.
_hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
# Keep the environment in sync so the token is only asked for once per session.
os.environ.setdefault("HF_TOKEN", _hf_token)
login(token=_hf_token)

In [None]:
import os

# The token has to come from the environment (for example a notebook secret
# exported as HF_TOKEN), never from source. If it is missing, prompt once and
# export it; presumably the Hugging Face clients used later read this variable.
# NOTE: the token that used to be hard-coded here is exposed and must be revoked.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")


In [None]:
import warnings

# Silence every UserWarning for the rest of the session.
warnings.simplefilter("ignore", category=UserWarning)

In [None]:
import functools


@functools.lru_cache(maxsize=1)
def _load_whisper_model(model_size, device, compute_type):
    """Build a WhisperModel once and reuse it for identical settings."""
    return WhisperModel(model_size, device=device, compute_type=compute_type)


def get_transcription(file_path, model_size="large-v2", device="cuda",
                      compute_type="float16", beam_size=5):
    """Transcribe an audio file to plain text.

    The model is cached between calls, so only the first call (or a call with
    different model settings) pays the model-loading cost.

    Args:
        file_path: Path of the audio file handed to ``WhisperModel.transcribe``.
        model_size: Whisper model name; defaults to the original "large-v2".
        device: Device the model is loaded on; defaults to "cuda".
        compute_type: Numeric precision of the model; defaults to "float16".
        beam_size: Beam width used for decoding; defaults to 5.

    Returns:
        The text of all segments concatenated without a separator, with
        leading and trailing whitespace removed.
    """
    model = _load_whisper_model(model_size, device, compute_type)
    segments, _ = model.transcribe(file_path, beam_size=beam_size)
    return "".join(segment.text for segment in segments).strip()


In [None]:
# Sample recordings under /kaggle/input; uncomment exactly one path to switch files.
# voice_path= "/kaggle/input/test-voice/test1.mp3"
# voice_path="/kaggle/input/test-voice/test2.mp3"
# voice_path="/kaggle/input/test-voice/test3.mp3"
# voice_path="/kaggle/input/test2-voice/test4.mp3"
# voice_path="/kaggle/input/test3-voice/test5.mp3"
voice_path="/kaggle/input/test3-voice/test6.mp3"

# Transcribe the selected file and show the raw text. `transcript` is kept as a
# notebook-level variable because the analysis cell further down reads it.
transcript = get_transcription(voice_path)
print(transcript)

In [None]:
# Schema of the structured insights extracted from a transcript.
# The Field descriptions are part of the schema itself, so edit them with care.
class Insights(BaseModel):
    # Short abstract of the input text.
    summary: str = Field(description="Concise summary of the input text in 2–3 sentences")
    
    # Key terms mentioned in the text, one string per entity.
    entities: List[str] = Field(description="Important names, identifiers, objects, or key terms mentioned")
    
    # Actions, requests or next steps, one string per action.
    actions: List[str] = Field(description="Explicit or implicit actions, requests, or next steps described")


In [None]:
# System prompt for the extraction agent: domain-agnostic instructions that
# demand JSON-only output matching the schema, with rules for the
# 'entities' and 'actions' fields.
PROMPT = """
You are a universal information extraction engine.

Analyze the input text regardless of domain (technical, business, casual, medical, legal, etc.).
Your task is to extract structured insights that strictly follow the provided schema.

Rules:
1. Return ONLY valid JSON that matches the schema exactly.
2. Do NOT add extra fields or explanations.
3. 'entities' must be concrete and specific (IDs, names, objects, issues).
4. 'actions' must describe what happened or what should happen next.
5. Infer implicit actions when reasonable.

Be concise, accurate, and deterministic.
"""


In [None]:
# Extraction agent: Qwen2.5-7B-Instruct wrapped in pydantic-ai's Hugging Face
# model class, driven by PROMPT, with `Insights` as its output type.
agent = Agent(
    HuggingFaceModel("Qwen/Qwen2.5-7B-Instruct"),
    system_prompt=PROMPT,
    output_type=Insights,
)


In [None]:
async def analyze_text(text: str) -> dict:
    """Run the extraction agent on ``text`` and return the insights as a dict.

    The agent's structured output (an ``Insights`` instance) is converted with
    ``model_dump()``, so the result carries the keys ``summary``, ``entities``
    and ``actions``.
    """
    run_result = await agent.run(text)
    return run_result.output.model_dump()

# Top-level `await` relies on the notebook's already-running event loop.
# `transcript` is the text produced by the transcription cell above.
insight_json = await analyze_text(transcript)
print(insight_json)


In [None]:
 import functools


 @functools.lru_cache(maxsize=1)
 def _load_sentiment_pipeline():
     """Build the sentiment-analysis pipeline once and reuse it afterwards."""
     # Use the first GPU when one is available, otherwise run on the CPU (-1).
     device = 0 if torch.cuda.is_available() else -1
     return pipeline("sentiment-analysis", device=device)


 def sentiment_analysis(transcript):
     """Map the sentiment of a transcript to a human-readable description.

     Args:
         transcript: Text to classify.

     Returns:
         "Frustrated but cooperative" for a NEGATIVE label,
         "Satisfied and cooperative" for a POSITIVE label, and
         "Neutral but cooperative" for any other label or for empty input.
     """
     neutral = "Neutral but cooperative"
     descriptions = {
         "NEGATIVE": "Frustrated but cooperative",
         "POSITIVE": "Satisfied and cooperative",
     }

     # Nothing to classify: skip the model call altogether.
     if not transcript or not transcript.strip():
         return neutral

     sentiment_analyzer = _load_sentiment_pipeline()
     # Truncate so transcripts longer than the model's input limit do not raise.
     result = sentiment_analyzer(transcript, truncation=True)[0]
     return descriptions.get(result['label'], neutral)

# FastAPI App

In [None]:
# Presumably needed so that event loops can be nested inside the notebook's
# already-running loop (nest_asyncio's purpose) — confirm before removing.
nest_asyncio.apply()
# REST application. NOTE(review): the name `app` is rebound to the Dash app in a later cell.
app = FastAPI()

# In-memory map: session_id -> {"status": ..., "results": {...}}.
results_store = {}

@app.post("/process_audio")
async def process_audio():
    """Run the full pipeline on the sample recording and store the result.

    The recording is transcribed, analysed by the extraction agent and
    classified for sentiment. The outcome is saved in ``results_store`` under a
    fresh session id before the response is sent, so the reported status is
    "completed" and the results can be fetched immediately.

    Returns:
        JSONResponse with ``session_id``, ``status`` and ``message``.
    """
    import asyncio

    session_id = str(uuid.uuid4())

    # Sample recording processed by this endpoint.
    voice_path = "/kaggle/input/test3-voice/test6.mp3"

    # Transcription and sentiment classification are blocking calls; run them
    # in a worker thread so the event loop can keep serving other requests.
    transcript = await asyncio.to_thread(get_transcription, voice_path)
    insights_json = await analyze_text(transcript)
    sentiment_json = await asyncio.to_thread(sentiment_analysis, transcript)

    results_store[session_id] = {
        "status": "completed",
        "results": {
            "transcript": transcript,
            "insights": insights_json,
            "sentiment": sentiment_json,
        },
    }

    # Report the same status that was stored: the work is already done here.
    return JSONResponse({
        "session_id": session_id,
        "status": "completed",
        "message": "Audio processed. Results are available."
    })


@app.get("/results/{session_id}")
async def get_results(session_id: str):
    """Return the stored results and status of a session, or a 404 error."""
    try:
        entry = results_store[session_id]
    except KeyError:
        return JSONResponse({"error": "Session not found"}, status_code=404)

    payload = {
        "session_id": session_id,
        "results": entry["results"],
        "processing_status": entry["status"],
    }
    return JSONResponse(payload)
import nest_asyncio
nest_asyncio.apply()
import socket
import time


def run_api():
    """Serve the FastAPI app with uvicorn on all interfaces, port 8003 (blocking)."""
    uvicorn.run(app, host="0.0.0.0", port=8003)


def _wait_until_listening(port, host="127.0.0.1", timeout=30.0, poll_interval=0.2):
    """Return True once a TCP connection to host:port succeeds, False on timeout."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=poll_interval):
                return True
        except OSError:
            # Not accepting connections yet; retry shortly.
            time.sleep(poll_interval)
    return False


# Daemon thread: the server must not keep the process alive on its own.
thread = Thread(target=run_api, daemon=True)
thread.start()
# Wait for the port to accept connections instead of sleeping a fixed time, so
# the next cell cannot send its request before the server is ready.
if not _wait_until_listening(8003):
    raise RuntimeError("FastAPI server did not start listening on port 8003 within 30 seconds")

In [None]:
# Smoke test of the REST API: trigger a processing run, then fetch its result.
# 127.0.0.1 is the loopback address available on every platform; the server
# listens on 0.0.0.0:8003, which includes it.
# Only the connection phase is time-limited because processing itself is slow.
resp = requests.post("http://127.0.0.1:8003/process_audio", timeout=(5, None))
resp.raise_for_status()  # surface HTTP errors here instead of a KeyError below
session_id = resp.json()["session_id"]
resp_results = requests.get(f"http://127.0.0.1:8003/results/{session_id}", timeout=30)

print("GET status:", resp_results.status_code)
print("GET response:")
print(json.dumps(resp_results.json(), indent=4))

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok, conf

# The ngrok authtoken must never be committed to a notebook. It is read from
# the NGROK_AUTHTOKEN environment variable, falling back to a prompt.
# NOTE: the token that used to be hard-coded here is exposed and must be revoked.
conf.get_default().auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")

In [None]:
import dash
from dash import html, dcc, Input, Output, State
import dash_bootstrap_components as dbc
import base64
import tempfile
import asyncio
from threading import Thread
from pyngrok import ngrok

def _result_section(title, output_id):
    """Return the heading and the empty card that a callback fills for one result."""
    return [
        html.H4(title),
        dbc.Card(dbc.CardBody(id=output_id), className="mb-3"),
    ]


# NOTE: this rebinds `app`, which earlier referred to the FastAPI application;
# the API server thread was already started with that object.
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server

# Page layout: title, upload/controls row, then two columns of result cards.
app.layout = dbc.Container([
    html.H1("Voice-to-Insights Processing System", className="text-center my-4", style={"fontSize": "50px"}),

    dbc.Row([
        dbc.Col([
            dcc.Upload(
                id='upload-audio',
                children=html.Div(id="upload-text", children=['Drag and Drop or ', html.A('Select Audio File')]),
                # Style keys are camelCase throughout ('marginBottom', not 'margin-bottom').
                style={'width': '100%', 'height': '60px', 'lineHeight': '60px', 'borderWidth': '1px',
                       'borderStyle': 'dashed', 'borderRadius': '5px', 'textAlign': 'center',
                       'marginBottom': '10px'},
                multiple=False
            ),
            dbc.Button("Process Audio", id="process-btn", color="primary", n_clicks=0),
            html.Div(id="loading-output", className="mt-2"),
            html.Div(id="ngrok-link", className="mt-2")
        ], width=6)
    ], justify="center"),

    dbc.Row([
        dbc.Col([
            *_result_section("Transcript", "transcript-output"),
            *_result_section("Summary", "summary-output"),
        ], width=6),

        dbc.Col([
            *_result_section("Entities", "entities-output"),
            *_result_section("Actions", "actions-output"),
            *_result_section("Sentiment", "sentiment-output"),
        ], width=6),
    ])
], fluid=True)

@app.callback(
    Output("upload-text", "children"),
    Input("upload-audio", "contents")
)
def update_upload_text(contents):
    """Swap the upload prompt for a confirmation once a file has been uploaded."""
    if not contents:
        # Nothing uploaded yet: keep showing the original prompt.
        return ['Drag and Drop or ', html.A('Select Audio File')]
    return "Audio uploaded ✅"


@app.callback(
    Output("loading-output", "children"),
    Output("transcript-output", "children"),
    Output("summary-output", "children"),
    Output("entities-output", "children"),
    Output("actions-output", "children"),
    Output("sentiment-output", "children"),
    Input("process-btn", "n_clicks"),
    State("upload-audio", "contents")
)
def process_audio_callback(n_clicks, audio_contents):
    """Process the uploaded audio and fill the result cards.

    Args:
        n_clicks: Click count of the "Process Audio" button.
        audio_contents: Upload contents in the form "<header>,<base64 payload>",
            or None when nothing has been uploaded.

    Returns:
        A 6-tuple matching the declared outputs: loading text (always empty),
        transcript, summary, comma-separated entities, actions as a bullet
        list, and the sentiment description. All six are empty strings when
        the button has not been clicked or no audio is available.
    """
    import os

    if not n_clicks or audio_contents is None:
        return "", "", "", "", "", ""

    # Only the base64 payload after the first comma is needed.
    _, encoded = audio_contents.split(",", 1)

    # The transcriber needs a file path, so persist the upload to a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_file.write(base64.b64decode(encoded))
        audio_path = tmp_file.name

    try:
        transcript = get_transcription(audio_path)
        insights_json = asyncio.run(analyze_text(transcript))
        sentiment_json = sentiment_analysis(transcript)
    finally:
        # Always remove the temp file, even when processing fails.
        os.remove(audio_path)

    return (
        "",
        transcript,
        insights_json["summary"],
        ", ".join(insights_json["entities"]),
        # Newlines collapse in HTML, so render one list item per action.
        html.Ul([html.Li(action) for action in insights_json["actions"]]),
        sentiment_json,
    )

def run_dash_with_ngrok():
    """Expose the Dash app through an ngrok tunnel and serve it (blocking).

    Opens a tunnel to local port 8050, prints the public URL, registers a
    callback that shows that URL in the "ngrok-link" element, then starts the
    Dash server on the same port.
    """
    port = 8050
    # ngrok.connect() returns a tunnel object; the link needs its URL string,
    # because the object itself cannot be sent to the browser as a component prop.
    public_url = ngrok.connect(port).public_url
    print("Open this link in your browser:", public_url)

    @app.callback(
        Output("ngrok-link", "children"),
        Input("upload-audio", "contents")
    )
    def show_ngrok_link(_):
        return html.A(public_url, href=public_url, target="_blank")

    app.run(port=port, debug=False)

# Serve the Dash app from a background thread so the notebook cell returns.
# NOTE(review): this reuses the name `thread` from the API server cell and,
# unlike that thread, is not created with daemon=True.
thread = Thread(target=run_dash_with_ngrok)
thread.start()
