In [1]:
!pip install yt-dlp
!pip install moviepy
!pip install lingua-language-detector
!pip install -U openai-whisper
!pip install langgraph
!pip install speechbrain
!pip install langchain-ollama
!pip install langchain-openai



## Download the video and extract the audio

In [2]:
import functools
import getpass
import os
import re
import subprocess
import tempfile
from urllib.parse import urlparse

import requests
import torchaudio
import whisper
from google.colab import userdata
from langchain.chat_models import init_chat_model
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import tool
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import OllamaLLM
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import create_react_agent
from lingua import Language, LanguageDetectorBuilder
from moviepy.editor import VideoFileClip
from speechbrain.pretrained import EncoderClassifier

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
  from speechbrain.pretrained import EncoderClassifier



In [3]:
# Read the OpenAI API key from the Colab secret store (`google.colab.userdata`) and
# expose it through the `OPENAI_API_KEY` environment variable, so that the key
# never has to be written into the notebook itself.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

In [4]:


@tool
def extract_audio_from_video_url(url: str) -> str:
    """
    Extract audio from a public video URL (direct .mp4 or streaming platform like Loom/YouTube).
    Returns the local path to the extracted audio file (.wav or .mp3).

    Args:
        url: Public URL of the video. A URL whose path ends in ".mp4" is
            downloaded directly; anything else is handed to yt-dlp.

    Returns:
        The path of the extracted audio file on success. On failure no
        exception is raised; instead a message starting with
        "❌ Error extracting audio:" is returned, so callers should check that
        the result is an existing file before using it.
    """
    def is_direct_mp4(url: str) -> bool:
        # Look only at the path component so that links carrying a query string
        # or fragment (".../clip.mp4?token=abc") are still treated as direct files.
        return urlparse(url.strip()).path.lower().endswith(".mp4")

    try:
        if is_direct_mp4(url):
            # Reserve a unique temp path and release the handle right away;
            # the file is reopened below for writing.
            tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
            tmp_video.close()
            video_path = tmp_video.name
            audio_path = os.path.splitext(video_path)[0] + ".wav"

            try:
                # Stream the download; the timeout keeps a stalled server from
                # hanging the tool, and the context manager closes the connection.
                with requests.get(url, stream=True, timeout=60) as response:
                    response.raise_for_status()
                    with open(video_path, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                # Extract audio using moviepy
                clip = VideoFileClip(video_path)
                try:
                    if clip.audio is None:
                        raise ValueError("The video does not contain an audio track.")
                    clip.audio.write_audiofile(audio_path)
                finally:
                    # Release the reader processes/file handles held by moviepy.
                    clip.close()
            finally:
                # The video is only an intermediate artefact; do not leave it in /tmp.
                if os.path.exists(video_path):
                    os.remove(video_path)

            return audio_path

        else:
            # Use yt-dlp to download and extract audio from streaming URL
            tmp_dir = tempfile.mkdtemp()
            command = [
                "yt-dlp",
                "-x", "--audio-format", "mp3",
                "-o", os.path.join(tmp_dir, "%(title)s.%(ext)s"),
                # "--" ends option parsing so a URL beginning with "-" cannot be
                # interpreted as a yt-dlp command-line flag.
                "--",
                url.strip(),
            ]
            subprocess.run(command, check=True)

            # Find the downloaded .mp3 file (sorted for a deterministic choice)
            audio_files = sorted(f for f in os.listdir(tmp_dir) if f.endswith(".mp3"))
            if not audio_files:
                raise FileNotFoundError("No MP3 file found in the output directory.")

            return os.path.join(tmp_dir, audio_files[0])

    except Exception as e:
        # Deliberate best-effort: report the problem as text so that an agent
        # calling this tool can relay it instead of crashing.
        return f"❌ Error extracting audio: {str(e)}"

In [5]:
# Example Loom recording used throughout the rest of the notebook.
url = "https://www.loom.com/share/e2620712840948e3bad08334d8e56eb9?sid=82c3b48a-c448-45d4-a4dd-4a572817fc51"
# The function is wrapped by `@tool`, so it is called through `.invoke`.
# NOTE: on failure the tool returns an error message string rather than raising,
# in which case the line below would print that message as if it were a path.
audio_path = extract_audio_from_video_url.invoke(url)
print("Audio file saved at:", audio_path)

Audio file saved at: /tmp/tmpgd9luf1v/Intelligent Multi-Agent Systems Overview.mp3


## Build the agent to check whether the speech is English

In [6]:
# Load the Whisper "turbo" checkpoint for the exploratory transcription in the next cell.
# NOTE: the name `model` is rebound to a chat model in a later cell.
model = whisper.load_model("turbo")

In [7]:
# Transcribe the extracted audio; the transcript is stored under the "text" key of the result.
result = model.transcribe(audio_path)
print(result["text"])

 Hi, I'm Yutong, an A-Engineer and graduate student at Rice University, specializing in building intelligent multi-agent systems. I design A-Agent that collaborate using retrieval reasoning and planning, and task-alized legal analysis, trading strategies, and enterprise workflows. I have a planning system with Lanchine and Lanchine graph and vector database to tune LMS into dynamic decision makers. I'm now looking for founders to help bring this information into scalable, real-world products. Thank you.


In [8]:
@functools.lru_cache(maxsize=None)
def _load_whisper_model(name: str):
    """Load a Whisper checkpoint once and reuse the same object on later calls."""
    return whisper.load_model(name)


@tool
def get_text_from_audio(audio_path: str) -> str:
    """
    Transcribes an audio file and returns the transcribed text.
    """
    # The checkpoint is cached so that repeated tool calls do not reload the
    # model weights each time.
    result = _load_whisper_model("turbo").transcribe(audio_path)
    return result["text"]

In [9]:
@functools.lru_cache(maxsize=1)
def _build_language_detector():
    """Build the all-languages detector once; later calls reuse the same instance."""
    return LanguageDetectorBuilder.from_all_languages().build()


def detect_language_with_confidence(text: str) -> dict:
    """Detect the most likely language of `text` together with its confidence.

    Args:
        text: The text to analyse.

    Returns:
        A dict with two keys:
        - "language": the name of the detected language (e.g. "ENGLISH"), or
          "UNKNOWN" when the detector cannot decide (for example on empty text).
        - "confidence": the detector's confidence for that language, rounded
          to four decimals (0.0 when the language is unknown).
    """
    detector = _build_language_detector()

    # Detect the most likely language; the detector yields None when it cannot
    # reliably pick one, which previously crashed on `.name`.
    detected_language = detector.detect_language_of(text)
    if detected_language is None:
        return {
            "language": "UNKNOWN",
            "confidence": 0.0
        }

    # Ask only for the confidence of the detected language instead of computing
    # the values for every supported language and searching the list.
    confidence_score = round(
        detector.compute_language_confidence(text, detected_language), 4
    )

    return {
        "language": detected_language.name,
        "confidence": confidence_score
    }

In [10]:
# Sanity-check the language detector on the transcript produced above.
detect_language_with_confidence(result["text"])

{'language': 'ENGLISH', 'confidence': 1.0}

## If it is English, we classify the accent

In [11]:
# Load the pretrained accent-identification model from the Hugging Face Hub;
# its files are stored under `savedir`.
classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/accent-id-commonaccent_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'Jzuluaga/accent-id-commonaccent_ecapa' if not cached
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/accent-id-commonaccent_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pre

In [12]:
# Run the accent classifier on the extracted audio file.
# `text_lab` holds the predicted label(s) and `score` the associated score;
# `out_prob` and `index` are presumably the per-class outputs and the winning
# class index — TODO confirm against the SpeechBrain documentation.
out_prob, score, index, text_lab = classifier.classify_file(audio_path)



In [13]:
# Display the predicted accent label(s).
text_lab

['us']

In [14]:
# Display the score the classifier returned for its prediction.
score

tensor([0.6970])

In [15]:
@functools.lru_cache(maxsize=1)
def _build_language_detector():
    """Build the all-languages detector once; later calls reuse the same instance."""
    return LanguageDetectorBuilder.from_all_languages().build()


# NOTE: this rebinds `detect_language_with_confidence`, which an earlier
# exploratory cell defined as a plain function, to a tool object.
@tool
def detect_language_with_confidence(text: str) -> dict:
    """Detects the most likely language of the input text and returns its confidence score.

    Returns a dict with "language" (the language name, e.g. "ENGLISH", or
    "UNKNOWN" when no language can be determined) and "confidence" (rounded to
    four decimals, 0.0 when the language is unknown).
    """
    detector = _build_language_detector()

    # Detect the most likely language; the detector yields None when it cannot
    # reliably pick one, which previously crashed on `.name`.
    detected_language = detector.detect_language_of(text)
    if detected_language is None:
        return {
            "language": "UNKNOWN",
            "confidence": 0.0
        }

    # Ask only for the confidence of the detected language instead of computing
    # the values for every supported language and searching the list.
    confidence_score = round(
        detector.compute_language_confidence(text, detected_language), 4
    )

    return {
        "language": detected_language.name,
        "confidence": confidence_score
    }

In [16]:
@functools.lru_cache(maxsize=1)
def _load_accent_classifier():
    """Load the pretrained accent classifier once and reuse it on later calls."""
    return EncoderClassifier.from_hparams(
        source="Jzuluaga/accent-id-commonaccent_ecapa",
        savedir="pretrained_models/accent-id-commonaccent_ecapa"
    )


@tool
def classify_accent(audio_path: str) -> dict:
    """Classifies the accent from an English speech audio file and returns the accent and confidence score.

    Returns a dict with "accent" (the label list produced by the classifier,
    e.g. ["us"]) and "confidence" (the classifier score rounded to four decimals).
    """
    # The classifier is cached so that repeated tool calls do not reload the
    # checkpoint each time.
    _, score, _, text_lab = _load_accent_classifier().classify_file(audio_path)

    return {
        "accent": text_lab,
        # `score` arrives as a one-element tensor; convert it to a plain float.
        "confidence": round(float(score), 4)
    }

In [17]:
from langchain_openai import ChatOpenAI

In [18]:
# Initialize the LLM (replace with your preferred provider if needed)


@tool
def summarize_text(text: str) -> str:
    """Summarizes a given block of text using a language model."""
    llm = ChatOpenAI(model_name="gpt-4o-mini")
    prompt = (
        "Please provide a short, clear summary of the following text:\n\n"
        f"{text}\n\n"
        "Summary:"
    )

    # `invoke` returns a chat message object; return only its text so the
    # result matches the declared `str` return type.
    return llm.invoke(prompt).content

In [19]:
@tool
def audio_question_answer(audio_path: str, question: str) -> str:
    """
    Transcribes an audio file, creates embeddings with Ollama, and answers a user question using RAG.
    Input: path to audio file (.mp3/.wav), and a question in string form.
    Output: Answer from the RAG-augmented Ollama `llama3` model.
    """

    template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question}
    Context: {context}
    Answer:
    """

    # ========== TRANSCRIBE ==========
    whisper_model = whisper.load_model("medium.en")
    transcript = whisper_model.transcribe(audio_path)["text"]

    # ========== SPLIT ==========
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
    chunks = splitter.split_text(transcript)
    if not chunks:
        # Nothing was transcribed (silent or empty audio): there is no context
        # to index or retrieve, so answer directly instead of querying the models.
        return "I don't know: no speech could be transcribed from the audio."

    # ========== INDEX ==========
    embeddings = OllamaEmbeddings(model="llama3")
    vector_store = InMemoryVectorStore(embeddings)
    vector_store.add_texts(chunks)

    # ========== RETRIEVE ==========
    retrieved_docs = vector_store.similarity_search(question)
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)

    # ========== ANSWER ==========
    model = OllamaLLM(model="llama3")
    chain = ChatPromptTemplate.from_template(template) | model
    raw_answer = chain.invoke({"question": question, "context": context})

    # ========== CLEAN ==========
    # Strip any <think>...</think> block that a reasoning model may emit
    # before its final answer.
    return re.sub(r"<think>.*?</think>", "", raw_answer, flags=re.DOTALL).strip()





In [28]:
# memory = MemorySaver()
# Chat model that drives the agent.
# NOTE: this rebinds `model`, which an earlier cell used for the Whisper model.
model = init_chat_model("gpt-4o-mini", model_provider="openai")
# Tools exposed to the agent; `audio_question_answer` is defined above but is not included.
tools = [extract_audio_from_video_url, get_text_from_audio, detect_language_with_confidence, classify_accent, summarize_text]

In [21]:
# System prompt for the agent. It is passed to `create_react_agent` as a plain
# string and is never run through `str.format`, so it must not contain
# `{placeholder}` fields: they would reach the model as literal braces.
prompt = """
You are an AI assistant specialized in analyzing spoken English from video content.

Your task is to process the video whose URL the user provides in their message.

Follow these steps carefully and sequentially:

1. Use the tool `extract_audio_from_video_url` to download the video and extract its audio.
   - Remember the local path of the resulting audio file; later steps need it.
   - If the tool returns a message starting with "❌ Error" instead of a file path, stop and report that error to the user.

2. Use the tool `get_text_from_audio` with the audio file path to transcribe the audio into text.
   - Remember the transcribed text; later steps need it.

3. Use the tool `detect_language_with_confidence` with the transcribed text to determine the primary spoken language.
   - Capture both the language name and its confidence score.

4. If the detected language is English:
   a. Use the tool `classify_accent` with the audio file path to identify the speaker's English accent and its confidence score.
   b. Use the tool `summarize_text` with the transcribed text, then present the summary as 3–4 concise bullet points.

5. If the language is not English, tell which language it is and clearly state that only English-language processing is currently supported.
"""


In [22]:
# Build the tool-calling agent from the chat model, the tool list, and the system prompt.
agent = create_react_agent(model, tools=tools, prompt=prompt)

In [23]:
from langchain_core.messages import HumanMessage


In [24]:
# Run the agent end to end; the video URL is supplied in the user message.
response = agent.invoke({"messages": [HumanMessage(content=f"the video url is {url}")]})

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/accent-id-commonaccent_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'Jzuluaga/accent-id-commonaccent_ecapa' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/accent-id-commonaccent_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pretrained_models/accent-id-commonaccent_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/accent-id-commonaccent_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/pretrained_models/accent-id-commonaccent_ecapa/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = 

In [26]:
# Prefer an "output" entry when the response has one; otherwise fall back to
# the content of the last message in the conversation.
final_message = response.get("output") or response.get("messages")[-1].content
print(final_message)

Here are the results from the audio analysis:

### Detected Language
- **Language**: English
- **Confidence**: 1.0

### Accent Classification
- **Accent**: US
- **Confidence**: 0.697

### Summary of the Transcription
- Yutong is an A-Engineer and graduate student at Rice University focused on developing intelligent multi-agent systems.
- They design A-Agents that collaborate through retrieval reasoning and planning, specifically for legal analysis, trading strategies, and enterprise workflows.
- Yutong has created a planning system utilizing Lanchine graphs and a vector database to enhance decision-making capabilities.
- They are seeking founders to help translate this technology into scalable, real-world products.
