<a href="https://colab.research.google.com/github/Karunesh2580/voice-ai-call-analyzer/blob/main/callanalyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip uninstall -y youtube_dl
!pip install yt-dlp pydub speechrecognition transformers torch librosa

import os
import yt_dlp
from pydub import AudioSegment
import speech_recognition as sr
import torch
from transformers import pipeline
import librosa
import numpy as np

# 1. Download YouTube video audio using yt-dlp
def download_audio(youtube_url, output_path='call_audio.wav'):
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'temp_audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'quiet': True,
        'no_warnings': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
    # Rename the extracted wav file to output_path
    os.rename('temp_audio.wav', output_path)
    return output_path

# 2. Load audio and preprocess
def load_audio(audio_path):
    audio = AudioSegment.from_wav(audio_path)
    return audio

# 3. Speech recognition to get transcript
def transcribe_audio(audio_path):
    r = sr.Recognizer()
    audio_file = sr.AudioFile(audio_path)
    with audio_file as source:
        audio_data = r.record(source)
    try:
        # Use Google's free API (limited but works for short audio)
        text = r.recognize_google(audio_data)
    except Exception as e:
        text = ""
    return text

# 4. Calculate talk-time ratio by detecting speech presence using energy threshold
def calculate_talk_time(audio_path):
    y, sr_ = librosa.load(audio_path, sr=None)
    frame_length = int(sr_ * 0.5)  # 0.5 sec frames
    hop_length = frame_length
    energy = np.array([
        sum(abs(y[i:i+frame_length]**2))
        for i in range(0, len(y), hop_length)
    ])
    threshold = np.percentile(energy, 60)
    speech_frames = energy > threshold
    total_frames = len(speech_frames)
    speech_time = sum(speech_frames) * 0.5
    total_time = len(y) / sr_
    talk_time_ratio = speech_time / total_time
    return talk_time_ratio, total_time

# 5. Count number of questions asked (count '?' in transcript)
def count_questions(transcript):
    return transcript.count('?')

# 6. Longest monologue duration (longest continuous speech segment)
def longest_monologue(audio_path):
    y, sr_ = librosa.load(audio_path, sr=None)
    frame_length = int(sr_ * 0.5)
    hop_length = frame_length
    energy = np.array([
        sum(abs(y[i:i+frame_length]**2))
        for i in range(0, len(y), hop_length)
    ])
    threshold = np.percentile(energy, 60)
    speech_frames = energy > threshold
    max_len = 0
    current_len = 0
    for frame in speech_frames:
        if frame:
            current_len += 0.5
            max_len = max(max_len, current_len)
        else:
            current_len = 0
    return max_len

# 7. Sentiment analysis on transcript
def analyze_sentiment(text):
    sentiment_pipeline = pipeline("sentiment-analysis")
    # Limit input size to 512 tokens for transformer
    result = sentiment_pipeline(text[:512])
    label = result[0]['label']
    if label == 'POSITIVE':
        return 'positive'
    elif label == 'NEGATIVE':
        return 'negative'
    else:
        return 'neutral'

# 8. Actionable insight (simple heuristic based on keywords)
def actionable_insight(transcript):
    transcript_lower = transcript.lower()
    if 'price' in transcript_lower:
        return "Consider clarifying pricing details."
    elif 'interested' in transcript_lower:
        return "Follow up on customer's interest."
    else:
        return "Try to ask more open-ended questions."

# Main execution
youtube_url = "https://www.youtube.com/watch?v=4ostqJD3Psc"
print("Downloading audio...")
audio_path = download_audio(youtube_url)
print("Audio downloaded and converted to WAV.")

print("Loading audio...")
audio = load_audio(audio_path)

print("Transcribing audio...")
transcript = transcribe_audio(audio_path)
print(f"Transcript: {transcript}")

print("Calculating talk-time ratio...")
talk_time_ratio, total_time = calculate_talk_time(audio_path)
print(f"Talk-time ratio (speech presence): {talk_time_ratio*100:.2f}% of total {total_time:.2f} seconds")

print("Counting questions asked...")
questions = count_questions(transcript)
print(f"Number of questions asked: {questions}")

print("Calculating longest monologue duration...")
longest_mono = longest_monologue(audio_path)
print(f"Longest monologue duration: {longest_mono:.2f} seconds")

print("Analyzing sentiment...")
sentiment = analyze_sentiment(transcript)
print(f"Call sentiment: {sentiment}")

print("Generating actionable insight...")
insight = actionable_insight(transcript)
print(f"Actionable insight: {insight}")

[0mCollecting yt-dlp
  Downloading yt_dlp-2025.9.5-py3-none-any.whl.metadata (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.1/177.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting speechrecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Downloading yt_dlp-2025.9.5-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp, speechrecognition
Successfully installed speechrecognition-3.14.3 yt-dlp-2025.9.5


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


Downloading audio...
Audio downloaded and converted to WAV.
Loading audio...
Transcribing audio...
Transcript: thank you for calling Nissan my name is Lauren can I have your name my name is John Smith thank you John how can I help you I was just calling about to see how much it would cost to update the map in my car I'd be happy to help you with that today did you receive a letter from us I did do you need the customer number yes please okay it's 15243 thank you and thank you I have a 2009 Nissan Altima Nissan
Calculating talk-time ratio...
Talk-time ratio (speech presence): 39.94% of total 122.69 seconds
Counting questions asked...
Number of questions asked: 0
Calculating longest monologue duration...


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Longest monologue duration: 4.00 seconds
Analyzing sentiment...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


Call sentiment: positive
Generating actionable insight...
Actionable insight: Try to ask more open-ended questions.
