In [1]:
from google.colab import userdata
from huggingface_hub import login

# Retrieve the token from Colab secrets
hf_token = userdata.get('HF_TOKEN')

# Login to the Hugging Face Hub using the token
if hf_token:
    login(hf_token)
    print("Successfully logged in to Hugging Face!")
else:
    print("HF_TOKEN secret not found or notebook access not enabled.")


Successfully logged in to Hugging Face!


In [3]:
!pip uninstall -y numpy torch torchvision torchaudio transformers torchcodec pyannote.audio
!pip uninstall -y torch torchvision torchaudio transformers torchcodec pyannote.audio



Found existing installation: numpy 2.4.2
Uninstalling numpy-2.4.2:
  Successfully uninstalled numpy-2.4.2
Found existing installation: torch 2.2.2
Uninstalling torch-2.2.2:
  Successfully uninstalled torch-2.2.2
Found existing installation: torchvision 0.17.2
Uninstalling torchvision-0.17.2:
  Successfully uninstalled torchvision-0.17.2
Found existing installation: torchaudio 2.2.2
Uninstalling torchaudio-2.2.2:
  Successfully uninstalled torchaudio-2.2.2
Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Successfully uninstalled transformers-4.41.2
[0m

In [1]:
!pip install numpy==1.26.4
!pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2
!pip install transformers==4.41.2 accelerate soundfile


Collecting torch==2.2.2
  Using cached torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision==0.17.2
  Using cached torchvision-0.17.2-cp312-cp312-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.2.2
  Using cached torchaudio-2.2.2-cp312-cp312-manylinux1_x86_64.whl.metadata (6.4 kB)
Using cached torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl (755.5 MB)
Using cached torchvision-0.17.2-cp312-cp312-manylinux1_x86_64.whl (6.9 MB)
Using cached torchaudio-2.2.2-cp312-cp312-manylinux1_x86_64.whl (3.3 MB)
Installing collected packages: torch, torchvision, torchaudio
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.2.2 requires transformers<6.0.0,>=4.41.0, which is not installed.
peft 0.18.1 requires transformers, which is not installed.[0m[31m
[0mSuccessfully installed torch-2.2.2 torchaudio-

In [4]:
import torch
import torchvision
import numpy as np

print("numpy:", np.__version__)
print("torch:", torch.__version__)
print("torchvision:", torchvision.__version__)


numpy: 1.26.4
torch: 2.2.2+cu121
torchvision: 0.17.2+cu121


In [8]:
from IPython.display import display, Javascript
from google.colab import output

def record_audio(seconds=20):
    js = f"""
    async function record() {{
        const stream = await navigator.mediaDevices.getUserMedia({{audio: true}});
        const recorder = new MediaRecorder(stream);
        let chunks = [];

        recorder.ondataavailable = e => chunks.push(e.data);
        recorder.start();
        await new Promise(r => setTimeout(r, {seconds * 1000}));
        recorder.stop();

        recorder.onstop = async () => {{
            const blob = new Blob(chunks, {{ type: 'audio/wav' }});
            const buffer = await blob.arrayBuffer();
            const bytes = new Uint8Array(buffer);
            google.colab.kernel.invokeFunction(
                'notebook.save_audio',
                [Array.from(bytes)],
                {{}}
            );
        }};
    }}
    record();
    """
    display(Javascript(js))

def save_audio(data):
    with open("input.wav", "wb") as f:
        f.write(bytes(data))

output.register_callback("notebook.save_audio", save_audio)

record_audio(10)


<IPython.core.display.Javascript object>

In [1]:
from transformers import pipeline

# Load ASR pipeline
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device_map="auto",
    chunk_length_s=30,
    return_timestamps=True
)

# Transcribe audio file
result = asr("input.wav")

# Full transcript
print("Full transcript:")
print(result["text"])
print("\n--- Segments ---")

# Build structured output (Who / When / What)
final = []

for chunk in result["chunks"]:
    final.append({
        "speaker": "Speaker_Unknown",
        "start": round(chunk["timestamp"][0], 2),
        "end": round(chunk["timestamp"][1], 2),
        "text": chunk["text"].strip()
    })

# Display structured output
for f in final:
    print(
        f"[{f['start']} → {f['end']}] "
        f"{f['speaker']}: {f['text']}"
    )
import json

with open("transcript.json", "w") as f:
    json.dump(final, f, indent=2)

    #NOTE - Diarization uses pyannote.audio lib which is incompatible with python 3.12 version.. Diarization solves the WHO part.. i.e who is speaking in the conversation..Speaker1, Speaker2 and so on... What and When gets resolved automatically with or without diarization...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Full transcript:
 Hello, Hi, my name is Jai. Hi Jai, what are you doing? This is Keevan.

--- Segments ---
[0.0 → 4.0] Speaker_Unknown: Hello, Hi, my name is Jai.
[4.0 → 6.0] Speaker_Unknown: Hi Jai, what are you doing?
[6.0 → 10.62] Speaker_Unknown: This is Keevan.
