In [None]:
!apt-get install poppler-utils ffmpeg
!pip install fsspec==2023.9.2
!pip install git+https://github.com/illuin-tech/colpali
!pip install pdf2image
!pip install openai
!pip install --no-deps fast-plaid fastkmeans
# !pip install torchvision --upgrade
!pip install flash-attn==2.7.3 --no-build-isolation
!pip install moviepy pydub
!pip install --force-reinstall https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz

In [15]:
from google.colab import files, userdata
from pydub import AudioSegment
import numpy as np
import io
from scipy.io import wavfile
import io
import base64
from scipy.io.wavfile import write
import requests
from moviepy.editor import VideoFileClip
from openai import OpenAI
from IPython.display import Audio, display


# Fetch the OpenAI key stored in Colab's user secrets under the name "OPENAI",
# then build the API client with it.
api_key = userdata.get("OPENAI")
client = OpenAI(api_key=api_key)

In [23]:
# Alternative input (disabled): upload a local .mp4 file and convert its audio track to WAV.
# uploaded = files.upload()  # Choose your .mp4 file
# # Convert to WAV format
# video = VideoFileClip("<local_file>.mp4")
# audio = video.audio
# audio.write_audiofile("audio.wav")

In [3]:
# Download a youtube video from URL
!yt-dlp https://www.youtube.com/watch?v=9vM4p9NN0Ts --extract-audio --audio-format wav -o "audio.%(ext)s"

[youtube] Extracting URL: https://www.youtube.com/watch?v=9vM4p9NN0Ts
[youtube] 9vM4p9NN0Ts: Downloading webpage
[youtube] 9vM4p9NN0Ts: Downloading tv client config
[youtube] 9vM4p9NN0Ts: Downloading player 69b31e11-main
[youtube] 9vM4p9NN0Ts: Downloading tv player API JSON
[youtube] 9vM4p9NN0Ts: Downloading ios player API JSON
[youtube] 9vM4p9NN0Ts: Downloading m3u8 information
[info] 9vM4p9NN0Ts: Downloading 1 format(s): 251
[download] Destination: audio.webm
[K[download] 100% of   75.27MiB in [1;37m00:00:01[0m at [0;32m43.51MiB/s[0m
[ExtractAudio] Destination: audio.wav
Deleting original file audio.webm (pass -k to keep)


In [5]:
# Split the downloaded recording into 30-second mono clips resampled to 16 kHz,
# keeping each clip as an array of samples in `audios`.

target_rate = 16000
chunk_length_ms = 30 * 1000  # 30 seconds

# Full-length source recording
audio = AudioSegment.from_wav("audio.wav")

audios = []
for start_ms in range(0, len(audio), chunk_length_ms):
    # Slice one window, then down-mix to mono and resample it;
    # the final window may be shorter than chunk_length_ms.
    segment = audio[start_ms:start_ms + chunk_length_ms]
    segment = segment.set_channels(1).set_frame_rate(target_rate)

    # Round-trip through an in-memory WAV file to obtain the samples as an array
    wav_buffer = io.BytesIO()
    segment.export(wav_buffer, format="wav")
    wav_buffer.seek(0)
    sample_rate, samples = wavfile.read(wav_buffer)
    audios.append(samples)

print(f"Number of chunks: {len(audios)}")

Number of chunks: 210


In [None]:
# Sanity check: play back one of the chunks (index 23) at the 16 kHz rate used above.
preview_clip = Audio(audios[23], rate=16000, autoplay=True)
display(preview_clip)

In [None]:
import torch
from PIL import Image
from transformers.utils.import_utils import is_flash_attn_2_available

from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor


# Checkpoint identifiers.
# NOTE(review): the processor is loaded from a different repo ("manu/...") than
# the model weights ("vidore/...") — confirm this is intended.
model_repo = "vidore/colqwen-omni-v0.1"
processor_repo = "manu/colqwen-omni-v0.1"

# Request FlashAttention-2 when it is available; otherwise pass None.
attn_impl = "flash_attention_2" if is_flash_attn_2_available() else None

model = ColQwen2_5Omni.from_pretrained(
    model_repo,
    torch_dtype=torch.bfloat16,
    device_map="cuda",  # or "mps" if on Apple Silicon
    attn_implementation=attn_impl,
)
model = model.eval()  # inference only

processor = ColQwen2_5OmniProcessor.from_pretrained(processor_repo)

In [9]:
def audio_to_base64(data, rate=16000):
    """Serialize audio samples to WAV and return the file as a base64 string.

    Args:
        data: Sample array accepted by ``scipy.io.wavfile.write``.
        rate: Sample rate in Hz stored in the WAV header (default 16000).

    Returns:
        The WAV file bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    wav_buffer = io.BytesIO()
    write(wav_buffer, rate, data)

    # getvalue() returns the whole buffer regardless of the stream position
    wav_bytes = wav_buffer.getvalue()
    return base64.b64encode(wav_bytes).decode("utf-8")

def get_results(query: str, k=10):
    """Return the indices of the best-scoring audio chunks for a text query.

    Relies on the notebook-level ``processor``, ``model`` and ``ds`` (the list
    of chunk embeddings) being defined before the function is called.

    Args:
        query: Text query to embed and score against the chunk embeddings.
        k: Maximum number of chunk indices to return (default 10). It is
            clamped to the number of scored chunks so a small index does not
            make ``topk`` raise.

    Returns:
        A list of at most ``k`` integer indices into ``ds``, ordered from
        highest to lowest score.
    """
    batch_queries = processor.process_queries([query]).to(model.device)

    # Forward pass (inference only, no gradients needed)
    with torch.no_grad():
        query_embeddings = model(**batch_queries)

    scores = processor.score_multi_vector(query_embeddings, ds)
    query_scores = scores[0]  # scores for the single query that was passed in

    # torch.topk raises if k exceeds the size of the scored dimension
    k = min(k, query_scores.shape[-1])
    return query_scores.topk(k).indices.tolist()

In [10]:
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader

# Embed every audio chunk with the model, BATCH_SIZE chunks per forward pass.
BATCH_SIZE = 2

dataloader = DataLoader(
    dataset=audios,
    batch_size=BATCH_SIZE,
    shuffle=False,  # keep the original order so ds[i] corresponds to audios[i]
    collate_fn=processor.process_audios,
)

# One embedding tensor per audio chunk, kept on the CPU.
ds = []
for batch_doc in tqdm(dataloader):
    with torch.no_grad():
        batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
        embeddings_doc = model(**batch_doc)
    # Split the batch along its first dimension into per-chunk tensors
    ds.extend(torch.unbind(embeddings_doc.to("cpu")))

100%|██████████| 105/105 [00:53<00:00,  1.95it/s]


In [22]:
# Shape of the first chunk's embedding tensor
print(f"ds[0].shape: {ds[0].shape}")

ds[0].shape: torch.Size([804, 128])


In [24]:
# Retrieve the chunk indices that best match the question (best match first).
query = "Explain LLM scaling laws ?"
res = get_results(query)
print("The best audio chunks are: " + str(res))
# display(Audio(audios[res[0]], autoplay=True, rate=16000))

The best audio chunks are: [102, 96, 35, 97, 108, 24, 92, 128, 84, 83]


In [20]:
# Build the prompt: the instruction text first, then each retrieved chunk
# preceded by a text label carrying its index.
content = [
    {
        "type": "text",
        "text": f"Answer the query using the audio files. Say which ones were used to answer. Query: {query}",
    }
]

# Only the five best-ranked chunks are sent to the model.
for chunk_idx in res[:5]:
    label_part = {
        "type": "text",
        "text": f"The following is audio chunk # {chunk_idx}.",
    }
    audio_part = {
        "type": "input_audio",
        "input_audio": {
            "data": audio_to_base64(audios[chunk_idx]),
            "format": "wav",
        },
    }
    content.extend([label_part, audio_part])

# Single user turn; the reply is requested as both text and spoken audio.
user_message = {"role": "user", "content": content}
completion = client.chat.completions.create(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": "ballad", "format": "wav"},
    messages=[user_message],
)

print(f"Query: {query}")
print(f"Answer: {completion.choices[0].message.audio.transcript}")

Query: Explain LLM scaling laws ?
Answer: LLM scaling laws describe how the performance of large language models improves predictably as you scale up three main factors: model size (in parameters), the size of the dataset (in tokens), and the amount of compute (measured in FLOPs). These laws help guide decisions about how large a model should be given a fixed budget of compute resources.

From audio chunks 102 and 108, we learned that for a given amount of compute, you can predict the best size of a model (in terms of parameters) and how it will perform. These scaling laws indicate that scaling up the size of the model, dataset, or compute generally leads to better performance, as long as the scaling is done efficiently.

However, as mentioned in chunk 96 and 97, the laws are not perfectly rigid. Small architectural changes or different model designs can slightly shift the scaling curves, but the general trend remains the same: more compute and data lead to better models.

In summary, 

In [None]:
# The spoken answer comes back as a base64 string in message.audio.data:
# decode it, save it as response.wav, and play the file.
response_audio_b64 = completion.choices[0].message.audio.data
wav_bytes = base64.b64decode(response_audio_b64)

with open("response.wav", mode="wb") as out_file:
    out_file.write(wav_bytes)

display(Audio("response.wav", autoplay=True))