In [None]:
# Environment setup: every `!` line below is run as a shell command.
# System packages installed through apt.
!apt-get install poppler-utils ffmpeg
# fsspec is pinned to one specific release.
!pip install fsspec==2023.9.2
# colpali is installed straight from its GitHub repository (no version pin).
!pip install git+https://github.com/illuin-tech/colpali
!pip install pdf2image av
!pip install openai
# Disabled installs, kept for reference:
# !pip install --no-deps fast-plaid fastkmeans
# !pip install torchvision --upgrade
# flash-attn is pinned and installed with pip's build isolation turned off.
!pip install flash-attn==2.7.3 --no-build-isolation
!pip install moviepy pydub
# yt-dlp is force-reinstalled from the master-branch archive instead of a release.
!pip install --force-reinstall https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz

In [None]:
from google.colab import files, userdata
from pydub import AudioSegment
import numpy as np
import io
from scipy.io import wavfile
import io
import base64
from scipy.io.wavfile import write
import requests
from moviepy.editor import VideoFileClip
from openai import OpenAI
from IPython.display import Audio, display


# Read the OpenAI key from the Colab `userdata` store (entry named 'OPENAI')
# instead of hard-coding it, then build the API client used later on.
api_key = userdata.get('OPENAI')
client = OpenAI(api_key=api_key)

In [3]:
# uploaded = files.upload()  # Choose your .mp4 file
# # convert to WAV format
# video = VideoFileClip("<local_file>.mp4")
# audio = video.audio
# audio.write_audiofile("audio.wav")

In [4]:
# Download a YouTube video from its URL with yt-dlp, keeping only the audio
# track converted to WAV. With the "audio.%(ext)s" output template the final
# file is audio.wav (see the [ExtractAudio] line in the cell output).
# Alternative source video (disabled):
# !yt-dlp https://www.youtube.com/watch?v=9vM4p9NN0Ts --extract-audio --audio-format wav -o "audio.%(ext)s"
!yt-dlp https://www.youtube.com/watch?v=lsbcN9-jU1Y --extract-audio --audio-format wav -o "audio.%(ext)s"

[youtube] Extracting URL: https://www.youtube.com/watch?v=lsbcN9-jU1Y
[youtube] lsbcN9-jU1Y: Downloading webpage
[youtube] lsbcN9-jU1Y: Downloading tv client config
[youtube] lsbcN9-jU1Y: Downloading player 69b31e11-main
[youtube] lsbcN9-jU1Y: Downloading tv player API JSON
[youtube] lsbcN9-jU1Y: Downloading ios player API JSON
[youtube] lsbcN9-jU1Y: Downloading m3u8 information
[info] lsbcN9-jU1Y: Downloading 1 format(s): 251
[download] Destination: audio.webm
[K[download] 100% of   27.26MiB in [1;37m00:00:01[0m at [0;32m17.07MiB/s[0m
[ExtractAudio] Destination: audio.wav
Deleting original file audio.webm (pass -k to keep)


In [5]:
# Cut the downloaded recording into 30-second mono pieces resampled to
# 16 kHz, keeping each piece in memory as an array of WAV samples.

target_rate = 16000            # frame rate (Hz) every chunk is converted to
chunk_length_ms = 30 * 1000    # 30 seconds, expressed in milliseconds

# Load original audio
audio = AudioSegment.from_wav("audio.wav")


def _segment_to_samples(segment, frame_rate):
    """Downmix ``segment`` to mono, resample it, and return its WAV samples."""
    converted = segment.set_channels(1).set_frame_rate(frame_rate)

    # Round-trip through an in-memory WAV file so scipy can parse the samples.
    wav_buffer = io.BytesIO()
    converted.export(wav_buffer, format="wav")
    wav_buffer.seek(0)

    _, samples = wavfile.read(wav_buffer)
    return samples


# One entry per 30-second window; the last window may be shorter.
audios = [
    _segment_to_samples(audio[start:start + chunk_length_ms], target_rate)
    for start in range(0, len(audio), chunk_length_ms)
]

print(f"Number of chunks: {len(audios)}")

Number of chunks: 57


In [None]:
# Sanity check: play back one chunk to confirm it still sounds normal at 16 kHz.
sample_player = Audio(audios[23], autoplay=False, rate=16000)
display(sample_player)

In [None]:
import torch
from PIL import Image
from transformers.utils.import_utils import is_flash_attn_2_available

from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor


# Request FlashAttention 2 only when transformers reports it as available;
# otherwise pass None (presumably selecting the default attention — TODO confirm).
attn_impl = "flash_attention_2" if is_flash_attn_2_available() else None

# Retrieval model: loaded in bfloat16 on the GPU, then switched to eval mode.
model = ColQwen2_5Omni.from_pretrained(
    "vidore/colqwen-omni-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="cuda",  # or "mps" if on Apple Silicon
    attn_implementation=attn_impl,
)
model = model.eval()

# NOTE(review): the processor comes from the "manu/..." repository while the
# weights come from "vidore/..." — confirm this difference is intentional.
processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")

In [8]:
def audio_to_base64(data, rate=16000):
    """Serialize audio samples as an in-memory WAV file and base64-encode it.

    Args:
        data: Sample array accepted by ``scipy.io.wavfile.write``.
        rate: Sample rate in Hz written to the WAV header (default 16000).

    Returns:
        str: The complete WAV file content, base64-encoded and decoded as UTF-8.
    """
    wav_buffer = io.BytesIO()
    write(wav_buffer, rate, data)

    # getvalue() yields the whole buffer regardless of the stream position.
    wav_bytes = wav_buffer.getvalue()
    return base64.b64encode(wav_bytes).decode("utf-8")

def get_results(query: str, k=10, corpus_embeddings=None):
    """Return the indices of the corpus items that score highest for ``query``.

    Args:
        query: Query text to embed with the retrieval model.
        k: Maximum number of indices to return. It is clamped to the number
            of scored items, so asking for more results than the corpus
            holds no longer raises inside ``topk``.
        corpus_embeddings: Optional list of document embeddings to search.
            When omitted, the notebook-level ``ds`` list is used, which
            matches the original behavior.

    Returns:
        list[int]: Indices into the corpus, highest score first.
    """
    if corpus_embeddings is None:
        # Resolved at call time: `ds` is (re)built by later cells, so this
        # picks up whichever corpus was embedded most recently.
        corpus_embeddings = ds

    batch_queries = processor.process_queries([query]).to(model.device)

    # Forward pass (inference only, no gradients needed)
    with torch.no_grad():
        query_embeddings = model(**batch_queries)

    scores = processor.score_multi_vector(query_embeddings, corpus_embeddings)

    # A single query was submitted, so only the first row of scores is relevant.
    query_scores = scores[0]
    top_k = min(k, query_scores.numel())
    return query_scores.topk(top_k).indices.tolist()

### 1. Embedding the audio corpus (offline)

In [33]:
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader

# Embed every audio chunk with the retrieval model, four chunks per batch.
dataloader = DataLoader(
    audios,
    batch_size=4,
    shuffle=False,
    collate_fn=processor.process_audios,
)

# `ds` collects one embedding tensor per chunk, in the same order as `audios`.
ds = []
for audio_batch in tqdm(dataloader):
    with torch.no_grad():
        model_inputs = {
            name: tensor.to(model.device) for name, tensor in audio_batch.items()
        }
        batch_embeddings = model(**model_inputs)
    # Move the batch to the CPU and split it along its first dimension.
    ds.extend(torch.unbind(batch_embeddings.to("cpu")))

100%|██████████| 15/15 [00:08<00:00,  1.74it/s]


In [34]:
# Inspect the shape of the first stored embedding tensor in `ds`.
print("ds[0].shape:", ds[0].shape)

ds[0].shape: torch.Size([804, 128])


### 2. Embedding the query and matching the most relevant audio chunks

In [35]:
# Ask a sample question and retrieve the indices of the best-matching audio
# chunks, using get_results with its default number of results.
query = "Was Hannibal well liked by his men?"
res = get_results(query)
print(f"The best audio chunks are: {res}")
# Optional: listen to the top-ranked chunk.
# display(Audio(audios[res[0]], autoplay=False, rate=16000))

The best audio chunks are: [26, 49, 42, 52, 51, 45, 44, 41, 43, 30]


In [26]:
# Build a multimodal prompt: the instruction comes first, followed by the
# five best-ranked chunks, each preceded by a short text label so the model
# can say which chunk it relied on.
content = [
    {
        "type": "text",
        "text": f"Answer the query using the audio files. Say which ones were used to answer. Query: {query}",
    }
]

for i in res[:5]:
    label_part = {
        "type": "text",
        "text": f"The following is audio chunk # {i}.",
    }
    audio_part = {
        "type": "input_audio",
        "input_audio": {
            "data": audio_to_base64(audios[i]),
            "format": "wav",
        },
    }
    content.extend([label_part, audio_part])

# Single user turn; both text and audio output modalities are requested.
messages = [{"role": "user", "content": content}]
completion = client.chat.completions.create(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": "ballad", "format": "wav"},
    messages=messages,
)

print(f"Query: {query}")
print(f"Answer: {completion.choices[0].message.audio.transcript}")

Query: Was Hannibal well liked by his men?
Answer: Based on the information from audio chunk 26, it mentions that Hannibal's men readily accepted him as their leader and that he had their total respect. This directly indicates that he was well liked by his men. Therefore, only the information from chunk 26 was used to answer the query.


In [None]:
# The spoken answer is expected as a base64 string on the completion message:
# decode it, save it as a WAV file, and show an inline player for that file.
encoded_audio = completion.choices[0].message.audio.data
wav_bytes = base64.b64decode(encoded_audio)

with open("response.wav", "wb") as wav_file:
    wav_file.write(wav_bytes)

display(Audio("response.wav", autoplay=False))

# And videos?

In [28]:
# Beware of OOM: keep this list short.
# Sample clips referenced by URL, one entry per video to embed.
videos = [
    "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerEscapes.mp4",
    "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4",
    "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4",
]

In [29]:
# Embed the videos one at a time (batch_size=1).
dataloader = DataLoader(
    videos,
    batch_size=1,
    shuffle=False,
    collate_fn=processor.process_videos,
)

# `ds` is rebuilt here, so from this point on it holds video embeddings,
# in the same order as `videos`, instead of the audio-chunk embeddings.
ds = []
for video_batch in tqdm(dataloader):
    with torch.no_grad():
        model_inputs = {
            name: tensor.to(model.device) for name, tensor in video_batch.items()
        }
        batch_embeddings = model(**model_inputs)
    # Move the batch to the CPU and split it along its first dimension.
    ds.extend(torch.unbind(batch_embeddings.to("cpu")))

  0%|          | 0/3 [00:00<?, ?it/s]Unused or unrecognized kwargs: images.
 33%|███▎      | 1/3 [00:05<00:10,  5.31s/it]Unused or unrecognized kwargs: images.
 67%|██████▋   | 2/3 [00:09<00:04,  4.58s/it]Unused or unrecognized kwargs: images.
100%|██████████| 3/3 [00:13<00:00,  4.41s/it]


In [None]:
from IPython.display import Video

queries = ["A dragon spitting fire"]

# Embed the text queries with the same model that embedded the videos.
batch_queries = processor.process_queries(queries).to(model.device)
with torch.no_grad():
    query_embeddings = model(**batch_queries)

# Score the queries against every video embedding and keep the index of the
# highest-scoring video for the first (and only) query.
scores = processor.score_multi_vector(query_embeddings, ds)
best_video_idx = int(scores[0].argmax())
# print(f"The best video is video #{scores[0].argmax()}")

# Last expression of the cell, so the player is rendered as the cell output.
Video(videos[best_video_idx], width=800)