In [None]:
pip install sonar-space fairseq2 torchaudio

In [None]:
from google.colab import drive

# Mount Google Drive; the clip and embedding paths used later are under this folder.
mount_point = '/content/gdrive'
drive.mount(mount_point)

In [None]:
import torch

# Use the GPU when one is attached; otherwise fall back to the CPU so the
# notebook still runs on runtimes without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Speech

In [None]:
import glob
import torchaudio

In [None]:
def load_audio(path, target_sampling_rate=16000):
    """Load an audio file and resample it to ``target_sampling_rate``.

    Args:
        path: Location of the audio file, passed straight to ``torchaudio.load``.
        target_sampling_rate: Sampling rate of the returned waveform.
            Defaults to 16000.

    Returns:
        The waveform returned by ``torchaudio.functional.resample``.
    """
    waveform, source_rate = torchaudio.load(path)
    # Honour the caller-supplied rate instead of a hard-coded 16000.
    return torchaudio.functional.resample(
        waveform, orig_freq=source_rate, new_freq=target_sampling_rate
    )

In [None]:
# Load every .wav clip matched by the pattern and label it "<parent folder>_<file name>".
clip_pattern = "/content/gdrive/MyDrive/Audio Retrieval/data/clips/*/*.wav"
filenames, audios = [], []
for filepath in glob.glob(clip_pattern):
    path_parts = filepath.split("/")
    # The last two path components are the parent folder and the file name.
    filenames.append(f"{path_parts[-2]}_{path_parts[-1]}")
    audios.append(load_audio(filepath))

In [None]:
# Report how many clips were loaded.
clip_count = len(audios)
print(clip_count)

In [None]:
from sonar.models.sonar_speech.loader import load_sonar_speech_model

# Load the "sonar_speech_encoder_eng" model, requesting `device`, then switch it to eval mode.
speech_encoder_model = load_sonar_speech_model("sonar_speech_encoder_eng", device=device)
speech_encoder_model = speech_encoder_model.eval()

In [None]:
from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline
# Wrap the already-loaded speech encoder in the speech-to-embedding pipeline.
# NOTE(review): no `device` argument is passed here — confirm the pipeline keeps
# the encoder on `device` rather than moving it to a default of its own.
s2vec_model = SpeechToEmbeddingModelPipeline(encoder=speech_encoder_model)

In [None]:
# Embed all loaded clips in a single call; `audios` holds the waveforms returned by load_audio.
# NOTE(review): `embeddings` is reassigned by the text-query cell further down, so
# re-running the speech export after that cell would pick up the query vectors.
embeddings = s2vec_model.predict(audios)

In [None]:
# Display the shape of the result — presumably (number of clips, embedding size); verify.
embeddings.shape

In [None]:
# Map each clip label to its embedding, converted to a plain list so it can be dumped as JSON.
data = {
    clip_name: embeddings[idx].tolist()
    for idx, clip_name in enumerate(filenames)
}

In [None]:
import json
import os

file_path = "/content/gdrive/MyDrive/Audio Retrieval/data/embeddings/sonar/data.json"

# Create the output folder first so the write cannot fail with FileNotFoundError
# after the embeddings have already been computed.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Persist the {clip label: embedding list} mapping as JSON.
with open(file_path, "w") as json_file:
    json.dump(data, json_file)

## Text

In [None]:
from sonar.models.sonar_text import (
    load_sonar_text_encoder_model,
    load_sonar_tokenizer,
)
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline

In [None]:
# The text encoder and its tokenizer are loaded under the same model name.
text_model_name = "text_sonar_basic_encoder"
text_encoder_model = load_sonar_text_encoder_model(text_model_name, device=device)
text_encoder_model = text_encoder_model.eval()
text_tokenizer = load_sonar_tokenizer(text_model_name)

In [None]:
# Combine the loaded text encoder and tokenizer into a text-to-embedding pipeline.
# NOTE(review): no `device` argument is passed here — confirm the pipeline keeps
# the encoder on `device` rather than moving it to a default of its own.
text_embedding_pipeline = TextToEmbeddingModelPipeline(text_encoder_model, text_tokenizer)

In [None]:
# Text queries to embed; each string also becomes a key in the saved query JSON.
queries = [
    "What is the difference between supervised and unsupervised learning?",
    "What is cross-validation?"
]

In [None]:
# Embed the queries, declaring their language as "eng_Latn".
# NOTE(review): this overwrites the `embeddings` variable that held the speech
# embeddings earlier in the notebook.
embeddings = text_embedding_pipeline.predict(queries, source_lang="eng_Latn")

In [None]:
# Map each query string to its embedding, converted to a plain list so it can be dumped as JSON.
data_dict = {
    query: embeddings[idx].tolist()
    for idx, query in enumerate(queries)
}

In [None]:
import json
import os

file_path = "/content/gdrive/MyDrive/Audio Retrieval/data/embeddings/sonar/query.json"

# Create the output folder first so the write cannot fail with FileNotFoundError
# after the embeddings have already been computed.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Persist the {query text: embedding list} mapping as JSON.
with open(file_path, "w") as json_file:
    json.dump(data_dict, json_file)