# **Cree Audio Simple KNN Matcher**

In [14]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from IPython.display import Audio, display
from pathlib import Path
import warnings

In [15]:
warnings.filterwarnings("ignore")

In [16]:
AUDIO_DIR = "../data/audio"
SAMPLE_RATE = 16000
N_MFCC = 13

1. Load and extract MFCC features

In [17]:
def extract_mfcc(path, sr=SAMPLE_RATE, n_mfcc=N_MFCC):
    try:
        y, sr = librosa.load(path, sr=sr)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_mean = np.mean(mfcc, axis=1)
        return mfcc_mean
    except Exception as e:
        print(f"Failed to process {path}: {e}")
        return None

2. Build dataset of features and labels

In [18]:
def build_dataset(audio_dir):
    entries = []
    paths = []
    for fname in os.listdir(audio_dir):
        if fname.endswith(".mp3") or fname.endswith(".wav"):
            fpath = Path(audio_dir) / fname
            label = fname.replace(".mp3", "").replace(".wav", "").replace("_", " ")
            mfcc = extract_mfcc(fpath)
            if mfcc is not None:
                entries.append((label, mfcc))
                paths.append(str(fpath))
    if not entries:
        raise ValueError("No valid audio files found.")
    labels, features = zip(*entries)
    return np.vstack(features), list(labels), paths

In [19]:
# Load dataset
print("Extracting features from audio files...")
features, labels, paths = build_dataset(AUDIO_DIR)
print(f"Loaded {len(labels)} audio samples.")

Extracting features from audio files...
Loaded 975 audio samples.


In [29]:
# array of audio fingerprints (MFCCs)
features

array([[-2.86015045e+02,  7.79737701e+01,  3.37328911e+00, ...,
        -1.66387081e+00,  5.34664154e+00,  2.33658552e+00],
       [-3.10722870e+02,  1.12454071e+02,  7.91447830e+00, ...,
         1.26580238e+00,  3.68362117e+00,  1.10222836e+01],
       [-2.41720184e+02,  1.20512955e+02,  1.91991825e+01, ...,
         1.68058896e+00, -5.42074251e+00,  2.87276030e+00],
       ...,
       [-3.34146362e+02,  1.06140472e+02,  4.96253777e+00, ...,
         5.64592600e+00, -2.37746382e+00,  7.48523712e+00],
       [-2.74082642e+02,  9.88003616e+01,  2.04507351e+01, ...,
         1.87584794e+00, -9.71282925e-03,  3.47740078e+00],
       [-2.84845703e+02,  9.97648697e+01,  2.42921715e+01, ...,
         2.76992273e+00, -7.52713251e+00,  4.94268703e+00]], dtype=float32)

In [30]:
# corresponding word(s) spoken in each audio
labels

['acahkosak',
 'achimoh',
 'achimostamawâw',
 'achimostaw',
 'achimostawihk',
 'achimostawik',
 'achimostawin',
 'achimostawinan',
 'achimostawāw',
 'achimostawēw',
 'achimostawēwak',
 'achimâw',
 'achimēwak',
 'achiwinam',
 'achiwpayin',
 'achiwīpayiw',
 'acim',
 'acimosisak',
 'acosis',
 'acoskēwinis',
 'ahcanis',
 'ahchi',
 'ahchipiko',
 'ahcānisak',
 'ahih',
 'ahihk',
 'ahik',
 'ahin',
 'ahkamēyihtam',
 'ahkamēyimoh',
 'ahkamēyimōk',
 'ahkwachāw',
 'ahkwakamin',
 'ahkwan',
 'ahkwatihtā',
 'ahkwatin',
 'ahkwātisōwin',
 'ahpō',
 'ahtastā',
 "ahtoskēwo' kamik",
 'ahāw',
 'ahēw',
 'ahēwak',
 'akahkway',
 'akahkwayak',
 'akawatamowin',
 'akayāsemoh',
 'akayāsemōwin',
 'akihchike',
 'akihcikēwin',
 'akihta',
 'akihtamaw',
 'akihtāsowin',
 'akihtāsōna',
 'akim',
 'akos',
 'akoskôwahchikē',
 'akotā',
 'akwaha',
 'akwahpisōwin',
 'akwamohcikēwin',
 'akwana',
 'akwanahikana',
 'akwâna',
 'akwānahā',
 'akāwācikēwin',
 'akāwāstēskamok',
 'amaciwēnēkānohtēwin',
 'amisk',
 'amiskosākahikan',
 'a

3. Fit Nearest Neighbors model

This prepares a search engine: “If I give you a new fingerprint, tell me the closest 3 matches.”

In [24]:
knn = NearestNeighbors(n_neighbors=3, metric='cosine')
knn.fit(features)

4. Function to query new audio

In [25]:
def query_audio(audio_path):
    print(f"\n🔎 Querying: {audio_path}")
    vec = extract_mfcc(audio_path).reshape(1, -1)
    distances, indices = knn.kneighbors(vec)
    print("\n🎯 Top Matches:")
    for i, idx in enumerate(indices[0]):
        print(f"{i+1}. {labels[idx]} (distance: {distances[0][i]:.3f})")
        display(Audio(paths[idx]))

In [26]:
query_audio("../data/audio/acim.mp3")


🔎 Querying: ../data/audio/acim.mp3

🎯 Top Matches:
1. acim (distance: 0.000)


2. itwa (distance: 0.001)


3. māci (distance: 0.002)


# **Cree Audio Whisper + KNN Matcher**

In [53]:
import os
import torch
import torchaudio
import librosa
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from transformers import WhisperProcessor, WhisperModel
from IPython.display import Audio, display
import json
import joblib
import streamlit as st

In [32]:
# Set path to audio files
AUDIO_DIR = "../data/audio/" 

Load Audio Files

In [33]:
def load_audio(path, sample_rate=16000):
    wav, sr = torchaudio.load(path)
    if sr != sample_rate:
        wav = torchaudio.functional.resample(wav, sr, sample_rate)
    return wav.mean(dim=0).numpy(), sample_rate  # mono

Load Whisper model and processor


In [34]:
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model = WhisperModel.from_pretrained("openai/whisper-large-v3")
model.eval()
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Extract Whisper embedding (use encoder output)

In [35]:
def extract_whisper_embedding(path):
    audio, sr = load_audio(path)
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    input_features = inputs.input_features.to(model.device)
    with torch.no_grad():
        encoder_out = model.encoder(input_features)[0]
    return encoder_out.mean(dim=1).cpu().numpy().squeeze()  # average embedding

Build dataset (features + labels)

In [36]:
def build_dataset(audio_dir):
    features, labels, paths = [], [], []
    for fname in os.listdir(audio_dir):
        if fname.endswith(('.mp3', '.wav')):
            label = fname.replace(".mp3", "").replace(".wav", "").replace("_", " ")
            path = os.path.join(audio_dir, fname)
            try:
                emb = extract_whisper_embedding(path)
                features.append(emb)
                labels.append(label)
                paths.append(path)
            except:
                print(f"Failed: {fname}")
    return np.stack(features), labels, paths

In [37]:
print("🔄 Extracting embeddings from dataset...")
features, labels, paths = build_dataset(AUDIO_DIR)
print(f"✅ Done. Total samples: {len(labels)}")

🔄 Extracting embeddings from dataset...
✅ Done. Total samples: 975


Fit KNN model

In [45]:
knn = NearestNeighbors(n_neighbors=3, metric="cosine")
knn.fit(features)

Saving features, Labels, Paths and Model

In [46]:
# Save features
np.save("../models/audio/features.npy", features)

# Save labels and paths
with open("../models/audio/labels.json", "w", encoding="utf-8") as f:
    json.dump(labels, f)
with open("../models/audio/paths.json", "w", encoding="utf-8") as f:
    json.dump(paths, f)

# Save KNN model
joblib.dump(knn, "../models/audio/knn_model.pkl")


['../models/audio/knn_model.pkl']

Load saved model + data

In [None]:
features = np.load("../models/audio/features.npy")
with open("../models/audio/labels.json", "r", encoding="utf-8") as f:
    labels = json.load(f)
with open("../models/audio/paths.json", "r", encoding="utf-8") as f:
    paths = json.load(f)

knn = joblib.load("../models/audio/knn_model.pkl")


Search function to match query audio

In [49]:
def query_audio(audio_path, top_k=3):
    print(f"\n🔎 Querying: {audio_path}")
    query_emb = extract_whisper_embedding(audio_path).reshape(1, -1)
    distances, indices = knn.kneighbors(query_emb, n_neighbors=top_k)
    print("\n🎯 Top Matches:")
    for i, idx in enumerate(indices[0]):
        print(f"{i+1}. {labels[idx]} (distance: {distances[0][i]:.2f})")
        display(Audio(paths[idx]))

In [50]:
# ✅ Example usage:
query_audio("../data/audio/akwaha.mp3")


🔎 Querying: ../data/audio/akwaha.mp3

🎯 Top Matches:
1. akwaha (distance: 0.00)


2. mākwā (distance: 0.00)


3. tako (distance: 0.00)


UI with Streamlit

In [54]:
# st.title("🧠 Cree Word Audio Matcher")
# st.markdown("Upload a Cree word audio file (MP3 or WAV) and get the closest known matches.")

# uploaded_file = st.file_uploader("Upload audio file", type=["wav", "mp3"])

# if uploaded_file is not None:
#     with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[-1]) as tmp:
#         tmp.write(uploaded_file.read())
#         tmp_path = tmp.name

#     st.audio(tmp_path)
#     with st.spinner("Analyzing..."):
#         query_emb = extract_whisper_embedding(tmp_path).reshape(1, -1)
#         distances, indices = knn.kneighbors(query_emb, n_neighbors=5)

#     st.success("Top Matches:")
#     for i, idx in enumerate(indices[0]):
#         st.write(f"{i+1}. **{labels[idx]}** (distance: {distances[0][i]:.2f})")
#         st.audio(paths[idx])

2025-06-20 10:07:12.303 
  command:

    streamlit run c:\Users\Hend-PC\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
