In [7]:
import os
import glob
import pandas as pd
from mit_ast_prob import MIT_AST_model_prob
from tqdm import tqdm


# Initialize the model
model = MIT_AST_model_prob()

def find_wav_files(directory):
    """
    Recursively find all .wav files in the given directory and its subdirectories.
    """
    return glob.glob(os.path.join(directory, '**', '*.wav'), recursive=True)

def filter_and_store_speech_files(directory):
    """
    Iterate over all .wav files in the directory and its subdirectories, 
    classify them, and store the filenames, predicted class, and top 5 classes with their probabilities in a DataFrame.
    """
    wav_files = find_wav_files(directory)
    data = []

    for wav_file in tqdm(wav_files, desc="Processing files"):
        label, top_5_class_probabilities = model.classify(wav_file)
        top_5_str = ", ".join([f"{cls}: {prob:.4f}" for cls, prob in top_5_class_probabilities.items()])
        data.append({
            'filename': wav_file,
            'predicted_class': label,
            'top_5_classes_probs': top_5_str
        })
    
    df = pd.DataFrame(data)
    return df




In [8]:
# let's check the model on the folder with garden sounds

folder_path = '/Users/evgenynazarenko/DACS_3_year/Thesis/GardenFiles23/garden_01012024/0'
speech_df = filter_and_store_speech_files(folder_path)
print(speech_df.head)


Processing files: 100%|██████████| 100/100 [00:14<00:00,  6.99it/s]

<bound method NDFrame.head of                                              filename  \
0   /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   
1   /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   
2   /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   
3   /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   
4   /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   
..                                                ...   
95  /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   
96  /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   
97  /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   
98  /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   
99  /Users/evgenynazarenko/DACS_3_year/Thesis/Gard...   

            predicted_class                                top_5_classes_probs  
0                      Fowl  Fowl: 0.1836, Animal: 0.0955, Quack: 0.0683, G...  
1                      Crow  Crow: 0.2252, Caw: 0.1161, Animal: 0.1031, Bir...  
2                    Animal  Animal: 0.131




In [9]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor

# Load the model and its configuration
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
model = AutoModelForAudioClassification.from_pretrained(model_name)
extractor = AutoFeatureExtractor.from_pretrained(model_name)

# Get the id2label mapping
id2label = model.config.id2label

# Print all class labels
print("All class labels:")
for idx, label in id2label.items():
    print(f"{idx}: {label}")

# Identify and collect human speech-related classes
speech_related_classes = [label for label in id2label.values() if "speech" in label.lower() or "human" in label.lower()]

print("\nHuman speech-related classes:")
for label in speech_related_classes:
    print(label)


All class labels:
0: Speech
1: Male speech, man speaking
2: Female speech, woman speaking
3: Child speech, kid speaking
4: Conversation
5: Narration, monologue
6: Babbling
7: Speech synthesizer
8: Shout
9: Bellow
10: Whoop
11: Yell
12: Battle cry
13: Children shouting
14: Screaming
15: Whispering
16: Laughter
17: Baby laughter
18: Giggle
19: Snicker
20: Belly laugh
21: Chuckle, chortle
22: Crying, sobbing
23: Baby cry, infant cry
24: Whimper
25: Wail, moan
26: Sigh
27: Singing
28: Choir
29: Yodeling
30: Chant
31: Mantra
32: Male singing
33: Female singing
34: Child singing
35: Synthetic singing
36: Rapping
37: Humming
38: Groan
39: Grunt
40: Whistling
41: Breathing
42: Wheeze
43: Snoring
44: Gasp
45: Pant
46: Snort
47: Cough
48: Throat clearing
49: Sneeze
50: Sniff
51: Run
52: Shuffle
53: Walk, footsteps
54: Chewing, mastication
55: Biting
56: Gargling
57: Stomach rumble
58: Burping, eructation
59: Hiccup
60: Fart
61: Hands
62: Finger snapping
63: Clapping
64: Heart sounds, heartbeat
6

In [12]:
print(speech_related_classes)

['Speech', 'Male speech, man speaking', 'Female speech, woman speaking', 'Child speech, kid speaking', 'Speech synthesizer', 'Hubbub, speech noise, speech babble']


In [5]:
speech_df.head()

Unnamed: 0,filename,predicted_class,top_5_classes_probs
0,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Fowl,"Fowl: 0.1836, Animal: 0.0955, Quack: 0.0683, G..."
1,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Crow,"Crow: 0.2252, Caw: 0.1161, Animal: 0.1031, Bir..."
2,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Animal,"Animal: 0.1315, Rustling leaves: 0.1219, Flap:..."
3,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Bicycle,"Bicycle: 0.1228, Door: 0.0898, Speech: 0.0734,..."
4,/Users/evgenynazarenko/DACS_3_year/Thesis/Gard...,Door,"Door: 0.1739, Sliding door: 0.0692, Slam: 0.06..."
