In [42]:
#for a given csv file, iterate through the column filename and find the associated wav files. There should be as many wav files as there are rows in the csv file. Collect the paths to the wav files and the data from the csv file into a pandas dataframe.
import os
import pandas as pd

def collect_wav_and_csv(root_dir):
    data = []

    for subdir, _, files in os.walk(root_dir):
        csv_files = [f for f in files if f.endswith('.csv')]
        
        for csv_file in csv_files:
            csv_path = os.path.join(subdir, csv_file)
            df = pd.read_csv(csv_path)

            for _, row in df.iterrows():
                filename = row['filename']
                wav_path = os.path.join(subdir, filename)
                
                if os.path.exists(wav_path):
                    print(f"Found: {wav_path}")
                    entry = row.to_dict()
                    entry['wav_path'] = wav_path
                    data.append(entry)
                else:
                    print(f"Warning: {wav_path} does not exist.")

    return pd.DataFrame(data)



root_directory = '../Datasets/SmellySongs23K'  # replace with your dataset path
collected_data = collect_wav_and_csv(root_directory)

print("finished")

Found: ../Datasets/SmellySongs23K/000001.wav
Found: ../Datasets/SmellySongs23K/000002.wav
Found: ../Datasets/SmellySongs23K/000003.wav
Found: ../Datasets/SmellySongs23K/000004.wav
Found: ../Datasets/SmellySongs23K/000005.wav
Found: ../Datasets/SmellySongs23K/000006.wav
Found: ../Datasets/SmellySongs23K/000007.wav
Found: ../Datasets/SmellySongs23K/000008.wav
Found: ../Datasets/SmellySongs23K/000009.wav
Found: ../Datasets/SmellySongs23K/000010.wav
Found: ../Datasets/SmellySongs23K/000011.wav
Found: ../Datasets/SmellySongs23K/000012.wav
Found: ../Datasets/SmellySongs23K/000013.wav
Found: ../Datasets/SmellySongs23K/000014.wav
Found: ../Datasets/SmellySongs23K/000015.wav
Found: ../Datasets/SmellySongs23K/000016.wav
Found: ../Datasets/SmellySongs23K/000017.wav
Found: ../Datasets/SmellySongs23K/000018.wav
Found: ../Datasets/SmellySongs23K/000019.wav
Found: ../Datasets/SmellySongs23K/000020.wav
Found: ../Datasets/SmellySongs23K/000021.wav
Found: ../Datasets/SmellySongs23K/000022.wav
Found: ../

In [43]:
#print 5 first entries
print(collected_data.head())


     filename  is_AI                               wav_path
0  000001.wav      0  ../Datasets/SmellySongs23K/000001.wav
1  000002.wav      0  ../Datasets/SmellySongs23K/000002.wav
2  000003.wav      0  ../Datasets/SmellySongs23K/000003.wav
3  000004.wav      0  ../Datasets/SmellySongs23K/000004.wav
4  000005.wav      0  ../Datasets/SmellySongs23K/000005.wav


In [50]:

#create a classifier to predict the 'is_AI' column from the information encoded in the wav files
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import wave
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Extract waveform from wav files using wave
def extract_features(wav_path):
    print(f"Processing: {wav_path}")
    y = wave.open(wav_path, 'rb')
    n_frames = y.getnframes()
    frames = y.readframes(n_frames)
    y.close()
    y = np.frombuffer(frames, dtype=np.int16)
    # return simple features: mean and std
    return np.array([np.mean(y), np.std(y)])
# Prepare feature matrix and labels. pick as many samples as possible, but retain a 50/50 balance between classes. make sure to shuffle the data.
min_class_size = min(collected_data['is_AI'].value_counts())
collected_data2 = pd.concat([collected_data[collected_data['is_AI'] == 0].sample(min_class_size, random_state=42),
                             collected_data[collected_data['is_AI'] == 1].sample(min_class_size, random_state=42)])
collected_data2 = collected_data2.sample(frac=1, random_state=42).reset_index(drop=True)
#print size of collected_data2
print(f"Size of collected_data2: {collected_data2.shape}")

features = []
labels = collected_data2['is_AI'].values
for wav_path in collected_data2['wav_path']:
    feat = extract_features(wav_path)
    features.append(feat)




Size of collected_data2: (9912, 3)
Processing: ../Datasets/SmellySongs23K/017175.wav
Processing: ../Datasets/SmellySongs23K/021943.wav
Processing: ../Datasets/SmellySongs23K/004075.wav
Processing: ../Datasets/SmellySongs23K/007625.wav
Processing: ../Datasets/SmellySongs23K/005703.wav
Processing: ../Datasets/SmellySongs23K/009045.wav
Processing: ../Datasets/SmellySongs23K/017457.wav
Processing: ../Datasets/SmellySongs23K/018715.wav
Processing: ../Datasets/SmellySongs23K/022154.wav
Processing: ../Datasets/SmellySongs23K/012993.wav
Processing: ../Datasets/SmellySongs23K/008963.wav
Processing: ../Datasets/SmellySongs23K/020794.wav
Processing: ../Datasets/SmellySongs23K/016860.wav
Processing: ../Datasets/SmellySongs23K/018355.wav
Processing: ../Datasets/SmellySongs23K/018998.wav
Processing: ../Datasets/SmellySongs23K/010878.wav
Processing: ../Datasets/SmellySongs23K/016422.wav
Processing: ../Datasets/SmellySongs23K/020925.wav
Processing: ../Datasets/SmellySongs23K/021271.wav
Processing: ../

In [51]:
X = np.array(features)
y = labels
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
#print size of training and testing sets
print(f"Size of X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Size of X_test: {X_test.shape}, y_test: {y_test.shape}")
# Create a pipeline with scaling and classifier
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=42))
# Train the classifier
pipeline.fit(X_train, y_train)
# Make predictions
y_pred = pipeline.predict(X_test)
# Print classification report
print(classification_report(y_test, y_pred))  
accuracy = np.mean(y_pred == y_test)
print(f"\n\n\nClassifier Accuracy: {accuracy*100:.2f}%")

Size of X_train: (8920, 2), y_train: (8920,)
Size of X_test: (992, 2), y_test: (992,)
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       506
           1       0.86      0.87      0.86       486

    accuracy                           0.87       992
   macro avg       0.87      0.87      0.87       992
weighted avg       0.87      0.87      0.87       992




Classifier Accuracy: 86.59%
