In [1]:
import pandas as pd
import os
from pathlib import Path
from glob import glob
from tqdm import tqdm
import torch
from torchaudio import load, info
from datasets import Dataset
from IPython.display import Audio
import subprocess

In [2]:
data_dir = Path("/data/xeno-canto")
mp3_dir = data_dir / "xeno_canto_full"
wav_dir = data_dir / "xeno_canto_full_32000"
reload_dir = Path("/data/reloaded_corr")

sr = 32000
max_frames = 5 * 60 * sr

In [3]:
metadata = Dataset.load_from_disk("meta_xeno_canto")
training_data = metadata.filter(lambda rec: rec["num_frames"] <= max_frames).select_columns(["file", "label"])
metadata

Loading cached processed dataset at /data/Max_workspace/meta_xeno_canto/cache-6b5331f568fbd550.arrow


Dataset({
    features: ['file', 'label', 'num_frames', 'num_channels'],
    num_rows: 681414
})

In [12]:
metadata.filter(lambda rec: rec["num_channels"] > 2)[0]

Loading cached processed dataset at /data/Max_workspace/meta_xeno_canto/cache-a35e87f04add1ee7.arrow


{'file': '/data/xeno-canto/xeno_canto_full_32000/Tyto tenebricosa/XC536934.wav',
 'label': 'A Tyto tenebricosa was recorded in Australia in March at morning. The sound is described as call. The soundquality is 3 out of 5.',
 'num_frames': 4121359,
 'num_channels': 3}

In [19]:
two_channel_files = metadata.filter(lambda rec: rec["num_channels"] == 2)["file"]

Loading cached processed dataset at /data/Max_workspace/meta_xeno_canto/cache-3f3855f11b6cf7a8.arrow


In [21]:
waveform, sr = load(two_channel_files[0])
print(waveform.shape)

torch.Size([2, 148794])


### Old Code

In [5]:
metadata.save_to_disk("meta_xeno_canto", max_shard_size="100MB")

PermissionError: Tried to overwrite /data/Max_workspace/meta_xeno_canto but a dataset can't overwrite itself.

In [23]:
def mapping(record):
    file = record["file"]
    file = file.replace(str(mp3_dir), str(wav_dir))
    file = file.replace(str(reload_dir), str(wav_dir))
    file = file.replace(".mp3", ".wav")
    
    return dict(file = file)
        
metadata = metadata.map(mapping, num_proc=8)

Loading cached processed dataset at /data/Max_workspace/metadata/cache-f7c70da52078d071_*_of_00008.arrow


In [33]:
metadata = metadata.add_column("num_frames", metrics_ds["num_frames"])
metadata = metadata.add_column("num_channels", metrics_ds["num_channels"])

In [192]:
def ffmpeg_call(file_name, out_dir, sample_rate=44100, in_dir=mp3_dir):
    
    in_file = Path(in_dir) / file_name
    out_file = Path(out_dir) / file_name
    
    # create target dir if not exists
    out_file.parent.mkdir(parents=True, exist_ok=True)
    
    result = subprocess.run(["ffmpeg", 
                             "-i", str(in_file), 
                             "-ar", str(sample_rate),
                             str(out_file)],
                            stdout = subprocess.DEVNULL, #supress regular output, errors are still printed
                            stderr = subprocess.STDOUT,
                           )

    return dict(ret_code = result.returncode)

In [202]:
return_codes = result.with_format("pandas")["ret_code"]
return_codes.max() # 0 menas all successful

0

In [196]:
result = corrupt_ds.map(lambda x: ffmpeg_call(x["file"], out_dir=reload_dir), num_proc=8)

                                                                              

In [3]:
meta_data_file = "xeno_canto_full_combined_durations.json"
dataset = pd.read_json(meta_data_file)
df = dataset[["file", "available"]]

In [64]:
def check_available(file):
    file = mp3_dir / file
    if not os.path.isfile(file):
        return False
    elif os.path.getsize(file) == 0:
        print(f"size 0 file @{file}")
        return False
    else:
        return True

result = df.file.apply(check_available)

KeyboardInterrupt: 

In [19]:
(result != df.available).sum()

27

In [24]:
actually_available = [] #false-negative
not_actually_available = [] #false-positive
for i in tqdm(df.index):
    available = df.loc[i, "available"]
    file = df.loc[i, "file"]
    actual = result.loc[i]
    if available and not actual:
        not_actually_available.append(file)
    if not available and actual:
        actually_available.append(file)
        
print(not_actually_available)
print(actually_available)

100%|██████████| 691930/691930 [00:29<00:00, 23858.43it/s]

[]
['Clangula hyemalis/XC195038.mp3', 'Pyrrhula pyrrhula/XC456932.mp3', 'Turdus merula/XC456971.mp3', 'Acrocephalus palustris/XC456981.mp3', 'Spinus spinus/XC456985.mp3', 'Periparus ater/XC457004.mp3', 'Emberiza schoeniclus/XC459261.mp3', 'Phylloscopus collybita/XC465779.mp3', 'Phylloscopus trochilus/XC465780.mp3', 'Anthus trivialis/XC465781.mp3', 'Prunella modularis/XC560551.mp3', 'Regulus regulus/XC594591.mp3', 'Spinus spinus/XC594598.mp3', 'Aegithalos caudatus/XC594602.mp3', 'Henicorhina leucophrys/XC626095.mp3', '_Unknown_/XC626504.mp3', '_Unknown_/XC629687.mp3', 'Lanius borealis/XC632264.mp3', 'Charadrius vociferus/XC652295.mp3', 'Geothlypis tolmiei/XC656203.mp3', 'Pheucticus melanocephalus/XC656229.mp3', 'Bonasa umbellus/XC657341.mp3', 'Empidonax minimus/XC657736.mp3', 'Contopus sordidulus/XC657784.mp3', 'Vireo olivaceus/XC658285.mp3', 'Cardellina pusilla/XC658291.mp3', 'Haliaeetus leucocephalus/XC698517.mp3']





note: recursive file count using
```console
find "xeno_canto_full" -type f | wc -l
```
returned 681441

In [21]:
print(df.shape)
print(df.available.sum())

(691930, 3)
681414
