In [3]:
import json
from pathlib import Path

from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns

def load_manifest(filepath):
    """Read a JSON-lines manifest and return its records as a list.

    Every non-blank line of the file is parsed with ``json.loads``; lines that
    are empty or contain only whitespace are skipped.

    Args:
        filepath: Location of the manifest (``str`` or ``pathlib.Path``).

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 so non-ASCII text in a manifest is read the
    # same way regardless of the platform's default locale encoding.
    with Path(filepath).open("r", encoding="utf-8") as fin:
        # Iterate the handle directly: lines are streamed instead of being
        # materialised all at once by readlines().
        for line in fin:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data

def get_durations(data):
    """Return the ``"duration"`` value of every item in ``data``, in order.

    Args:
        data: Iterable of mappings that each contain a ``"duration"`` key.

    Returns:
        list: The duration values, one per item.

    Raises:
        KeyError: If an item has no ``"duration"`` key.
    """
    return [entry["duration"] for entry in data]

def save_manifest(data, filepath):
    """Write ``data`` to ``filepath`` as JSON lines.

    Each item is serialized with ``json.dumps`` and written on its own line.
    The destination file is opened in ``"w"`` mode, so existing content is
    replaced.

    Args:
        data: Iterable of JSON-serializable items.
        filepath: Destination path (``str`` or ``pathlib.Path``).
    """
    target = Path(filepath)
    with target.open("w") as fout:
        for record in data:
            fout.write(json.dumps(record) + "\n")

def get_labels(data):
    """Return one list of label tokens per item in ``data``.

    Each item's ``'label'`` string is split on whitespace.

    Args:
        data: Iterable of mappings that each contain a ``'label'`` string.

    Returns:
        list[list[str]]: The split labels, one inner list per item.
    """
    return [entry['label'].split() for entry in data]

def get_concat_labels(data):
    """Return the label tokens of all items in ``data`` as one flat list.

    Each item's ``'label'`` string is split on whitespace and the resulting
    tokens are concatenated in item order.

    Args:
        data: Iterable of mappings that each contain a ``'label'`` string.

    Returns:
        list[str]: Every label token from every item.
    """
    return [token for entry in data for token in entry['label'].split()]

In [7]:
# Collect the training manifests stored in the manifests_draco directory.
# NOTE: this is an absolute, machine-specific location; adjust it when running elsewhere.
manifests_dir = Path("/media/data/projects/NeMo-vad/project/manifests_draco/")
# Match "train" against the file name only (a "train" component in a parent
# directory must not select every file), and sort because glob order is not
# guaranteed.
manifests_list = sorted(manifests_dir.glob("*train*.json"))
print(manifests_list)

[PosixPath('/media/data/projects/NeMo-vad/project/manifests_draco/french_train_40ms.json'), PosixPath('/media/data/projects/NeMo-vad/project/manifests_draco/german_train_40ms.json'), PosixPath('/media/data/projects/NeMo-vad/project/manifests_draco/mandarin_train_40ms.json'), PosixPath('/media/data/projects/NeMo-vad/project/manifests_draco/russian_train_40ms.json'), PosixPath('/media/data/projects/NeMo-vad/project/manifests_draco/spanish_train_40ms.json')]


In [9]:
# Plot the distribution of the "duration" values of each manifest in manifests_list.
durations = []
for manifest in manifests_list:
    data = load_manifest(manifest)
    durations_i = get_durations(data)
    # Keep the pooled durations as well; the accumulator was otherwise left empty.
    durations.extend(durations_i)
    grid = sns.displot(durations_i)
    # Title each figure with its manifest file name so the plots can be told apart.
    grid.set(title=Path(manifest).name, xlabel="duration")

'french_train_40ms.json'

In [2]:
# List the cleaned training manifests as sorted path strings.
train_manifests_dir = Path("../manifests_cleaned/train")
manifests_list = sorted(str(path) for path in train_manifests_dir.glob("*train*.json"))
print(manifests_list)

['../manifests_cleaned/train/ami_train_40ms_local_cleaned.json', '../manifests_cleaned/train/fisher_2004_40ms_local_cleaned_train.json', '../manifests_cleaned/train/fisher_2005_40ms_local_cleaned_train.json', '../manifests_cleaned/train/freesound_train_262h.json', '../manifests_cleaned/train/french_train_40ms_local_cleaned.json', '../manifests_cleaned/train/german_train_40ms_local_cleaned.json', '../manifests_cleaned/train/icsi_all_40ms_local_train_cleaned.json', '../manifests_cleaned/train/mandarin_train_40ms_local_cleaned.json', '../manifests_cleaned/train/musan_train_44h.json', '../manifests_cleaned/train/russian_train_40ms_local_cleaned.json', '../manifests_cleaned/train/spanish_train_40ms_local_cleaned.json']


In [4]:
from collections import Counter

# Pool the records and the individual label tokens of every listed manifest.
all_data = []
all_labels = []

for manifest in manifests_list:
    data = load_manifest(manifest)
    all_data.extend(data)
    all_labels.extend(get_concat_labels(data))

# Count the '0' and '1' tokens; any other token is left out of `total`.
counter = Counter(all_labels)
zeros, ones = counter['0'], counter['1']
total = zeros + ones

# First line: share of each class; second line: the reciprocal of each share.
print(zeros / total, ones / total)
print(total / zeros, total / ones)

# Left disabled: these lines would write the pooled records to a single manifest file.
# output_file = "../manifests_cleaned/train_manifest.json"
# save_manifest(all_data, output_file)
# print("Done")

0.421594333078971 0.578405666921029
2.3719483909967187 1.7288903916228957


In [5]:
# Load the combined training manifest and split each record's label string into tokens.
TRAIN_MANIFEST_PATH = "../manifests_cleaned/train_manifest.json"
train_data = load_manifest(TRAIN_MANIFEST_PATH)
train_labels = get_labels(train_data)

In [7]:
from collections import Counter
# Flatten the per-record label lists into a single list of tokens.
all_train_labels = []
for labels in train_labels:
    all_train_labels.extend(labels)

# Count the '0' and '1' tokens; any other token is left out of `total`.
counter = Counter(all_train_labels)
zeros, ones = counter['0'], counter['1']
total = zeros + ones

# First line: share of each class; second line: the reciprocal of each share.
print(zeros / total, ones / total)
print(total / zeros, total / ones)

0.18365361504543226 0.8163463849545677
5.44503302999301 1.224970206802169
