In [1]:
import json
import os
from pprint import pprint

In [2]:
acc_folder = "../../data/en/acc_split"
train_folder = "../../data/en/train_split"

In [3]:
acc_results = dict()
for file in os.listdir(acc_folder):
    acc = os.path.splitext(file)[0]
    data = json.load(open(os.path.join(acc_folder, file)))
    n_samples, dur = 0, 0
    for obj in data:
        n_samples += 1
        dur += obj["duration"]
    acc_results[acc] = {
        "n_samples": n_samples,
        "dur": round(dur / 3600, 2)
    }


In [4]:
pprint(acc_results)

{'au': {'dur': 71.39, 'n_samples': 51108},
 'ca': {'dur': 85.63, 'n_samples': 59342},
 'de': {'dur': 79.02, 'n_samples': 40897},
 'hk': {'dur': 5.78, 'n_samples': 4260},
 'ie': {'dur': 12.98, 'n_samples': 9461},
 'in': {'dur': 148.06, 'n_samples': 99613},
 'ni': {'dur': 7.96, 'n_samples': 5968},
 'nz': {'dur': 15.75, 'n_samples': 11877},
 'ph': {'dur': 7.36, 'n_samples': 5105},
 'sc': {'dur': 24.46, 'n_samples': 15474},
 'sg': {'dur': 4.71, 'n_samples': 3365},
 'uk': {'dur': 178.34, 'n_samples': 134126},
 'us': {'dur': 520.61, 'n_samples': 382626},
 'za': {'dur': 11.58, 'n_samples': 8374}}


In [5]:
train_results = dict()
for file in os.listdir(train_folder):
    prepend, acc = os.path.splitext(file)[0].split("_")
    if acc not in train_results:
        train_results[acc] = dict()
    dur, n_samples = 0, 0
    fname = os.path.join(train_folder, file)
    try:
        for line in open(fname):
            n_samples += 1
            dur += json.loads(line)["duration"]
        train_results
        train_results[acc][prepend] = {
            "n_samples": n_samples,
            "dur": round(dur / 3600, 2)
        }
    except Exception:  # file does not exist
        print(f"{fname} does not exist")
        continue


In [6]:
pprint(train_results)

{'None': {'test': {'dur': 0.0, 'n_samples': 0}},
 'au': {'test': {'dur': 14.36, 'n_samples': 10222},
        'train': {'dur': 57.03, 'n_samples': 40886}},
 'ca': {'test': {'dur': 17.1, 'n_samples': 11868},
        'train': {'dur': 68.53, 'n_samples': 47474}},
 'de': {'test': {'dur': 15.83, 'n_samples': 8179},
        'train': {'dur': 63.19, 'n_samples': 32718}},
 'hk': {'test': {'dur': 5.78, 'n_samples': 4260}},
 'ie': {'test': {'dur': 12.98, 'n_samples': 9461}},
 'in': {'test': {'dur': 29.57, 'n_samples': 19923},
        'train': {'dur': 118.49, 'n_samples': 79690}},
 'ni': {'test': {'dur': 7.96, 'n_samples': 5968}},
 'nz': {'test': {'dur': 15.75, 'n_samples': 11877}},
 'ph': {'test': {'dur': 7.36, 'n_samples': 5105}},
 'sc': {'test': {'dur': 24.46, 'n_samples': 15474}},
 'sg': {'test': {'dur': 4.71, 'n_samples': 3365}},
 'uk': {'test': {'dur': 35.62, 'n_samples': 26825},
        'train': {'dur': 142.72, 'n_samples': 107301}},
 'us': {'test': {'dur': 104.17, 'n_samples': 76525},
     

In [7]:
# compute the distribution of data across accents
hours = [0 for _ in range(len(train_results))]
for i, acc in enumerate(train_results):
    for file in train_results[acc].values():
        hours[i] += file["dur"]

for i, acc in enumerate(train_results):
    print(acc, round(hours[i], 1), round(hours[i] / sum(hours), 2))

uk 178.3 0.15
None 0.0 0.0
ca 85.6 0.07
au 71.4 0.06
in 148.1 0.13
sg 4.7 0.0
sc 24.5 0.02
ni 8.0 0.01
us 520.6 0.44
ph 7.4 0.01
za 11.6 0.01
nz 15.8 0.01
ie 13.0 0.01
de 79.0 0.07
hk 5.8 0.0


In [8]:
# sum all train data that is not US, and compare to the US data
hours = 0
for acc in train_results:
    if "train" in train_results[acc] and acc != "us":
        hours += train_results[acc]["train"]["dur"]
print("accented: " + str(hours))
print("US: " + str(train_results["us"]["train"]["dur"]))

accented: 449.96
US: 416.45


In [11]:
# sum all train data that is not US, and compare to the US data
hours = 0
for acc in train_results:
    if "test" in train_results[acc] and acc != "us":
        hours += train_results[acc]["test"]["dur"]
print("accented: " + str(hours))
print("US: " + str(train_results["us"]["test"]["dur"]))

accented: 203.06000000000003
US: 104.17
