In [1]:
from tqdm import tqdm
from pathlib import Path
import json
import re
import numpy as np
import torchaudio
from IPython.display import Audio, display
import textgrids
import IPython.display as ipd

In [2]:
# Number of examples kept per keyword in the support set: collection for a
# keyword stops once its count reaches K, and keywords whose final count is
# not exactly K are dropped afterwards.
K = 5

In [3]:
# Directory holding the English Flickr audio data; the CTM alignment file
# (`flickr_8k.ctm`) is read from here.
aud_files = Path("../../Datasets/flickr_audio")
# Output directory for the saved keyword clips and the `support_set` index.
save_dir = Path('../support_set')
# Create the output directory (and any missing parents); no error if it already exists.
save_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the test keywords, one per line. Each keyword is whitespace-normalised
# (leading/trailing whitespace removed, internal runs collapsed to one space).
# Blank lines are skipped so an empty string never ends up in the vocabulary,
# where it would later fail the translation lookup.
vocab = []
with open('../data/test_keywords.txt', 'r') as f:
    for keyword in f:
        normalised = ' '.join(keyword.split())
        if normalised:
            vocab.append(normalised)

In [5]:
# Parse the English forced-alignment CTM file into
#     alignments[wav][keyword] = (start_seconds, end_seconds)
# Only labels that are in `vocab` are kept, and only the FIRST occurrence of a
# keyword within an utterance is recorded.
alignments = {}
vocab_lookup = set(vocab)  # O(1) membership test instead of scanning the list per CTM line
with open(aud_files / 'flickr_8k.ctm', 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines instead of failing the 5-way unpack below
        # Five whitespace-separated columns; the second one is not used.
        name, _, start, dur, label = fields
        label = label.lower()
        if label not in vocab_lookup:
            continue
        # Utterance key: text before the first '.' joined by '_' to the text after the last '#'
        # (presumably `<image>.jpg#<n>` -> `<image>_<n>`; confirm against the CTM file).
        wav = name.split('.')[0] + '_' + name.split('#')[-1]
        begin = float(start)
        # setdefault keeps the first span seen for this (utterance, keyword) pair.
        alignments.setdefault(wav, {}).setdefault(label, (begin, begin + float(dur)))

In [6]:
# Build the English -> Yoruba keyword table, then scan every Yoruba TextGrid
# for those Yoruba keywords.
#
# Produces:
#   translation[english_keyword]  = yoruba_keyword   (only for keywords in `vocab`)
#   yoruba_vocab                  = list of the Yoruba keywords, in file order
#   yoruba_alignments[wav][label] = (start_seconds, end_seconds) of the FIRST
#                                   occurrence of `label` in utterance `wav`
#   label_counts[label]           = total number of occurrences over all TextGrids
yoruba_alignments = {}
translation = {}
yoruba_vocab = []
# The file contains Yoruba diacritics; decode explicitly rather than relying on
# the platform default encoding (assumes the file is UTF-8 - confirm).
with open(Path('../../Datasets/yfacc_v6/Flickr8k_text/eng_yoruba_keywords.txt'), 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing the 2-way unpack below
        e, y = line.strip().split(', ')
        if e in vocab:
            translation[e] = y
            yoruba_vocab.append(y)

print(yoruba_vocab)

label_counts = {}
for txt_grid in Path('../../Datasets/yfacc_v6/Flickr8k_alignment').rglob('*.TextGrid'):
    # This one TextGrid is deliberately excluded (NOTE(review): the reason is not
    # recorded here). It is matched by file name so the check does not depend on
    # the path separator or on how deep rglob found the file.
    if txt_grid.name == '3187395715_f2940c2b72_0.TextGrid': continue
    grid = textgrids.TextGrid(txt_grid)
    wav = txt_grid.stem

    for interval in grid['words']:
        # Read the interval's fields directly instead of string-splitting its
        # repr, which mis-parses any label that itself contains a double quote.
        label = str(interval.text)
        if label not in yoruba_vocab:
            continue

        spans = yoruba_alignments.setdefault(wav, {})
        if label not in spans:
            # The second value is the interval END time (xmax), not a duration.
            spans[label] = (float(interval.xmin), float(interval.xmax))
        label_counts[label] = label_counts.get(label, 0) + 1
        # Lists the utterances containing this keyword (kept so the recorded output stays valid).
        if label == 'ọmọ': print(wav)

['ọmọ', 'kẹ̀kẹ́', 'àwọn ajá', 'pápá', 'omi']
3220161734_77f42734b9_0
244571201_0339d8e8d1_0
229862312_1a0ba19dab_0
3143982558_9e2d44c155_0
191003285_edd8d0cf58_0
2901074943_041aba4607_0
2228022180_9597b2a458_0
2128119486_4407061c40_0
2393264648_a280744f97_0
1220401002_3f44b1f3f7_0
2934359101_cdf57442dc_0
2647049174_0fb47cee2e_0
2274992140_bb9e868bb8_0
2274992140_bb9e868bb8_0
2666205903_8d287669e1_0
2966552760_e65b22cd26_0
3201427741_3033f5b625_0
2616643090_4f2d2d1a44_0
2587818583_4aa8e7b174_0
2683963310_20dcd5e566_0
3025549604_38b86198f5_0
2652522323_9218afd8c2_0
2549968784_39bfbe44f9_0
2706766641_a9df81969d_0
2667015110_1670324a33_0
2206960564_325ed0c7ae_0
3030566410_393c36a6c5_0
2757803246_8aa3499d26_0
2608289957_044849f73e_0


In [7]:
# Show each English keyword next to its Yoruba translation.
for eng_kw, yor_kw in translation.items():
    print(eng_kw, yor_kw)

baby ọmọ
bike kẹ̀kẹ́
dogs àwọn ajá
field pápá
water omi


In [8]:
# Show how many times each Yoruba keyword was found in the TextGrids.
for yor_kw, n_found in label_counts.items():
    print(yor_kw, n_found)

omi 50
àwọn ajá 15
pápá 16
kẹ̀kẹ́ 27
ọmọ 29


In [9]:
# Yoruba keywords that were found at least once in the alignments.
set(yoruba_vocab) & set(label_counts)

{'kẹ̀kẹ́', 'omi', 'pápá', 'àwọn ajá', 'ọmọ'}

In [10]:
# Yoruba keywords that were never found in the alignments.
set(yoruba_vocab).difference(label_counts)

set()

In [11]:
support_set = {}

##################################
# Support set 
##################################
# Interactively build a K-shot support set of paired English / Yoruba keyword
# clips. For every keyword, candidate utterances are visited in random order;
# the English and Yoruba keyword segments are played back and the user answers
# 'y' to accept the pair. Accepted clips are written under `save_dir` and the
# index is re-saved after every acceptance, so an interrupted session can be
# resumed by re-running this cell.
#
# Each support_set entry is the 9-tuple
#   (wav, image, yoruba_wav, start, end, word, yoruba_start, yoruba_end, yoruba_word)
#
# NOTE(review): the visiting order uses np.random without a seed, so the
# candidates offered differ between runs.

# Sample rates used to convert alignment times (seconds) into frame offsets.
# NOTE(review): assumed to match the audio files - confirm.
ENGLISH_SR = 16000
YORUBA_SR = 48000

fn = Path('../data/test.json')
with open(fn, 'r') as f:
    train = json.load(f)  # loaded from test.json; maps keyword -> (image, english wav, yoruba wav) triples
word_counts = {}
word_names = {}
yoruba_word_counts = {}

support_set_file = save_dir / 'support_set.npz'
if support_set_file.is_file():
    # SECURITY: allow_pickle=True unpickles arbitrary objects; only load an
    # index file that this notebook itself wrote.
    support_set = np.load(support_set_file, allow_pickle=True)['support_set'].item()
    for name, entry in list(support_set.items()):
        word = entry[5]      # English keyword (entry[-2] is the Yoruba end time, not the word)
        y_word = entry[-1]   # Yoruba keyword
        if word_counts.get(word, 0) >= K:
            # Already have K examples of this keyword: drop the surplus entry
            # exactly once and do not count it.
            support_set.pop(name)
            continue
        word_counts[word] = word_counts.get(word, 0) + 1
        yoruba_word_counts[y_word] = yoruba_word_counts.get(y_word, 0) + 1
        word_names.setdefault(word, []).append(name)

for word in vocab:
    if word not in translation or word not in train:
        print(f'skipping {word}: no translation or no candidate utterances')
        continue
    y_word = translation[word]

    # Candidates: at most one utterance per image, with an English alignment,
    # a Yoruba alignment, and not already part of the support set.
    filtered = []
    images = set()
    for i, e, y in train[word]:
        e_name = Path(e).stem
        y_name = '_'.join(Path(y).stem.split('_')[1:])  # Yoruba stem without its first '_'-separated field
        if e_name not in alignments or e_name in support_set or y_name not in yoruba_alignments:
            continue
        if i in images:
            continue
        filtered.append((i, e, y))
        images.add(i)

    instances = np.arange(0, len(filtered))
    np.random.shuffle(instances)

    # Resume from however many examples of this keyword were already collected.
    count = word_counts.get(word, 0)

    for im, wav, yor in [filtered[i] for i in instances]:
        if count >= K: break
        name = Path(wav).stem
        if name in support_set or name not in yoruba_alignments: continue
        # The keyword must actually be aligned in BOTH languages for this utterance.
        if word not in alignments[name] or y_word not in yoruba_alignments[name]: continue

        start, end = alignments[name][word]
        y_start, y_end = yoruba_alignments[name][y_word]

        aud, sr = torchaudio.load(
            Path('..') / wav,
            frame_offset=int(float(start) * ENGLISH_SR),
            num_frames=int((float(end) - float(start)) * ENGLISH_SR),
        )
        y_aud, y_sr = torchaudio.load(
            Path('..') / yor,
            frame_offset=int(float(y_start) * YORUBA_SR),
            num_frames=int((float(y_end) - float(y_start)) * YORUBA_SR),
        )
        # Skip pairs where either segment came back empty: there is nothing to
        # audition and an empty clip is useless in the support set.
        if aud.shape[-1] == 0 or y_aud.shape[-1] == 0: continue

        ipd.display(ipd.Audio(aud.squeeze().numpy(), rate=sr))
        ipd.display(ipd.Audio(y_aud.squeeze().numpy(), rate=y_sr))

        ans = input(f'{count} / {K} {word}({y_word}): ')
        if ans != 'y': continue

        # English clip: <save_dir>/<stem of the wav's parent directory>/<utterance>_<word>.wav
        out_path = save_dir / Path(wav).parent.stem / Path(name + '_' + word + '.wav')
        out_path.parent.mkdir(parents=True, exist_ok=True)
        torchaudio.save(out_path, aud, sr)

        # Yoruba clip: <save_dir>/wavs/<yoruba utterance>_<yoruba word>.wav
        out_path = save_dir / Path('wavs') / Path(Path(yor).stem + '_' + y_word + '.wav')
        out_path.parent.mkdir(parents=True, exist_ok=True)
        torchaudio.save(out_path, y_aud, y_sr)

        support_set[name] = (wav, im, yor, start, end, word, y_start, y_end, y_word)

        # Persist after every accepted pair so progress survives an interruption.
        np.savez_compressed(
            save_dir / Path('support_set'), 
            support_set=support_set
            )
        count += 1

0 / 5 baby(ọmọ): y


1 / 5 baby(ọmọ): y


2 / 5 baby(ọmọ): y


3 / 5 baby(ọmọ): y


4 / 5 baby(ọmọ): y


0 / 5 bike(kẹ̀kẹ́): y


1 / 5 bike(kẹ̀kẹ́): y


2 / 5 bike(kẹ̀kẹ́): y


3 / 5 bike(kẹ̀kẹ́): y


4 / 5 bike(kẹ̀kẹ́): y


0 / 5 dogs(àwọn ajá): y


1 / 5 dogs(àwọn ajá): y


2 / 5 dogs(àwọn ajá): y


3 / 5 dogs(àwọn ajá): y


4 / 5 dogs(àwọn ajá): y


0 / 5 field(pápá): y


1 / 5 field(pápá): y


2 / 5 field(pápá): y


3 / 5 field(pápá): y


4 / 5 field(pápá): y


0 / 5 water(omi): y


1 / 5 water(omi): y


2 / 5 water(omi): y


3 / 5 water(omi): y


4 / 5 water(omi): y


In [12]:
# Tally support-set entries by the last field of each entry.
count = {}
for entry in support_set.values():
    count[entry[-1]] = count.get(entry[-1], 0) + 1

# Keys whose tally is not exactly K.
remove = [w for w in count if count[w] != K]
print(remove)

# Delete every entry belonging to one of those keys. Iterate over a snapshot
# of the names because the dict is modified inside the loop.
names = list(support_set.keys())
for name in names:
    if support_set[name][-1] in remove:
        del support_set[name]

[]


In [13]:
# Write the support-set index to `support_set` inside `save_dir`.
np.savez_compressed(save_dir / 'support_set', support_set=support_set)

In [14]:
# Print only the first support-set entry, as a sample of the record layout.
for entry in support_set.values():
    print(entry)
    break

('../Datasets/flickr_audio/wavs/2608289957_044849f73e_0.wav', '../Datasets/Flicker8k_Dataset/2608289957_044849f73e.jpg', '../Datasets/yfacc_v6/flickr_audio_yoruba_test/S001_2608289957_044849f73e_0.wav', 0.61, 0.95, 'baby', 0.8845192002538877, 1.123630593462393, 'ọmọ')


In [15]:
# List the key of every support-set entry, one per line.
for key in support_set.keys():
    print(key)

2608289957_044849f73e_0
2683963310_20dcd5e566_0
2128119486_4407061c40_0
3143982558_9e2d44c155_0
2616643090_4f2d2d1a44_0
136552115_6dc3e7231c_0
2544182005_3aa1332bf9_0
2891617125_f939f604c7_0
3192069971_83c5a90b4c_0
2084217208_7bd9bc85e5_0
263854883_0f320c1562_0
2782433864_5a0c311d87_0
293881927_ac62900fd4_0
244571201_0339d8e8d1_0
315880837_90db309bab_0
2295750198_6d152d7ceb_0
270724499_107481c88f_0
2414397449_2ac3b78e0d_0
249394748_2e4acfbbb5_0
2909875716_25c8652614_0
2453971388_76616b6a82_0
2900274587_f2cbca4c58_0
2748729903_3c7c920c4d_0
1772859261_236c09b861_0
2533642917_a5eace85e6_0


In [16]:
# Recount support-set entries by the last field of each entry.
count = {}
for entry in support_set.values():
    w = entry[-1]
    count[w] = count.get(w, 0) + 1

In [17]:
# Print each key with its tally.
for w, n_entries in count.items():
    print(w, n_entries)

ọmọ 5
kẹ̀kẹ́ 5
àwọn ajá 5
pápá 5
omi 5
