In [1]:
from typing import Union, Optional, List, Tuple
from pathlib import Path
import os
from torch import Tensor
from torchaudio.datasets.utils import _load_waveform
import torch

import csv
import json
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import Dataset

# Module-level constants. None of them is referenced by the cells visible in this
# notebook, which spell out the 'clips' folder name and the language lists inline.
SAMPLE_RATE = 48000  # presumably the audio sample rate in Hz — TODO confirm against the clips
ALL_LANGUAGES = ["en"] #, "es"]  (a second language, "es", is commented out)
FOLDER_AUDIO = "clips"  # same literal as the per-language sub-folder the cells below list

In [2]:
# Root folder of the FSCIL subset. The cells below expect `<root>/<lang>/clips/<word>/<clip>`
# and write their CSV files into `<root>/` and `<root>/<lang>/`.
# NOTE(review): absolute, machine-specific path with a leading double slash — consider
# making it configurable before sharing the notebook.
root = "//scratch/p306982/data/fscil/FSCIL_subset/"
# Languages whose keywords are written to the base_train / base_val / base_test CSVs.
base_languages = ['en', 'fr', 'ca', 'de', 'rw']
# Languages that each get their own `<lang>/evaluation.csv`.
inc_languages = ['fa', 'eo', 'pt', 'eu', 'pl', 'cy', 'nl', 'ru', 'es', 'it']

In [3]:
# Open one `<lang>/evaluation.csv` per incremental language and write its header.
# The handles are kept open in `files_dict` (and their writers in `writer_dict`)
# because the rows are appended by a later cell; they are closed further down.
files_dict = {}
writer_dict = {}
for lang in inc_languages:
    # newline='' lets the csv module control line endings itself (otherwise extra
    # '\r' characters appear on some platforms); utf-8 is set explicitly because
    # the keywords are not ASCII and the locale default is not guaranteed to cope.
    f = open(os.path.join(root, lang, 'evaluation.csv'), 'w', newline='', encoding='utf-8')
    files_dict[lang] = f
    writer = csv.writer(f)
    writer.writerow(['LINK', 'WORD', 'LANGUAGE'])
    writer_dict[lang] = writer

In [4]:
# Write at most MAX_EVAL_CLIPS clips of every keyword of every incremental language
# to that language's evaluation CSV, and record how many were written per keyword
# in `inc_word_count[lang][word]`.
MAX_EVAL_CLIPS = 200
inc_word_count = {}
for lang in inc_languages:
    inc_word_count[lang] = {}
    clips_root = os.path.join(root, lang, 'clips')
    # os.listdir() returns entries in arbitrary order; sorting makes the selected
    # clips (and the row order of the CSV) reproducible across runs and machines.
    for word in sorted(os.listdir(clips_root)):
        clips = sorted(os.listdir(os.path.join(clips_root, word)))[:MAX_EVAL_CLIPS]
        for clip in clips:
            # LINK is stored relative to the language's clips folder: "<word>/<clip>".
            path = os.path.join(word, clip)
            writer_dict[lang].writerow([path, word, lang])
        inc_word_count[lang][word] = len(clips)

In [5]:
# Close every per-language evaluation CSV; closing flushes the rows still buffered.
for lang in inc_languages:
    handle = files_dict[lang]
    handle.close()

In [7]:
# Open the four top-level split CSVs under `root` and write the shared header to each.
# The handles and writers stay bound at notebook level so other cells can use them.
# newline='' is what the csv module expects of files it writes to, and utf-8 is set
# explicitly so the result does not depend on the machine's locale.
base_train_f = open(os.path.join(root, 'base_train.csv'), 'w', newline='', encoding='utf-8')
base_val_f = open(os.path.join(root, 'base_val.csv'), 'w', newline='', encoding='utf-8')
base_test_f = open(os.path.join(root, 'base_test.csv'), 'w', newline='', encoding='utf-8')
evaluation_f = open(os.path.join(root, 'evaluation.csv'), 'w', newline='', encoding='utf-8')
writer_base_train = csv.writer(base_train_f)
writer_base_val = csv.writer(base_val_f)
writer_base_test = csv.writer(base_test_f)
writer_evaluation = csv.writer(evaluation_f)
header = ['LINK', 'WORD', 'LANGUAGE']
writer_base_train.writerow(header)
writer_base_val.writerow(header)
writer_base_test.writerow(header)
# Last expression of the cell: its return value (characters written) is the cell output.
writer_evaluation.writerow(header)

20

In [10]:
# Close the four top-level split CSVs, in the order train, val, test, evaluation.
for handle in (base_train_f, base_val_f, base_test_f, evaluation_f):
    handle.close()

In [10]:
# Scratch check of the relative "<word>/<clip>" string that os.path.join produces;
# the split-writing cells build their LINK column the same way.
os.path.join('accept', 'test.opus')

'accept/test.opus'

In [11]:
# Gather the keyword folder names of every language, base languages first and
# incremental languages after, into one flat list (duplicates are kept).
languages = base_languages + inc_languages
ALL_WORDS = []
for lang in languages:
    words = os.listdir(os.path.join(root, lang, 'clips'))
    ALL_WORDS.extend(words)

In [9]:
# Build the base-session CSVs: for every keyword of every base language the first
# 500 clips go to base_train.csv, the next 100 to base_val.csv and the next 100 to
# base_test.csv; any further clips are ignored. `base_word_count[lang][word]` records
# how many clips each split received.
#
# The three files are opened (and closed) by this cell itself. In notebook order the
# handles created by the earlier "open" cell have already been closed when this cell
# runs, so writing through them fails with "I/O operation on closed file" on a
# Restart & Run All.
base_word_count = {}
# Per-keyword quota of each split; the dict order is the order the splits are filled.
split_quota = {'train': 500, 'val': 100, 'test': 100}
with open(os.path.join(root, 'base_train.csv'), 'w', newline='', encoding='utf-8') as train_f, \
        open(os.path.join(root, 'base_val.csv'), 'w', newline='', encoding='utf-8') as val_f, \
        open(os.path.join(root, 'base_test.csv'), 'w', newline='', encoding='utf-8') as test_f:
    split_writers = {
        'train': csv.writer(train_f),
        'val': csv.writer(val_f),
        'test': csv.writer(test_f),
    }
    for split_writer in split_writers.values():
        split_writer.writerow(['LINK', 'WORD', 'LANGUAGE'])

    for lang in base_languages:
        base_word_count[lang] = {}
        clips_root = os.path.join(root, lang, 'clips')
        # os.listdir() order is arbitrary; sorting keeps the split assignment
        # reproducible across runs and machines.
        for word in sorted(os.listdir(clips_root)):
            counts = {'train': 0, 'val': 0, 'test': 0}
            base_word_count[lang][word] = counts
            for clip in sorted(os.listdir(os.path.join(clips_root, word))):
                # LINK is stored relative to the language's clips folder.
                path = os.path.join(word, clip)
                # Assign the clip to the first split that still has room.
                for split, quota in split_quota.items():
                    if counts[split] < quota:
                        split_writers[split].writerow([path, word, lang])
                        counts[split] += 1
                        break

In [31]:
# Print every base keyword whose split sizes differ from the full 500/100/100 quota.
expected_counts = {'train': 500, 'val': 100, 'test': 100}
for lang, counts_by_word in base_word_count.items():
    for word, counts in counts_by_word.items():
        if counts != expected_counts:
            print(word)

In [20]:
# Show the per-keyword clip counts of the first incremental language ('fa').
# A bare `len()` raises TypeError; the recorded output of this cell is this dict.
inc_word_count['fa']

{'میخواستم': 200,
 'امیدوارم': 200,
 'تلویزیون': 200,
 'آپارتمان': 200,
 'بینالمللی': 200,
 'بنابراین': 200,
 'اسفندیار': 200,
 'پرسپولیس': 200,
 'بفرمایید': 200,
 'میتوانند': 200}

In [None]:
def generate_mswc_fscil_splits(root: Union[str, Path],
                               languages: Optional[List[str]] = None,
                               visualize: Optional[bool] = False):
    """
    Generate new MSWC splits for a few-shot class-incremental learning (FSCIL) scenario.

    Four CSV files are written into ``root``:

    * ``base_train.csv`` / ``base_val.csv`` / ``base_test.csv``: for every base keyword,
      the first 500 matching rows go to train, the next 100 to validation and the next
      100 to test.
    * ``evaluation.csv``: for every evaluation keyword, the first 200 matching rows
      (intended for 10 sessions of 10-way learning with N support shots per class and
      the remaining samples used as queries).

    Rows are read from ``<root>/<lang>/<lang>_splits.csv`` in file order; rows whose word
    is in neither keyword set, or whose keyword quota is already full, are dropped.
    The keyword sets come from ``get_command_keywords(root, visualize=visualize)``.

    Args
        root (str): Path of the MSWC dataset folder containing the ``<lang>/`` folders.
        languages (List[str]): Languages to read. Defaults to ``['en']``; anything else
            only triggers a warning message, since other languages are not supported yet.
        visualize (bool): Forwarded to ``get_command_keywords``.

    Returns: base_keywords, evaluation_keywords, exactly as returned by
    ``get_command_keywords`` (the counts written to the CSVs are not returned).
    """

    base_keywords, evaluation_keywords = get_command_keywords(root, visualize=visualize)

    if languages is None:
        languages = ['en']

    if languages != ['en']:
        print('Other languages than english are not supported yet.')

    base_train_count = dict.fromkeys(base_keywords, 0)
    base_val_count = dict.fromkeys(base_keywords, 0)
    base_test_count = dict.fromkeys(base_keywords, 0)
    evaluation_count = dict.fromkeys(evaluation_keywords, 0)

    header = ['LINK', 'WORD', 'VALID', 'SPEAKER', 'GENDER']

    def _open_split(filename):
        # newline='' is what the csv module expects of files it writes to; utf-8 is
        # explicit so the output does not depend on the machine's locale.
        return open(os.path.join(root, filename), 'w', newline='', encoding='utf-8')

    # The outputs are opened once, before the language loop, and closed by the `with`
    # even on error. Opening them inside the loop would truncate the files on every
    # language and leave `close()` undefined for an empty `languages` list.
    with _open_split('base_train.csv') as base_train_f, \
            _open_split('base_val.csv') as base_val_f, \
            _open_split('base_test.csv') as base_test_f, \
            _open_split('evaluation.csv') as evaluation_f:
        writer_base_train = csv.writer(base_train_f)
        writer_base_val = csv.writer(base_val_f)
        writer_base_test = csv.writer(base_test_f)
        writer_evaluation = csv.writer(evaluation_f)
        for writer in (writer_base_train, writer_base_val, writer_base_test, writer_evaluation):
            writer.writerow(header)

        # (per-keyword counter, quota, writer), in the order the base splits are filled.
        base_splits = (
            (base_train_count, 500, writer_base_train),
            (base_val_count, 100, writer_base_val),
            (base_test_count, 100, writer_base_test),
        )

        for lang in languages:
            splits_path = os.path.join(root, lang, f'{lang}_splits.csv')
            with open(splits_path, 'r', newline='', encoding='utf-8') as f:
                # csv.reader (rather than str.split) copes with quoted fields that
                # contain commas.
                for row in csv.reader(f):
                    # Skip blank lines and the header row.
                    if not row or row[0] == "SET":
                        continue

                    # The first column (the original MSWC split name) is not used.
                    _original_split, path, word, valid, speaker, gender = row
                    record = [path, word, valid, speaker, gender]

                    if word in base_keywords:
                        # Assign to the first of train / val / test that still has room.
                        for counts, quota, writer in base_splits:
                            if counts[word] < quota:
                                writer.writerow(record)
                                counts[word] += 1
                                break

                    elif word in evaluation_keywords:
                        if evaluation_count[word] < 200:
                            writer_evaluation.writerow(record)
                            evaluation_count[word] += 1

    return base_keywords, evaluation_keywords