In [None]:
from datasets import load_dataset, Audio
import soundfile as sf
import torch
import torchaudio
import librosa
import os
import csv
import re

In [None]:
# Placeholder base path; a later cell assigns DATA_DIR again (same value) before it is read.
DATA_DIR = "/YOUR_PATH_TO_DATA/KsponSpeech/"
# NOTE(review): TRAIN_DIR is not read by any cell visible here; a later cell defines
# AUDIO_DIR with the same value and uses that instead — confirm whether this is still needed.
TRAIN_DIR = "KsponSpeech_data/"

In [None]:
# Thanks to https://github.com/sooftware/kospeech and https://github.com/sooftware/openspeech 
# @ARTICLE{2021-kospeech,
#   author    = {Kim, Soohwan and Bae, Seyoung and Won, Cheolhwang},
#   title     = {KoSpeech: Open-Source Toolkit for End-to-End Korean Speech Recognition},
#   url       = {https://www.sciencedirect.com/science/article/pii/S2665963821000026},
#   month     = {February},
#   year      = {2021},
#   publisher = {ELSEVIER},
#   journal   = {SIMPAC},
#   pages     = {Volume 7, 100054}
# }

def bracket_filter(sentence, mode='phonetic'):
    """Resolve parenthesised alternatives in a transcript to a single reading.

    Presumably written for dual transcriptions of the form
    ``(spelling)/(phonetic)`` — verify against the corpus documentation.

    * ``'phonetic'``: every '(' toggles a "skip" state that starts off, so the
      text from an opening '(' up to the next '(' is dropped; ')' characters
      are never copied.
    * ``'spelling'``: every ')' toggles a "keep" state that starts on, so the
      text between a ')' and the following ')' is dropped; '(' characters are
      never copied.

    Args:
        sentence: Raw transcript text.
        mode: ``'phonetic'`` or ``'spelling'``.

    Returns:
        The transcript with the parenthesised groups reduced as described.

    Raises:
        ValueError: If ``mode`` is neither ``'phonetic'`` nor ``'spelling'``.
    """
    kept = []

    if mode == 'phonetic':
        skipping = False
        for char in sentence:
            if char == '(':
                # Each opening parenthesis flips between skipping and copying.
                skipping = not skipping
            elif char != ')' and not skipping:
                kept.append(char)

    elif mode == 'spelling':
        keeping = True
        for char in sentence:
            if char == '(':
                continue
            if char == ')':
                # Each closing parenthesis flips between copying and skipping.
                keeping = not keeping
            elif keeping:
                kept.append(char)

    else:
        raise ValueError("Unsupported mode : {0}".format(mode))

    return ''.join(kept)


def special_filter(sentence, mode='phonetic', replace=None):
    """Strip noise tags and special symbols from a transcript.

    Character by character:

    * one of the letters o/n/u/b/l immediately followed by '/' is dropped
      (the '/' itself is dropped as a special symbol) — presumably these are
      the corpus noise tags such as ``b/``; verify against the corpus docs;
    * '#' is replaced by '샾';
    * '%' becomes ``replace`` in ``'phonetic'`` mode, stays '%' in
      ``'spelling'`` mode, and is dropped for any other mode;
    * every other character listed in ``EXCEPT`` is dropped.

    Afterwards the text is stripped, runs of two or more whitespace characters
    collapse to a single space, and any '?', '!' or '.' is removed.

    Args:
        sentence: Transcript text (normally the output of ``bracket_filter``).
        mode: ``'phonetic'`` or ``'spelling'``; only affects '%' handling.
        replace: Text substituted for '%' in ``'phonetic'`` mode. Required
            only if the sentence actually contains '%' in that mode.

    Returns:
        The cleaned transcript.

    Raises:
        TypeError: If a '%' is met in ``'phonetic'`` mode and ``replace`` is
            None (previously an opaque ``str + None`` concatenation error).
    """
    SENTENCE_MARK = ('?', '!', '.')
    NOISE = ('o', 'n', 'u', 'b', 'l')
    EXCEPT = ('/', '+', '*', '-', '@', '$', '^', '&', '[', ']', '=', ':', ';', ',', '?', '!', '.')

    pieces = []
    last_index = len(sentence) - 1
    for idx, ch in enumerate(sentence):
        # Noise tag: drop the letter here; its trailing '/' is dropped via EXCEPT.
        if ch in NOISE and idx < last_index and sentence[idx + 1] == '/':
            continue

        if ch == '#':
            pieces.append('샾')

        elif ch == '%':
            if mode == 'phonetic':
                if replace is None:
                    # Fail with an actionable message instead of "str + None".
                    raise TypeError(
                        "special_filter: sentence contains '%' but no `replace` "
                        "text was given for phonetic mode: {0!r}".format(sentence)
                    )
                pieces.append(replace)
            elif mode == 'spelling':
                pieces.append('%')

        elif ch not in EXCEPT:
            pieces.append(ch)

    new_sentence = re.sub(r'\s\s+', ' ', ''.join(pieces).strip())

    # Sentence marks are already excluded above; they can only reappear through
    # `replace`, so keep the final removal for identical results.
    for mark in SENTENCE_MARK:
        new_sentence = new_sentence.replace(mark, '')

    return new_sentence


def sentence_filter(raw_sentence, mode, replace=None):
    """Clean one raw transcript: run bracket_filter, then special_filter.

    Args:
        raw_sentence: Transcript text as read from the script file.
        mode: Passed to both filters.
        replace: Passed to special_filter as the '%' replacement text.

    Returns:
        The value returned by special_filter.
    """
    without_brackets = bracket_filter(raw_sentence, mode)
    cleaned = special_filter(without_brackets, mode, replace)
    return cleaned

In [None]:
DATA_DIR = "/YOUR_PATH_TO_DATA/KsponSpeech/"
SCRIPT_DIR = "KsponSpeech_scripts/"
AUDIO_DIR = "KsponSpeech_data/"

# Replacement text for a bare '%' in phonetic mode, keyed by utterance number.
# NOTE(review): the original looked these keys up with the fixed slice
# audio[44:50]; the file-name-based lookup below assumes audio names of the
# form "<prefix>_<number>.wav" — confirm against the .trn contents.
percent_files = {
        '087797': '퍼센트',
        '215401': '퍼센트',
        '284574': '퍼센트',
        '397184': '퍼센트',
        '501006': '프로',
        '502173': '프로',
        '542363': '프로',
        '581483': '퍼센트'
    }

mode = 'phonetic'


def _utterance_id(audio_path):
    """Return the part of the file name after its last '_' and before its extension."""
    file_name = audio_path.rsplit('/', 1)[-1]
    stem = file_name.rsplit('.', 1)[0]
    return stem.rsplit('_', 1)[-1]


def _write_manifest(trn_path, csv_path, audio_root, mode, percent_map):
    """Convert a "<audio> :: <transcript>" script file into an audio/transcript CSV.

    Args:
        trn_path: Script file to read, one utterance per line.
        csv_path: CSV file to create (overwritten if present).
        audio_root: Prefix prepended to every audio path written to the CSV.
        mode: Transcript mode forwarded to sentence_filter.
        percent_map: Utterance id -> replacement text for '%'.

    Raises:
        ValueError: If a non-blank line has no ' :: ' separator.
    """
    # Both files are closed on exit, also when a line fails to parse.
    # The CSV is written as UTF-8 explicitly rather than in the platform default.
    # NOTE(review): the input is still read with the platform default encoding,
    # as before — confirm the .trn encoding and pass encoding= if needed.
    with open(trn_path, 'r') as trn_text, \
            open(csv_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["audio", "transcript"])
        for line_no, line in enumerate(trn_text, start=1):
            txt = line.strip()
            if not txt:
                continue  # tolerate blank lines instead of raising IndexError

            # Split on the first separator only so the transcript is kept whole.
            audio, sep, transcript = txt.partition(' :: ')
            if not sep:
                raise ValueError(
                    "{0}, line {1}: expected '<audio> :: <transcript>'".format(trn_path, line_no)
                )

            # Rename the extension on the audio path only, never inside the transcript.
            # Assumes .wav copies of the .pcm files exist at the same relative
            # paths — TODO confirm.
            audio = audio.replace('.pcm', '.wav')

            # .get() yields None for unlisted ids, i.e. sentence_filter's default.
            transcript = sentence_filter(transcript, mode, percent_map.get(_utterance_id(audio)))
            writer.writerow([audio_root + audio, transcript])


_write_manifest(DATA_DIR + SCRIPT_DIR + 'train.trn', 'aihub_ksponSpeech_train.csv',
                DATA_DIR + AUDIO_DIR, mode, percent_files)
_write_manifest(DATA_DIR + SCRIPT_DIR + 'dev.trn', 'aihub_ksponSpeech_dev.csv',
                DATA_DIR + AUDIO_DIR, mode, percent_files)

In [None]:
# Settings read by the evaluation-set cells below.
# NOTE(review): this placeholder differs from the "/YOUR_PATH_TO_DATA/KsponSpeech/" one
# assigned in earlier cells — confirm both are meant to point at the same corpus root.
DATA_DIR = "/YOUR_PATH_TO/KsponSpeech/"
SCRIPT_DIR = "KsponSpeech_scripts/"
# Despite the name this is a script file name; it is appended to DATA_DIR + SCRIPT_DIR.
FILE_DIR  = "eval_clean.trn"
# NOTE(review): the eval cells visible here write DATA_DIR + audio and never read
# AUDIO_DIR — confirm whether the audio paths in the script file already include this prefix.
AUDIO_DIR = "KsponSpeech_eval/eval_clean/"
mode='phonetic'

In [None]:
# Build the eval_clean CSV manifest from the script file named by FILE_DIR.
# Both files are opened in one `with` so the input handle is closed too, and
# the CSV is written as UTF-8 explicitly rather than in the platform default.
# NOTE(review): the input is still read with the platform default encoding,
# as before — confirm the .trn encoding and pass encoding= if needed.
with open(DATA_DIR + SCRIPT_DIR + FILE_DIR, 'r') as trn_text, \
        open('aihub_ksponSpeech_eval_clean.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["audio", "transcript"])
    for line_no, line in enumerate(trn_text, start=1):
        txt = line.strip()
        if not txt:
            continue  # tolerate blank lines instead of raising IndexError

        # Split on the first separator only so the transcript is kept whole.
        audio, sep, transcript = txt.partition(' :: ')
        if not sep:
            raise ValueError(
                "{0}, line {1}: expected '<audio> :: <transcript>'".format(FILE_DIR, line_no)
            )

        # Rename the extension on the audio path only, never inside the transcript.
        audio = audio.replace('.pcm', '.wav')

        # Utterance id = text after the last '_' of the file name, minus extension.
        # NOTE(review): replaces the fixed slice audio[44:50]; assumes names of
        # the form "<prefix>_<id>.wav" — confirm against the .trn contents.
        utt_id = audio.rsplit('/', 1)[-1].rsplit('.', 1)[0].rsplit('_', 1)[-1]

        # .get() yields None for unlisted ids, i.e. sentence_filter's default.
        transcript = sentence_filter(transcript, mode, percent_files.get(utt_id))

        writer.writerow([DATA_DIR + audio, transcript])

In [None]:
# Switch the script file to the eval_other set for the next cell.
# AUDIO_DIR is left at its eval_clean value; the next cell does not read it.
FILE_DIR  = "eval_other.trn"

In [None]:
# Build the eval_other CSV manifest from the script file named by FILE_DIR.
# Both files are opened in one `with` so the input handle is closed too, and
# the CSV is written as UTF-8 explicitly rather than in the platform default.
# NOTE(review): the input is still read with the platform default encoding,
# as before — confirm the .trn encoding and pass encoding= if needed.
with open(DATA_DIR + SCRIPT_DIR + FILE_DIR, 'r') as trn_text, \
        open('aihub_ksponSpeech_eval_other.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["audio", "transcript"])
    for line_no, line in enumerate(trn_text, start=1):
        txt = line.strip()
        if not txt:
            continue  # tolerate blank lines instead of raising IndexError

        # Split on the first separator only so the transcript is kept whole.
        audio, sep, transcript = txt.partition(' :: ')
        if not sep:
            raise ValueError(
                "{0}, line {1}: expected '<audio> :: <transcript>'".format(FILE_DIR, line_no)
            )

        # Rename the extension on the audio path only, never inside the transcript.
        audio = audio.replace('.pcm', '.wav')

        # Utterance id = text after the last '_' of the file name, minus extension.
        # NOTE(review): replaces the fixed slice audio[44:50]; assumes names of
        # the form "<prefix>_<id>.wav" — confirm against the .trn contents.
        utt_id = audio.rsplit('/', 1)[-1].rsplit('.', 1)[0].rsplit('_', 1)[-1]

        # .get() yields None for unlisted ids, i.e. sentence_filter's default.
        transcript = sentence_filter(transcript, mode, percent_files.get(utt_id))

        writer.writerow([DATA_DIR + audio, transcript])

In [None]:
# Split name -> CSV manifest written by the cells above.
data_files = {
    "train": "aihub_ksponSpeech_train.csv",
    "dev": "aihub_ksponSpeech_dev.csv",
    "test_clean": "aihub_ksponSpeech_eval_clean.csv",
    "test_other": "aihub_ksponSpeech_eval_other.csv",
}
# Load all four manifests with the "csv" builder, one split per entry.
dataset = load_dataset("csv", data_files=data_files)

In [None]:
# Cast the 'audio' column of each split to the Audio feature type.
for split_name in ('train', 'dev', 'test_clean', 'test_other'):
    dataset[split_name] = dataset[split_name].cast_column('audio', Audio())

In [None]:
# Save the dataset built above to disk; replace the placeholder with a real output directory.
dataset.save_to_disk("/YOUR_SAVE_PATH_TO/ksponSpeech")