In [23]:
import os
from random import randint, uniform
import re
import numpy as np
import wave
import contextlib


## Calculating recordings lengths

In [24]:

total = 0
corrupted=0
files=0
for directory in os.listdir("./sections/"):
    if os.path.isdir("./sections/"+directory):
        for file in os.listdir("./sections/"+directory):
            if file.endswith(".wav"):
                files+=1
                fname = "./sections/"+directory+"/"+file
                try:
                    with contextlib.closing(wave.open(fname,'r')) as f:
                        frames = f.getnframes()
                        rate = f.getframerate()
                        duration = frames / float(rate)
                        total+=duration
                except Exception as e:
                    corrupted+=1

print("We have a total of ",total," seconds",", ",total/(60*60)," hours")

We have a total of  8069.399249999986  seconds ,  2.2414997916666626  hours


In [25]:
print('This is the whole files:',files)
print('The corrupted ones are:' , corrupted)

This is the whole files: 1149
The corrupted ones are: 88


# Organizing data

### spliting records to train, val and test sets

In [26]:
!mkdir data
!mkdir data/records

In [27]:
# Extracting all the non-corrupted files
wav_files = []
all_linkers  = dict()
for directory in os.listdir("./sections/"): # parent directory of the recordings, it should contain folders that contain wav,json and txt files
    if os.path.isdir("./sections/"+directory):
        to_remove = []
        linker = []

        for file in os.listdir("./sections/"+directory):
            if file.endswith(".txt"):
                linker_data = open("./sections/"+directory+"/"+file).readlines()
                
            if file.endswith(".wav"):
                fname = "./sections/"+directory+"/"+file
                try:
                    with contextlib.closing(wave.open(fname,'r')) as f:
                        frames = f.getnframes()
                        rate = f.getframerate()
                        duration = frames / float(rate)
                        wav_files.append(fname)
                except Exception as e:
                    to_remove.append(file)
        for file in to_remove:
            i=0
            while i<len(linker_data):
                if file in linker_data[i]:
                    linker_data.pop(i)
                i+=1
        linker.extend(linker_data)
        all_linkers[directory] =linker
        

            

In [28]:
all_linkers.keys()

dict_keys(['section_14', 'section_6', 'section_7', 'section_17', 'section_11', 'section_12', 'section_5', 'section_1', 'section_18', 'section_9', 'section_3', 'section_15', 'section_4', 'section_10', 'section_8', 'section_16', 'section_2', 'section_13'])

In [29]:
print("After removing the corrupted files, we have {} .wav files left. " . format(len(wav_files)))

After removing the corrupted files, we have 1061 .wav files left. 


In [30]:
to_copy = " ".join(wav_files)
!cp -t data/records/ {to_copy}

cp: will not overwrite just-created 'data/records/200704-223044_pcm_65e_elicit_13.wav' with './sections/section_18/200704-223044_pcm_65e_elicit_13.wav'
cp: will not overwrite just-created 'data/records/200704-223044_pcm_65e_elicit_33.wav' with './sections/section_18/200704-223044_pcm_65e_elicit_33.wav'
cp: will not overwrite just-created 'data/records/200704-223044_pcm_65e_elicit_17.wav' with './sections/section_18/200704-223044_pcm_65e_elicit_17.wav'
cp: will not overwrite just-created 'data/records/200704-223044_pcm_65e_elicit_20.wav' with './sections/section_18/200704-223044_pcm_65e_elicit_20.wav'
cp: will not overwrite just-created 'data/records/200704-223044_pcm_65e_elicit_46.wav' with './sections/section_18/200704-223044_pcm_65e_elicit_46.wav'
cp: will not overwrite just-created 'data/records/200704-223044_pcm_65e_elicit_56.wav' with './sections/section_18/200704-223044_pcm_65e_elicit_56.wav'
cp: will not overwrite just-created 'data/records/200704-223044_pcm_65e_elicit_37.

cp: will not overwrite just-created 'data/records/200703-124427_pcm_65e_elicit_46.wav' with './sections/section_13/200703-124427_pcm_65e_elicit_46.wav'
cp: will not overwrite just-created 'data/records/200703-124427_pcm_65e_elicit_3.wav' with './sections/section_13/200703-124427_pcm_65e_elicit_3.wav'
cp: will not overwrite just-created 'data/records/200703-124427_pcm_65e_elicit_43.wav' with './sections/section_13/200703-124427_pcm_65e_elicit_43.wav'
cp: will not overwrite just-created 'data/records/200703-124427_pcm_65e_elicit_5.wav' with './sections/section_13/200703-124427_pcm_65e_elicit_5.wav'
cp: will not overwrite just-created 'data/records/200703-124427_pcm_65e_elicit_56.wav' with './sections/section_13/200703-124427_pcm_65e_elicit_56.wav'
cp: will not overwrite just-created 'data/records/200703-124427_pcm_65e_elicit_71.wav' with './sections/section_13/200703-124427_pcm_65e_elicit_71.wav'
cp: will not overwrite just-created 'data/records/200703-124427_pcm_65e_elicit_66.wav'

In [31]:
for section,linker in all_linkers.items():
    for i,link in enumerate(linker):
        line = link.split(";")[0].split("(")[1].split(")")[0].strip()
        wav = link.split(";")[1].strip().split("/")[-1]
        linker[i] = wav+":"+line


In [32]:
out_linker = open("./data/linker.txt","w")
out_linker.write("\n".join(linker))
out_linker.close()

In [33]:
!mkdir ./data/records/train
!mkdir ./data/records/test
!mkdir ./data/records/val


In [34]:
import numpy as np
np.random.seed(0)
indices = np.random.permutation(len(wav_files))

test_idx = indices[:len(indices)//2]
validation_portion = int(len(indices)//2 * 0.2)
train_idx = indices[len(indices)//2:-validation_portion]
valid_idx = indices[-validation_portion:]

In [35]:
train_set_files = [wav_files[i].split("/")[-1] for i in train_idx]
valid_set_files = [wav_files[i].split("/")[-1] for i in valid_idx]
test_set_files = [wav_files[i].split("/")[-1] for i in test_idx]

In [36]:
to_copy_train = "./data/records/" + " ./data/records/".join(train_set_files)
to_copy_valid = "./data/records/" + " ./data/records/".join(valid_set_files)
to_copy_test = "./data/records/" + " ./data/records/".join(test_set_files)

In [37]:
!mv -t data/records/train/ {to_copy_train}
!mv -t data/records/val/ {to_copy_valid}
!mv -t data/records/test/ {to_copy_test}

mv: cannot stat './data/records/200704-223044_pcm_65e_elicit_37.wav': No such file or directory
mv: cannot stat './data/records/200703-124427_pcm_65e_elicit_9.wav': No such file or directory
mv: cannot stat './data/records/200703-124427_pcm_65e_elicit_69.wav': No such file or directory
mv: cannot stat './data/records/200704-223044_pcm_65e_elicit_20.wav': No such file or directory
mv: cannot stat './data/records/200704-223044_pcm_65e_elicit_22.wav': No such file or directory
mv: cannot stat './data/records/200704-223044_pcm_65e_elicit_14.wav': No such file or directory
mv: cannot stat './data/records/200703-124427_pcm_65e_elicit_71.wav': No such file or directory
mv: cannot stat './data/records/200704-223044_pcm_65e_elicit_34.wav': No such file or directory
mv: cannot stat './data/records/200704-223044_pcm_65e_elicit_26.wav': No such file or directory
mv: cannot stat './data/records/200703-124427_pcm_65e_elicit_52.wav': No such file or directory
mv: cannot stat './data/records/200703-12

### making of chars.txt file

In [38]:
import re
chars = {" ":1,"ε":0}
text_data = []
char_idx = 2
for section_id,linker in all_linkers.items():
    section_id = re.sub("_","",section_id)+".txt"
    text_file = open("./text_files/"+section_id,"r").readlines() # Modify this to the directory of your txt file that you recorded with
    
    for link in linker:
        file,idx = link.split(":")[0],int(link.split(":")[1].split(" ")[1])-1
        line = text_file[idx]
        line = line.split("##")[0].strip()
        #line = re.sub("[\[\]|٪%«»_ـ]","",line) # Clean unnecessary characters from the data, this is for arabic
        line = re.sub("[—\u200b]","",line) # Clean unnecessary characters from the data, this is for arabic
        text_data.append((line,file.split(".")[0]))
        char_set = set(line)
        for c in char_set:
            if c not in chars:
                chars[c]=char_idx
                char_idx+=1
            

FileNotFoundError: [Errno 2] No such file or directory: './text_files/section15.txt'

In [39]:
raw_text = "\n".join([wav+":"+line for line,wav in text_data])
with open("data/raw_text_file.txt","w") as f:
    f.write(raw_text)


In [40]:
indices_text = []
for line,wav in text_data:
    line = list(line)
    indicies = []
    for c in line:
        indicies.append(str(chars[c]))
    indices_text.append(wav+" "+" ".join(indicies))


In [41]:
len(chars)

72

In [42]:
chars

{' ': 1,
 'ε': 0,
 'r': 2,
 'k': 3,
 'x': 4,
 '.': 5,
 'o': 6,
 's': 7,
 'u': 8,
 'w': 9,
 'i': 10,
 'l': 11,
 'g': 12,
 'a': 13,
 'n': 14,
 'y': 15,
 'e': 16,
 'h': 17,
 'J': 18,
 't': 19,
 'T': 20,
 'b': 21,
 'p': 22,
 'd': 23,
 'E': 24,
 'v': 25,
 'm': 26,
 'c': 27,
 'B': 28,
 'f': 29,
 'N': 30,
 'M': 31,
 'S': 32,
 'L': 33,
 'G': 34,
 'I': 35,
 'W': 36,
 '“': 37,
 'R': 38,
 'z': 39,
 'F': 40,
 'P': 41,
 'D': 42,
 'q': 43,
 'C': 44,
 'j': 45,
 '-': 46,
 'H': 47,
 'Y': 48,
 'O': 49,
 'V': 50,
 'A': 51,
 ',': 52,
 '”': 53,
 '?': 54,
 'K': 55,
 ':': 56,
 '!': 57,
 '’': 58,
 '‘': 59,
 '1': 60,
 '6': 61,
 '2': 62,
 ')': 63,
 '(': 64,
 'U': 65,
 '0': 66,
 '3': 67,
 '4': 68,
 '7': 69,
 '5': 70,
 '9': 71}

In [22]:
indicies_text = "\n".join(indices_text)
with open("data/chars.txt","w") as f:
    f.write(indicies_text)

In [23]:
with open("data/charset.txt","w") as js:
    js.write(str(chars))

In [24]:
with open("data/charset.txt") as js:
    charset = eval(js.read())

In [25]:
charset

{' ': 1,
 'ε': 0,
 'g': 2,
 'y': 3,
 't': 4,
 's': 5,
 'a': 6,
 'u': 7,
 'n': 8,
 'b': 9,
 'e': 10,
 'T': 11,
 'i': 12,
 'x': 13,
 'w': 14,
 'p': 15,
 '.': 16,
 'l': 17,
 'h': 18,
 'k': 19,
 'o': 20,
 'r': 21,
 'J': 22,
 'm': 23,
 'd': 24,
 'E': 25,
 'v': 26,
 'c': 27,
 'f': 28,
 'B': 29,
 'N': 30,
 'M': 31,
 'S': 32,
 'L': 33,
 'G': 34,
 'I': 35,
 'W': 36,
 '“': 37,
 'R': 38,
 'z': 39,
 'F': 40,
 'P': 41,
 'D': 42,
 'q': 43,
 'C': 44,
 'j': 45,
 '-': 46,
 'Y': 47,
 'H': 48,
 'O': 49,
 'V': 50,
 'A': 51,
 'U': 52,
 'K': 53,
 '0': 54,
 '3': 55,
 '1': 56,
 '4': 57,
 ',': 58,
 '2': 59,
 '7': 60,
 '5': 61,
 '9': 62}

In [2]:
file  = open("./data_speech/chars.txt").readlines()
for line in file:
    if len(line.split())<2:
        print(line)

200702-204305_pcm_65e_elicit_66 

