In [1]:
import json
import os
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Location of the downloaded utterance recordings. The default is the path the
# notebook was originally run with; set the UTTERANCES_DATA_DIR environment
# variable to use a different location without editing this cell.
DATA_DIR = Path(
    os.environ.get("UTTERANCES_DATA_DIR", "/Users/gbiamby/Downloads/utterances_600_8khz")
).expanduser()
assert DATA_DIR.is_dir(), f"DATA_DIR does not exist or is not a directory: {DATA_DIR}"

In [3]:
# Read the tab-separated utterance list (its first line is consumed as the
# header row) and promote the positional index to a regular column.
df = pd.read_csv(Path("./utterances_600.txt"), sep="\t", header=0).reset_index()
df.columns = ["row_id", "text"]
# Each recording is named after its unpadded row id, e.g. "7.wav" (not "0007.wav").
df["filename"] = df["row_id"].map(lambda row_id: f"{row_id}.wav")
print("index: ", df.index, "cols: ", df.columns)
display(df)

index:  RangeIndex(start=0, stop=600, step=1) cols:  Index(['row_id', 'text', 'filename'], dtype='object')


Unnamed: 0,row_id,text,filename
0,0,"The original plan, which Lord knows didn't mea...",0.wav
1,1,"If nothing went too badly wrong, it would take...",1.wav
2,2,I had seen Clark in so many different situatio...,2.wav
3,3,But there is nothing like sixteen days on the ...,3.wav
4,4,On the Atlantic crossing Hyperion would carry ...,4.wav
...,...,...,...
595,595,I knew my next company would be a start-up.,595.wav
596,596,And I knew that anything smaller than Netscape...,596.wav
597,597,"That day in December 1995, after Jim Clark sho...",597.wav
598,598,He asked Kittu to take a walk with him.,598.wav


## Resample to 16 kHz


In [None]:
import os

# Source recordings are read from the "wave" sub-directory of DATA_DIR.
input_dir = (DATA_DIR / "wave").resolve()

# Destination directory for the resampled audio.
final_path = Path("./data_giscard").resolve()
# exist_ok=True already makes this a no-op when the directory is present, so a
# separate exists() check is unnecessary; if the path exists but is not a
# directory, mkdir raises here instead of failing later during conversion.
final_path.mkdir(parents=True, exist_ok=True)


def resample_wav(filename: str, input_dir: Path, output_dir: Path) -> bool:
    """Convert one WAV file to mono 16 kHz audio with ffmpeg.

    The source is read from ``input_dir / filename`` and the result is written
    to ``output_dir / filename``. A missing source file or a failed conversion
    is reported and skipped rather than raised, so a batch loop keeps going.

    Args:
        filename: Name of the WAV file, relative to both directories.
        input_dir: Directory holding the original recording.
        output_dir: Directory that receives the resampled recording.

    Returns:
        True if ffmpeg exited with status 0, False if the source file was
        missing or ffmpeg reported an error.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    source_file = Path(input_dir) / filename
    output_file = Path(output_dir) / filename

    if not source_file.is_file():
        print(f"Skipping {filename}: source file not found at {source_file}")
        return False

    # Arguments are passed as a list (no shell), so paths containing spaces or
    # shell metacharacters are handled correctly.
    command = [
        "ffmpeg",
        "-nostdin",  # never wait for keyboard input when run from a notebook
        "-y",  # overwrite an existing output so the cell can be re-run
        "-i", str(source_file),
        "-ac", "1",  # one audio channel
        "-ar", "16000",  # 16000 Hz sample rate
        str(output_file),
    ]
    print(" ".join(command))

    completed = subprocess.run(command)
    if completed.returncode != 0:
        print(f"ffmpeg failed for {source_file} (exit code {completed.returncode})")
        return False
    return True

# Convert every recording listed in the utterance table, one file at a time.
for filename in df["filename"]:
    resample_wav(filename, input_dir=input_dir, output_dir=final_path)

In [5]:
# Keep only the file name and transcript, and expose the path column as "file".
df_giscard = (
    df.loc[:, ["filename", "text"]]
    .copy(deep=True)
    .rename(columns={"filename": "file"})
)
# Prefix each WAV name with the relative output directory.
df_giscard["file"] = df_giscard["file"].map(
    lambda wav_name: str(Path("data_giscard/") / wav_name)
)
df_giscard

Unnamed: 0,file,text
0,data_giscard/0.wav,"The original plan, which Lord knows didn't mea..."
1,data_giscard/1.wav,"If nothing went too badly wrong, it would take..."
2,data_giscard/2.wav,I had seen Clark in so many different situatio...
3,data_giscard/3.wav,But there is nothing like sixteen days on the ...
4,data_giscard/4.wav,On the Atlantic crossing Hyperion would carry ...
...,...,...
595,data_giscard/595.wav,I knew my next company would be a start-up.
596,data_giscard/596.wav,And I knew that anything smaller than Netscape...
597,data_giscard/597.wav,"That day in December 1995, after Jim Clark sho..."
598,data_giscard/598.wav,He asked Kittu to take a walk with him.


### Remove entries that don't have a matching wav file

In [6]:
def file_exists(filename: str) -> bool:
    """Return True only if ``filename`` points at an existing regular file.

    Args:
        filename: Path to check, absolute or relative to the working directory.

    Returns:
        True when the path exists and is a file. False for a missing path, a
        directory, or an empty/None value (an empty string would otherwise
        resolve to the current directory and count as "existing").
    """
    if not filename:
        return False
    return Path(filename).is_file()

# Flag each row by whether its WAV path is present on disk, then keep only the
# flagged rows.
# NOTE(review): the boolean "file_exists" helper column stays on df_giscard, so
# it is also carried into anything exported from it later - confirm downstream
# consumers are fine with the extra column.
df_giscard["file_exists"] = df_giscard["file"].map(file_exists)
df_giscard = df_giscard.loc[df_giscard["file_exists"]].copy(deep=True)

## Create train/val/test splits

In [7]:
# Fixed seed so the same split is produced on every run of the notebook.
SPLIT_SEED = 42

# 80% of the utterances go to train; the remainder is halved into valid/test.
df_train = df_giscard.sample(frac=0.8, random_state=SPLIT_SEED)
df_holdout = df_giscard[~df_giscard.file.isin(df_train.file)]
df_valid = df_holdout.sample(frac=0.5, random_state=SPLIT_SEED)
df_test = df_holdout[~df_holdout.file.isin(df_valid.file)]
print(len(df_train), len(df_valid), len(df_test))
print("Total utterances: ", len(df_giscard))
# Make sure the train/valid/test sets are not overlapping, and don't exclude any of the utterances:
split_files = set(df_train.file.values) | set(df_valid.file.values) | set(df_test.file.values)
print(len(split_files))
assert len(df_train) + len(df_valid) + len(df_test) == len(df_giscard), "splits overlap or drop rows"
assert split_files == set(df_giscard.file.values), "splits do not cover every utterance"

479 60 60
Total utterances:  599
599


### Write train/val/test CSVs

In [8]:
# Write one CSV per split into the output directory; the DataFrame index is
# not saved, only the columns with a header row.
splits_by_csv_name = {"train.csv": df_train, "valid.csv": df_valid, "test.csv": df_test}
for csv_name, split_frame in splits_by_csv_name.items():
    split_frame.to_csv(final_path / csv_name, index=False, header=True)