-
Notifications
You must be signed in to change notification settings - Fork 21
/
config.yaml
82 lines (64 loc) · 3.32 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
documentation: |
  Coraa Portuguese
  ################

  The config performs the following data processing.

  1. Downloads and extracts all the data from https://huggingface.co/datasets/gabrielrstan/CORAA-v1.1/tree/main
  2. Replaces certain non-supported characters, abbreviations and punctuation marks with equivalent supported versions.
  3. Drops any data with a duration or character rate that is too high or too low.
  4. Drops any data that contains symbols not in the supported alphabet.

  **Required arguments**.

  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
  * **data_split**: should be "train", "dev" or "test".

  **Output format**.

  This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``.
  The output manifest contains the following fields:

  * **audio_filepath (str)**: relative path to the audio files.
  * **text (str)**: transcription, including punctuation ".,?" and capitalization.
  * **duration (float)**: audio duration in seconds.
# Selects which entries of `processors` are executed (here: all of them).
processors_to_run: all

# Required arguments ("???" marks a value the caller must supply).
# workspace_dir: folder where all audio files and manifests are stored.
workspace_dir: ???
# data_split: should be "train", "dev" or "test".
data_split: ???

# Path of the final manifest, consumed by the last processor.
# Defaults to the location stated in `documentation`, so that only the two
# required arguments above have to be supplied; it can still be overridden.
final_manifest: ${workspace_dir}/${data_split}_manifest.json
processors:
# Stage 1: create the initial manifest for the requested split.
# Every path below lives under ${workspace_dir}.
- _target_: sdp.processors.CreateInitialManifestCORAA
  data_split: ${data_split}
  raw_data_dir: ${workspace_dir}
  # Both flags are off, i.e. nothing is treated as already present on disk.
  already_downloaded: false
  already_extracted: false
  extract_archive_dir: ${workspace_dir}/extracted
  # NOTE(review): "16k" presumably refers to a 16 kHz sample rate - confirm.
  resampled_audio_dir: ${workspace_dir}/extracted/16k
  # Intermediate manifest ("manifest0"); the final one is written by the
  # last processor in this list.
  output_manifest_file: ${workspace_dir}/${data_split}_manifest0.json
# Stage 2: text normalisation through an ordered list of regex substitutions.
# Rules are listed in the order they are meant to apply; several later rules
# assume earlier ones have already run (e.g. "um km" before "km").
# Patterns are single-quoted where possible so that backslashes reach the
# regex engine unchanged.
- _target_: sdp.processors.SubRegex
  regex_params_list:
  # Remove transcriber annotations. The parentheses are escaped so that only
  # the literal "(Aplausos)" / "(Risos)" markers are removed; unescaped they
  # would form a capture group and delete the bare words from the text.
  - {"pattern": '\(Aplausos\)', "repl": " "}
  - {"pattern": '\(Risos\)', "repl": " "}
  # Replace unsupported dashes, quotes and symbols with a space.
  - {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "}
  - {"pattern": "'", "repl": " "}
  - {"pattern": '[\$\&\¡\(\)]', "repl": " "}
  - {"pattern": '[\«\°\´\·\»]', "repl": " "}
  - {"pattern": '[\‘\’\“\”\„]', "repl": " "}
  - {"pattern": '[\:\;\`\ʻ]', "repl": " "}
  # Map unsupported sentence-final punctuation onto a period.
  - {"pattern": "!", "repl": "."}
  # '\s' accounts for the fact that SDP inserts spaces at start and end.
  - {"pattern": '…\s$', "repl": "."}
  - {"pattern": '\.{2,20}\s$', "repl": "."}
  # Remove remaining repeated periods since most of the time they are
  # unnecessary in this data.
  - {"pattern": '\.{2,20}', "repl": " "}
  # Expand abbreviations. The trailing period is escaped ('\.') so it matches
  # a literal "." only; an unescaped "." matches any character, which lets
  # " Dr." swallow " Dra" or the start of words such as "drama".
  # Each replacement starts with a space because the pattern consumes the
  # space in front of the abbreviation.
  - {"pattern": ' ([Pp])rofa ', "repl": ' \1rofessora '}
  - {"pattern": ' ([Ss])ra\.', "repl": ' \1enhora'}
  - {"pattern": ' ([Ss])rta\.', "repl": ' \1enhorita'}
  - {"pattern": ' ([Ss])r\.', "repl": ' \1enhor'}
  # Portuguese spelling is "doutor" / "doutora".
  - {"pattern": ' ([Dd])r ', "repl": ' \1outor '}
  - {"pattern": ' ([Dd])r\.', "repl": ' \1outor '}
  - {"pattern": ' ([Dd])ra ', "repl": ' \1outora '}
  - {"pattern": ' ([Dd])ra\.', "repl": ' \1outora '}
  # Singular form first, then the generic plural expansion.
  - {"pattern": " um km ", "repl": " um quilômetro "}
  - {"pattern": " km ", "repl": " quilômetros "}
# Stage 3: duration filter with a lower and an upper bound.
# NOTE(review): thresholds are presumably in seconds, matching the manifest's
# `duration` field - confirm against the processor.
- _target_: sdp.processors.DropHighLowDuration
  low_duration_threshold: 0.5
  high_duration_threshold: 20
# Stage 4: character-rate filter with a lower and an upper bound.
# NOTE(review): the rate is presumably characters per second - confirm
# against the processor.
- _target_: sdp.processors.DropHighLowCharrate
  low_charrate_threshold: 1
  high_charrate_threshold: 21
# Stage 5 (last): drop entries containing symbols outside the supported
# alphabet, then write the final manifest to ${final_manifest}.
# The alphabet is: space, accented and plain Latin letters in both cases,
# and the punctuation marks ",.?".
- _target_: sdp.processors.DropNonAlphabet
  alphabet: " ÁÃÀÂÇÉÊÍÕÓÔÚÜáãàâçéêíõóôúüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?"
  output_manifest_file: ${final_manifest}