---
# Human-readable description of this config, written in reStructuredText.
# The body of the literal block scalar must be indented deeper than the key,
# and RST needs blank lines around headings, paragraphs and lists.
documentation: |
  MCV Italian
  ###########

  This config was originally designed for the
  `Mozilla Common Voice (MCV) <https://commonvoice.mozilla.org/>`_ dataset
  12.0 release, but should work for any subsequent releases as well.

  It performs the following data processing.

  1. Extracts and converts all data to the NeMo format.
  2. Replaces certain non-supported characters and punctuation marks with equivalent supported versions.
  3. Drops any data that contains symbols not in the supported alphabet.
  4. Drops a few manually specified audio files that were found to contain transcription errors.

  **Required arguments**.

  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
    You need to manually place the downloaded MCV Italian data inside
    ``<workspace dir>/raw_data/`` subfolder.
  * **data_split**: should be "train", "dev" or "test".

  Note that you can customize any part of this config either directly or from command-line.
  Here are some common customizations to consider:

  * **remove_pc**: set to True if P&C is not needed. Defaults to False.

  **Output format**.

  This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``.
  The output manifest contains the following fields:

  * **audio_filepath (str)**: relative path to the audio files.
  * **text (str)**: transcription, including punctuation ".,?" and capitalization.
  * **duration (float)**: audio duration in seconds.
# Selects which entries of `processors` are executed; "all" is the value used here.
processors_to_run: all

# Required arguments. `???` is presumably OmegaConf's "mandatory value" marker
# (the file uses Hydra-style `_target_` / `${...}` syntax), so both must be
# supplied by the user, e.g. from the command line.
# data_split should be "train", "dev" or "test".
data_split: ???
# Workspace folder; the raw MCV data is expected under ${workspace_dir}/raw_data.
workspace_dir: ???

# Path the last processor writes its output manifest to.
final_manifest: ${workspace_dir}/${data_split}_manifest.json

# Set to true if punctuation and capitalization (P&C) are not needed; this
# enables the two processors guarded by `should_run: ${remove_pc}`.
remove_pc: false
# Processing pipeline. Each list entry configures one SDP processor via
# `_target_`; entries are presumably applied in the order listed (confirm
# against the SDP runner).
processors:
  # Build the initial manifest from the raw MCV Italian data.
  - _target_: sdp.processors.CreateInitialManifestMCV
    output_manifest_file: ${workspace_dir}/${data_split}_manifest0.json
    language_id: it
    extract_archive_dir: ${workspace_dir}/raw_data
    resampled_audio_dir: ${workspace_dir}/${data_split}/audio/
    data_split: ${data_split}
    raw_data_dir: ${workspace_dir}/raw_data

  # Replace unsupported characters and punctuation with supported equivalents.
  - _target_: sdp.processors.SubRegex
    regex_params_list:
      - {"pattern": "!", "repl": "."}
      - {"pattern": "…", "repl": "."}
      - {"pattern": "’", "repl": "'"}
      - {"pattern": '[\":\(\)“”;]', "repl": ''}
      - {"pattern": "[-/]", "repl": " "}
      # Note that we exclude î and ó: according to Wikipedia they are very
      # rarely used in modern Italian. So it's safer to replace them, as they
      # often represent other languages (e.g., French or Spanish, most often
      # in names), rather than actual Italian.
      - {"pattern": "î", "repl": "i"}
      - {"pattern": "ó", "repl": "o"}
      - {"pattern": "Î", "repl": "I"}
      - {"pattern": "Ó", "repl": "O"}
    test_cases:
      - {input: {text: "Wow!"}, output: {text: "Wow."}}

  # Drop entries whose text contains symbols outside the supported alphabet.
  - _target_: sdp.processors.DropNonAlphabet
    alphabet: ".,? 'abcdefghijklmnopqrstuvwxyzàèéìíòùúABCDEFGHIJKLMNOPQRSTUVWXYZÀÈÉÌÍÒÙÚ"
    test_cases:
      - {input: {text: "test тест 测试"}, output: null}
      - {input: {text: "test"}, output: {text: "test"}}

  # Drop manually identified files with transcription errors. The patterns are
  # matched against the `audio_filepath` field (presumably as regular
  # expressions, given the key name), so the dot before the extension is
  # escaped to match literally. Single quotes keep the backslash intact;
  # `\.` is not a valid escape inside a double-quoted YAML scalar.
  - _target_: sdp.processors.DropIfRegexMatch
    regex_patterns:
      - 'common_voice_it_17553281\.wav'
      - 'common_voice_it_19976820\.wav'
      - 'common_voice_it_17553352\.wav'
    text_key: audio_filepath

  # ------------------------ if P&C is not needed ------------------------
  # Both processors below only run when `remove_pc` is true.
  - _target_: sdp.processors.SubMakeLowercase
    should_run: ${remove_pc}

  - _target_: sdp.processors.SubRegex
    should_run: ${remove_pc}
    regex_params_list:
      - {"pattern": '[\?\.,]', "repl": ""}
  # ----------------------------------------------------------------------

  # Rewrite audio paths relative to the workspace folder.
  - _target_: sdp.processors.ChangeToRelativePath
    base_dir: ${workspace_dir}

  # Write the final manifest, keeping only the documented output fields.
  - _target_: sdp.processors.KeepOnlySpecifiedFields
    output_manifest_file: ${final_manifest}
    fields_to_keep:
      - audio_filepath
      - text
      - duration