-
Notifications
You must be signed in to change notification settings - Fork 18
/
qed_lm_langid.yaml
168 lines (157 loc) · 4.83 KB
/
qed_lm_langid.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# Example configuration for language identification with LMClassifierFilter.
# Also shows how to loop multiple language pairs in a single step.
common:
  output_directory: qed_lm_langid

steps:

  # Read sv-{de,en,fr} data from the QED corpus that contains noisy
  # data. Around 600MB will be downloaded.
  - type: opus_read
    parameters:
      corpus_name: QED
      source_language: sv
      target_language: !var lang
      release: latest
      preprocessing: raw
      src_output: !varstr "sv-{lang}.sv.raw.gz"
      tgt_output: !varstr "sv-{lang}.{lang}.raw.gz"
      suppress_prompts: true
    variables:
      lang: [de, en, fr]

  # Preprocess segments with WhitespaceNormalizer
  - type: preprocess
    parameters:
      inputs:
        - !varstr "sv-{lang}.sv.raw.gz"
        - !varstr "sv-{lang}.{lang}.raw.gz"
      outputs:
        - !varstr "sv-{lang}.sv.preprocessed.gz"
        - !varstr "sv-{lang}.{lang}.preprocessed.gz"
      preprocessors:
        - WhitespaceNormalizer: {}
    variables:
      lang: [de, en, fr]

  # Initial filtering for QED data (for training the LMs). The cld2
  # language detection filter is fast, but not very robust and often
  # fails to identify short segments.
  - type: filter
    parameters:
      inputs:
        - !varstr "sv-{lang}.sv.preprocessed.gz"
        - !varstr "sv-{lang}.{lang}.preprocessed.gz"
      outputs:
        - !varstr "sv-{lang}.sv.training.gz"
        - !varstr "sv-{lang}.{lang}.training.gz"
      filters:
        - LengthFilter:
            unit: word
            min_length: 3
            max_length: 100
        - LengthRatioFilter:
            unit: word
            threshold: 3
        - LanguageIDFilter:
            id_method: cld2
            languages: [sv, !var lang]
            thresholds: [0, 0]
    variables:
      lang: [de, en, fr]

  # Train 4-gram models from the pre-filtered data.
  # For sv, use data from the sv-en pair.
  - type: train_ngram
    parameters:
      data: !var data
      parameters:
        norder: 4
        dscale: 0
        absolute: true
      model: !varstr "{lang}.arpa.gz"
    variables:
      lang: [de, en, fr, sv]
      data: [sv-de.de.training.gz, sv-en.en.training.gz, sv-fr.fr.training.gz, sv-en.sv.training.gz]

  # Train a background unigram model from unfiltered pooled QED data.
  # When the language-specific models are interpolated with this, it
  # ensures that the same set of characters can be predicted by all
  # models.
  # NOTE(review): the sv data here comes from the sv-fr pair, while the
  # other steps take sv-side data from the sv-en pair — presumably any
  # pair works for pooling; confirm this is intentional.
  - type: concatenate
    parameters:
      inputs:
        - sv-de.de.preprocessed.gz
        - sv-en.en.preprocessed.gz
        - sv-fr.fr.preprocessed.gz
        - sv-fr.sv.preprocessed.gz
      output: pooled.txt.gz

  - type: train_ngram
    parameters:
      data: pooled.txt.gz
      parameters:
        norder: 1
        dscale: 0
        absolute: true
      model: pooled.arpa.gz

  # Some initial filtering
  - type: filter
    parameters:
      inputs:
        - !varstr "sv-{lang}.sv.preprocessed.gz"
        - !varstr "sv-{lang}.{lang}.preprocessed.gz"
      outputs:
        - !varstr "sv-{lang}.sv.filtered-initial.gz"
        - !varstr "sv-{lang}.{lang}.filtered-initial.gz"
      filters:
        - LengthFilter:
            unit: char
            min_length: 2
            max_length: 1000
        - LengthRatioFilter:
            unit: char
            threshold: 5
    variables:
      lang: [de, en, fr]

  # Filter data with language identification LMs. Including the
  # background unigram model with its own label allows the classifier to
  # consider that the segment is outside of the four languages. Using
  # relative_score means that the label with the highest LM likelihood
  # always gets score one, which is also here the filtering threshold.
  # Without relative_score, the scores would be normalized label
  # probabilities that sum up to one.
  - type: filter
    parameters:
      inputs:
        - !varstr "sv-{lang}.sv.filtered-initial.gz"
        - !varstr "sv-{lang}.{lang}.filtered-initial.gz"
      outputs:
        - !varstr "sv-{lang}.sv.filtered-final.gz"
        - !varstr "sv-{lang}.{lang}.filtered-final.gz"
      filters:
        - LMClassifierFilter:
            # Anchor reused by the filterfalse step below.
            lm_params: &lm_params
              de: {filename: de.arpa.gz, interpolate: [[pooled.arpa.gz, 0.01]]}
              en: {filename: en.arpa.gz, interpolate: [[pooled.arpa.gz, 0.01]]}
              fr: {filename: fr.arpa.gz, interpolate: [[pooled.arpa.gz, 0.01]]}
              sv: {filename: sv.arpa.gz, interpolate: [[pooled.arpa.gz, 0.01]]}
              other: {filename: pooled.arpa.gz}
            labels: [sv, !var lang]
            relative_score: true
            thresholds: [1.0, 1.0]
    variables:
      lang: [de, en, fr]

  # Use filterfalse for checking the segments that the filter removes.
  # Here it is applied to each monolingual file.
  - type: filter
    parameters:
      inputs: [!var inputfile]
      outputs: [!varstr "{lang}.removed.gz"]
      filters:
        - LMClassifierFilter:
            lm_params: *lm_params
            labels: [!var lang]
            relative_score: true
            thresholds: [1.0]
            filterfalse: true
    variables:
      lang: [de, en, fr, sv]
      inputfile:
        - sv-de.de.filtered-initial.gz
        - sv-en.en.filtered-initial.gz
        - sv-fr.fr.filtered-initial.gz
        - sv-en.sv.filtered-initial.gz