-
Notifications
You must be signed in to change notification settings - Fork 0
/
pcfg_grammar.py
180 lines (173 loc) · 10.2 KB
/
pcfg_grammar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# coding: utf-8
# imports
from nltk import PCFG
'''
PCFG: grammar with alignment
Defines a PCFG that can be used with NLTK to generate data.
Inputs look like letters, e.g. A, B, AS, BS, ...
Transduction outputs look like A1_1, B1_2, ...
'''
# Input symbols in rule order.  'S' is left out of the single letters because
# it is the left-hand side of the top-level rule ("S -> A"); after 'Z' the
# names continue with an 'S' suffix: 'AS', 'BS', ... 'OS' (40 symbols total).
_INPUT_SYMBOLS = tuple("ABCDEFGHIJKLMNOPQRTUVWXYZ") + tuple(
    letter + "S" for letter in "ABCDEFGHIJKLMNO"
)

# The three output slots each input symbol is aligned with.
_OUTPUT_SLOTS = "ABC"

# Orderings of the three output slots, in the order they are listed in every
# rule, with the probability attached to each (0.15 + 5 * 0.17 == 1.0).
_SLOT_ORDERINGS = (
    ("ABC", "0.15"),
    ("BAC", "0.17"),
    ("CAB", "0.17"),
    ("CBA", "0.17"),
    ("ACB", "0.17"),
    ("BCA", "0.17"),
)


def _build_alignment_grammar_str():
    """Return the grammar text in the format accepted by ``PCFG.fromstring``.

    The text has three sections, one rule per line:

    1. the top-level rule ``S -> A [1.0]``;
    2. for every index ``i`` in 1..40 and every slot ``X`` in A, B, C, a
       pre-terminal rule ``Xi -> 'Xi_1' [0.5] | 'Xi_2' [0.5]``;
    3. for the ``i``-th input symbol, a rule expanding it to each of the six
       orderings of ``Ai Bi Ci``.

    Generating the rules guarantees every input symbol only refers to its own
    index.  The previous hand-written text had ``C1 B1 A1`` as an alternative
    of rule ``B`` where ``C2 B2 A2`` was meant.

    NOTE(review): the only rule for ``S`` expands to ``A``, so the rules for
    ``B`` .. ``OS`` are not reachable from ``S``.  Left as-is; confirm whether
    callers select a different start symbol.
    """
    rules = ["S -> A [1.0]"]

    # Section 2: pre-terminals, grouped by index (A1, B1, C1, A2, ...).
    for index in range(1, len(_INPUT_SYMBOLS) + 1):
        for slot in _OUTPUT_SLOTS:
            rules.append(
                "{0}{1} -> '{0}{1}_1' [0.5] | '{0}{1}_2' [0.5]".format(slot, index)
            )

    # Section 3: one rule per input symbol over its own A/B/C pre-terminals.
    for index, symbol in enumerate(_INPUT_SYMBOLS, 1):
        alternatives = " | ".join(
            "{0} [{1}]".format(
                " ".join(slot + str(index) for slot in ordering), probability
            )
            for ordering, probability in _SLOT_ORDERINGS
        )
        rules.append("{0} -> {1}".format(symbol, alternatives))

    # Leading and trailing newlines mirror the original triple-quoted literal.
    return "\n" + "\n".join(rules) + "\n"


alignment_grammar_str = _build_alignment_grammar_str()
# Parse the grammar text into an NLTK PCFG object when the module is imported,
# so importers can use `alignment_grammar` directly.
alignment_grammar = PCFG.fromstring(alignment_grammar_str)