forked from kennytm/EcaFretni
-
Notifications
You must be signed in to change notification settings - Fork 0
/
balanced_substring.py
129 lines (98 loc) · 3.34 KB
/
balanced_substring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#
# balanced_substring.py ... Parse a balanced substring
# Copyright (C) 2010 KennyTM~ <kennytm@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
def balancedSubstring(string, index=0):
'''
Skip a balanced substring from specified index, and return the next string
index.
This function implements a simplistic stack-based parenthesis parser. It can
consume a substring forming one balanced group of parenthesis, or a quoted
string
>>> '(foo)bar'[:balancedSubstring('(foo)bar')]
'(foo)'
>>> 'foo(bar)'[:balancedSubstring('foo(bar)')]
'f'
>>> '"foo"bar'[:balancedSubstring('"foo"bar')]
'"foo"'
A balanced substring means one of these:
* a character
* a string enclosed between a matching pair of parenthesis: ``(...)``,
``[...]`` and ``{...}``
* a quoted string: ``"..."``, ``'...'``, which can recognize the C-style
escape character e.g. ``'o\\'clock'``.
.. note::
This module is not designed for validation. The 3 different kinds of
parenthesis are not distinguished. That means ``"[foo)"`` will be
considered as balanced.
The optional parameter *index* can be used to tokenize the string::
>>> balancedSubstring('(a)(bbb)c')
3
>>> balancedSubstring('(a)(bbb)c', index=3)
8
A number larger than the length of *string* will be returned on unbalanced
paranthesis.
'''
level = 0
quote_mode = ''
strlen = len(string)
while strlen > index:
c = string[index]
if c == '\\' and quote_mode:
index += 1
elif c == quote_mode:
quote_mode = ''
if level <= 0:
break
elif not quote_mode:
if c in "([{":
level += 1
elif c in ")]}":
level -= 1
elif c in "\"'":
quote_mode = c
if level <= 0 and not quote_mode:
break
index += 1
return index+1
_numberRe = re.compile('\d+')
def numericSubstring(string, index=0):
'''
Skip a numeric substring from specified index, return that number and the
next string index.
>>> numericSubstring('127foo')
(127, 3)
>>> numericSubstring('abc765def490', index=3)
(765, 6)
It is expected that ``string[index]`` is a digit. If not, an exception may
be raised.
'''
m = _numberRe.match(string, index)
return (int(m.group()), m.end())
if __name__ == '__main__':
s = '(foo)bar[baz[bar]{bar}]'
assert 5 == balancedSubstring(s) # (foo)
assert 6 == balancedSubstring(s, 5) # b
assert 7 == balancedSubstring(s, 6) # a
assert 8 == balancedSubstring(s, 7) # r
assert 23 == balancedSubstring(s, 8) # [baz[bar]{bar}]
s = '"foo\\"bar("\')b"az\''
assert 11 == balancedSubstring(s) # "foo\"bar("
assert 18 == balancedSubstring(s,11) # ')b"az'
s = '(((foo'
assert balancedSubstring(s) > 6
assert numericSubstring('127foo') == (127, 3)
assert numericSubstring('abc765def490', index=3) == (765, 6)