-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenization.py
128 lines (107 loc) · 5.19 KB
/
tokenization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import collections, os
import regex as re
Token = collections.namedtuple('Token', ['type_', 'value_'])
from abc import abstractmethod
class UnexpectedTokenException(Exception):
    """Signals that an unexpected token was encountered during tokenization."""
class EmptyProgramException(Exception):
    """Raised in fn tokenizer:get_lines() when positions are empty.

    Most probably the input program is without any newline characters
    or has a special character such as ^A.
    """
class FailedTokenizationException(Exception):
    """Raised when the line-wise id_sequence and/or literal_sequence
    could not be created."""
class Tokenizer:
    """Interface for language-specific tokenizers.

    NOTE(review): ``@abstractmethod`` has no enforcement effect here because
    this class does not derive from ``abc.ABC`` (or use ``ABCMeta``), so
    ``Tokenizer`` remains directly instantiable and ``tokenize`` simply
    returns ``NotImplemented`` — confirm whether enforcement is intended.
    """

    @abstractmethod
    def tokenize(self, code, keep_format_specifiers=False, keep_names=True,
                 keep_literals=False):
        """Tokenize *code*; concrete subclasses must override this."""
        return NotImplemented
# Intended for tokenizing a single line of code at a time.
class C_Tokenizer(Tokenizer):
    """Regex-based tokenizer for C/C++ source code.

    ``tokenize`` returns two aligned lists ``(tokens, types)`` where
    ``types[i]`` is the coarse category ('type', 'call', 'keyword', 'name',
    'number', 'string', 'op', ...) of ``tokens[i]``.
    """

    # C keywords, extended with the C++ keyword set (duplicates are
    # harmless because everything goes into a set).
    _keywords = set(['auto', 'break', 'case', 'const', 'continue', 'default',
                     'do', 'else', 'enum', 'extern', 'for', 'goto', 'if',
                     'register', 'return', 'sizeof', 'static', 'switch',
                     'typedef', 'void', 'volatile', 'while', 'EOF', 'NULL', 'endl',
                     'null', 'struct', 'union'] +
                    [
        'alignas', 'alignof', 'and', 'and_eq', 'asm', 'atomic_cancel',
        'atomic_commit', 'atomic_noexcept', 'auto', 'bitand', 'bitor',
        'break', 'case', 'catch',
        'class', 'co_await', 'co_return', 'co_yield', 'compl', 'concept', 'const',
        'const_cast', 'consteval', 'constexpr', 'continue', 'decltype', 'default',
        'delete', 'do', 'dynamic_cast', 'else', 'enum', 'explicit',
        'export', 'extern', 'false', 'for', 'friend', 'goto', 'if',
        'import', 'inline', 'module', 'mutable', 'namespace', 'new',
        'noexcept', 'not', 'not_eq', 'nullptr', 'operator', 'or', 'or_eq',
        'private', 'protected', 'public', 'reflexpr', 'register', 'reinterpret_cast',
        'requires', 'return', 'sizeof', 'static', 'static_assert',
        'static_cast', 'struct', 'switch', 'synchronized', 'template', 'this',
        'thread_local', 'throw', 'true', 'try', 'typedef', 'typeid', 'typename',
        'union', 'using', 'virtual', 'void', 'volatile',
        'while', 'xor', 'xor_eq',
    ])
    # Common standard headers, with and without angle brackets.
    _includes = set(['stdio.h', 'stdlib.h', 'string.h', 'math.h', 'malloc.h',
                     'stdbool.h', 'cstdio', 'cstdio.h', 'iostream', 'conio.h'])
    _includes.update(["<" + inc + ">" for inc in _includes] + ["<string>", "<bits/stdc++.h>"])
    # Identifiers that are classified as calls regardless of context.
    _calls = set(['printf', 'scanf', 'cin', 'cout', 'clrscr', 'getch', 'strlen',
                  'gets', 'fgets', 'getchar', 'main', 'malloc', 'calloc', 'free', 'sort'])
    # Built-in / common type names.
    _types = set(['char', 'double', 'float', 'int', 'long', 'short', 'unsigned'] +
                 ['signed', 'char16_t', 'char32_t', 'char8_t', 'wchar_t', 'string', 'bool'])
    # Recognized operator/punctuation lexemes ('|' and '|'-containing ops
    # are appended separately because '|' is the split delimiter).
    _ops = set('(|)|[|]|{|}|->|<<|>>|**|&&|--|++|-=|+=|*=|&=|%=|/=|==|<=|>=|!=|-|<|>|~|!|%|^|&|*|/|+|=|?|.|,|:|;|#'.split('|') + ['||', '|=', '|'])

    def _tokenize_code(self, code):
        """Yield a ``Token(type_, value_)`` for every lexeme in *code*.

        The order of the alternatives matters: comments and string/char
        literals are tried before operators and names so their contents
        are never re-tokenized.  Unrecognized characters come out as
        'MISMATCH' tokens rather than raising.
        """
        token_specification = [
            ('comment',
             r'\/\*(?:[^*]|\*(?!\/))*\*\/|\/\*([^*]|\*(?!\/))*\*?|\/\/[^\n]*'),
            ('directive', r'#\w+'),
            # Both string and char have "unterminated" fallbacks so a
            # missing closing quote still produces a token.
            ('string', r'"(?:[^"\n]|\\")*"?'),
            ('char', r"'(?:\\?[^'\n]|\\')'"),
            ('char_continue', r"'[^']*"),
            ('number', r'[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?'),
            ('include', r'(?<=\#include) *<([_A-Za-z]\w*(?:\.h))?>'),
            ('op',
             r'\(|\)|\[|\]|{|}|->|<<|>>|\*\*|\|\||&&|--|\+\+|[-+*|&%\/=<>!]=|[-<>~!%^&*\/+=?|.,:;#]'),
            ('name', r'[_A-Za-z]\w*'),
            ('whitespace', r'\s+'),
            ('nl', r'\\\n?'),
            ('MISMATCH', r'.'),  # any other single character
        ]
        tok_regex = '|'.join('(?P<%s>%s)' % pair
                             for pair in token_specification)
        for mo in re.finditer(tok_regex, code):
            kind = mo.lastgroup
            yield Token(kind, mo.group(kind))

    def tokenize(self, code, keep_format_specifiers=False, keep_names=True,
                 keep_literals=False):
        """Tokenize *code* and return the aligned lists ``(tokens, types)``.

        Whitespace tokens are dropped.  The ``keep_*`` flags are accepted
        for interface compatibility with ``Tokenizer`` but are currently
        unused by this implementation.
        """
        ret_toks = []
        ret_types = []
        for token in self._tokenize_code(code):
            type_ = token.type_
            value = token.value_
            if type_ == 'whitespace':
                continue
            # Reclassify bare identifiers that match a known vocabulary.
            if value in self._types:
                type_ = "type"
            elif value in self._calls:
                type_ = "call"
            elif value in self._keywords:
                type_ = "keyword"
            # A plain name right after '.' is a member call, e.g. s.push_back()
            if len(ret_toks) > 1 and ret_toks[-1] == '.' and type_ == 'name':
                type_ = "call"
            ret_toks.append(value)
            ret_types.append(type_)
        return ret_toks, ret_types