In [1]:
import re
from io import BytesIO
from pathlib import Path
from tokenize import tokenize, COMMENT

In [2]:
def find_python_sources(root):
    """Return every ``*.py`` file below ``root``, sorted, skipping Jupyter checkpoints.

    Parameters
    ----------
    root : Path
        Directory that is searched recursively.

    Returns
    -------
    list[Path]
        Sorted paths. A file is left out when an ``.ipynb_checkpoints``
        directory appears anywhere between ``root`` and the file, so nested
        checkpoint folders are excluded as well.
    """
    return sorted(
        path
        for path in root.glob("**/*.py")
        # Inspect only the part below ``root`` so that the location of
        # ``root`` itself can never exclude a file.
        if ".ipynb_checkpoints" not in path.relative_to(root).parts
    )


CODE_SAMPLE = Path(".").resolve().parent/"code_sample"/"python"
# glob() yields files in filesystem order; sorting keeps ``code_samples[0]``
# pointing at the same file on every run.
code_samples = find_python_sources(CODE_SAMPLE)
code_samples

[PosixPath('/home/pyuser/workspace/code_sample/python/module1.py'),
 PosixPath('/home/pyuser/workspace/code_sample/python/module2.py'),
 PosixPath('/home/pyuser/workspace/code_sample/python/submodule/submodule1.py')]

## Test with sample code

In [3]:
# Small hand-written module used as tokenizer input: two imports, one
# comment and one function.
code = """
import foobar
from foo import bar

# this comentary

def func():
    return None

"""

# tokenize() is driven by a readline callable that returns bytes, so the
# sample text is encoded and served from an in-memory buffer.
sample_buffer = BytesIO(code.encode('utf-8'))
for token in tokenize(sample_buffer.readline):
    print(token)

TokenInfo(type=63 (ENCODING), string='utf-8', start=(0, 0), end=(0, 0), line='')
TokenInfo(type=62 (NL), string='\n', start=(1, 0), end=(1, 1), line='\n')
TokenInfo(type=1 (NAME), string='import', start=(2, 0), end=(2, 6), line='import foobar\n')
TokenInfo(type=1 (NAME), string='foobar', start=(2, 7), end=(2, 13), line='import foobar\n')
TokenInfo(type=4 (NEWLINE), string='\n', start=(2, 13), end=(2, 14), line='import foobar\n')
TokenInfo(type=1 (NAME), string='from', start=(3, 0), end=(3, 4), line='from foo import bar\n')
TokenInfo(type=1 (NAME), string='foo', start=(3, 5), end=(3, 8), line='from foo import bar\n')
TokenInfo(type=1 (NAME), string='import', start=(3, 9), end=(3, 15), line='from foo import bar\n')
TokenInfo(type=1 (NAME), string='bar', start=(3, 16), end=(3, 19), line='from foo import bar\n')
TokenInfo(type=4 (NEWLINE), string='\n', start=(3, 19), end=(3, 20), line='from foo import bar\n')
TokenInfo(type=62 (NL), string='\n', start=(4, 0), end=(4, 1), line='\n')
TokenIn

Anatomy of a token, using the first one above as an example:

`TokenInfo(type=63 (ENCODING), string='utf-8', start=(0, 0), end=(0, 0), line='')`

- `type=63 (ENCODING)`: the token type
- `string='utf-8'`: the token string
- `start=(0, 0)`: `(srow, scol)`, the row and column where the token starts
- `end=(0, 0)`: `(erow, ecol)`, the row and column where the token ends
- `line=''`: the text of the line on which the token was found

In [4]:
# Tokenize the sample again and show only the COMMENT tokens.
comment_buffer = BytesIO(code.encode('utf-8'))
for token in tokenize(comment_buffer.readline):
    if token.type != COMMENT:
        continue
    print(token)

TokenInfo(type=61 (COMMENT), string='# this comentary', start=(5, 0), end=(5, 16), line='# this comentary\n')


## Test with real code

In [5]:
# Alternation of the comment tags to look for: BUG|FIXME|NOTE|TODO
tags = '|'.join(('BUG', 'FIXME', 'NOTE', 'TODO'))
# Matches a comment that opens with "# <TAG>:"; group 1 captures the tag name.
regex = re.compile('(?:(?:# )(' + tags + ')(?::))')

In [6]:
# List every comment of the first sample file together with its line number.
with code_samples[0].open(mode='rb') as source_file:
    for token in tokenize(source_file.readline):
        if token.type != COMMENT:
            continue
        # The tag filter (regex.match(token.string)) is deliberately left
        # out here, so untagged comments are printed as well.
        print(token.start[0], "\t", token.string)

5 	 # NOTE: 'note' out of functions, module1, line 5, no closing brackets
7 	 # NOTE: 'note' multirow, out of functions, module1, line 7, no closing brackets
8 	 # 'note' multirow, out of functions, module1, line 8, no closing brackets
10 	 # TODO: 'todo' out of functions, module1, line 10, with closing brackets <>
14 	 # BUG: 'bug' in class, module1, line 14, brackets date 2022-01-01, multirow
15 	 # 'bug' in class, module1, line 14, brackets date 2022-01-01, multirow <d:2022-01-01>
26 	 # TODO: 'todo' in method, module1, line 26, no closing brackets
35 	 # normal coment, do not include
36 	 # FIXME: 'fixme' in function, module1, line 36, no brackets
47 	 # TODO: 'todo' in function, module1, line 47, brackets p2, multiline
48 	 # 'todo' in function, module1, line 47, brackets p2, multiline <p:2>
50 	 # normal coment, do not include
60 	 # FIXME: 'fixme' in function 'main', module1, line 60, with brackets p1 <p:1>
61 	 # normal coment, do not include
