In [173]:
import javalang
from javalang.ast import Node
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel, DataCollatorWithPadding
from anytree import AnyNode
from tqdm import tqdm


# Use javalang to build ASTs, then traverse them depth-first to generate the AST-node corpus.
def get_token(node):
    """Return the textual label used for one AST element.

    * a ``str`` is returned unchanged,
    * a ``set`` is labelled ``'Modifier'``,
    * a ``Node`` instance is labelled with its class name,
    * anything else is labelled with the string ``'None'``.
    """
    if isinstance(node, str):
        return node
    if isinstance(node, set):
        return 'Modifier'
    if isinstance(node, Node):
        return node.__class__.__name__
    # Fallback label is the literal string 'None', not the None object.
    return 'None'


def get_child(root):
    """Return the flattened, truthy children of ``root`` as a list.

    ``Node`` instances contribute ``root.children``, sets contribute their
    elements, and every other input has no children. Nested lists are
    flattened in order; non-list items that are falsy (``None``, ``''``,
    empty sets, ...) are dropped.
    """
    if isinstance(root, Node):
        pending = root.children
    elif isinstance(root, set):
        pending = list(root)
    else:
        pending = []

    flattened = []
    # Explicit stack of iterators: descend into a nested list as soon as it is
    # met, then resume the enclosing iterator, so the original order is kept.
    stack = [iter(pending)]
    while stack:
        for item in stack[-1]:
            if isinstance(item, list):
                stack.append(iter(item))
                break
            if item:
                flattened.append(item)
        else:
            stack.pop()
    return flattened


def get_sequence(node, sequence):
    """Depth-first walk of ``node`` that appends string tokens to ``sequence``.

    Args:
        node: AST element to walk (a node object, a set, or a string).
        sequence: list that is extended in place.

    Behaviour:
        * Leaves (elements for which ``get_child`` returns nothing) are
          traced to stdout: the token, then ``node.position`` when the
          element has that attribute.
        * For ``VariableDeclarator`` / ``MemberReference`` nodes the first
          entry of ``node.children`` is appended when it is a string.
        * Every string element is appended.
        * Children are then visited recursively in order.
    """
    token, children = get_token(node), get_child(node)
    if not children:
        # Best-effort debug trace. Strings and sets have no ``position``
        # attribute, and printing may fail on unusual text; neither should
        # stop the walk. ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit working.
        try:
            print(token)
            print(node.position)
        except Exception:
            pass
    # A string leaf's token is the string itself, so source text that happens
    # to read 'VariableDeclarator' or 'MemberReference' must not be treated
    # as a node: strings have no ``children`` attribute.
    if not isinstance(node, str) and token in ('VariableDeclarator', 'MemberReference'):
        if node.children:
            first = node.children[0]
            # Only string entries belong in the token sequence; a None or
            # list here would break a later ' '.join(sequence).
            if isinstance(first, str):
                sequence.append(first)
    if isinstance(node, str):
        sequence.append(token)
    for child in children:
        get_sequence(child, sequence)


def parse_program(func):
    """Parse Java source text ``func`` as a member declaration.

    The text is tokenized with ``javalang.tokenizer.tokenize`` and the token
    stream is handed to ``javalang.parser.Parser``; the result of
    ``parse_member_declaration()`` is returned.
    """
    token_stream = javalang.tokenizer.tokenize(func)
    return javalang.parser.Parser(token_stream).parse_member_declaration()


In [174]:
# Java method source, already split into space-separated tokens, used as input for parse_program.
code = '''
private void updateRatingChoice ( ) { int current = m_chRating . getSelectedIndex ( ) ; m_chRating . removeAllItems ( ) ; FactionRecord fRec = ( FactionRecord ) m_chSubfaction . getSelectedItem ( ) ; if ( fRec == null ) { fRec = ( FactionRecord ) m_chFaction . getSelectedItem ( ) ; } ArrayList < String > ratingLevels = fRec . getRatingLevels ( ) ; if ( ratingLevels . isEmpty ( ) ) { ratingLevels = fRec . getRatingLevelSystem ( ) ; } if ( ratingLevels . size ( ) > _NUM ) { for ( int i = ratingLevels . size ( ) - _NUM ; i >= _NUM ; i -- ) { m_chRating . addItem ( ratingLevels . get ( i ) ) ; } } if ( current < _NUM && m_chRating . getItemCount ( ) > _NUM ) { m_chRating . setSelectedIndex ( _NUM ) ; } else { m_chRating . setSelectedIndex ( Math . min ( current , m_chRating . getItemCount ( ) - _NUM ) ) ; } }
'''

In [175]:
# Output list that get_sequence appends tokens to when it is called on the parsed method.
a = []

In [177]:
# Parse the Java snippet and walk its AST, filling `a`; get_sequence also prints a trace line per leaf.
get_sequence(parse_program(code),a)

private
updateRatingChoice
int
current
m_chRating
getSelectedIndex
m_chRating
removeAllItems
FactionRecord
fRec
FactionRecord
m_chSubfaction
getSelectedItem
==
fRec
null
fRec
FactionRecord
m_chFaction
getSelectedItem
=
ArrayList
String
ratingLevels
fRec
getRatingLevels
ratingLevels
isEmpty
ratingLevels
fRec
getRatingLevelSystem
=
>
ratingLevels
size
_NUM
int
i
-
ratingLevels
size
_NUM
>=
i
_NUM
--
i
m_chRating
ratingLevels
i
get
addItem
&&
<
current
_NUM
>
m_chRating
getItemCount
_NUM
m_chRating
_NUM
setSelectedIndex
m_chRating
Math
current
-
m_chRating
getItemCount
_NUM
min
setSelectedIndex


In [36]:
# Show the original source next to the extracted token sequence.
# NOTE(review): ' '.join requires every element of `a` to be a str — confirm
# get_sequence never appends anything else.
print(code)
print(' '.join(a))


private void updateRatingChoice ( ) { int current = m_chRating . getSelectedIndex ( ) ; m_chRating . removeAllItems ( ) ; FactionRecord fRec = ( FactionRecord ) m_chSubfaction . getSelectedItem ( ) ; if ( fRec == null ) { fRec = ( FactionRecord ) m_chFaction . getSelectedItem ( ) ; } ArrayList < String > ratingLevels = fRec . getRatingLevels ( ) ; if ( ratingLevels . isEmpty ( ) ) { ratingLevels = fRec . getRatingLevelSystem ( ) ; } if ( ratingLevels . size ( ) > _NUM ) { for ( int i = ratingLevels . size ( ) - _NUM ; i >= _NUM ; i -- ) { m_chRating . addItem ( ratingLevels . get ( i ) ) ; } } if ( current < _NUM && m_chRating . getItemCount ( ) > _NUM ) { m_chRating . setSelectedIndex ( _NUM ) ; } else { m_chRating . setSelectedIndex ( Math . min ( current , m_chRating . getItemCount ( ) - _NUM ) ) ; } }

private updateRatingChoice int current m_chRating getSelectedIndex m_chRating removeAllItems FactionRecord fRec FactionRecord m_chSubfaction getSelectedItem == fRec null fRec Factio

In [2]:
def get_token(node):
    """Map an AST element to its label.

    Strings map to themselves, sets to ``'Modifier'``, ``Node`` instances to
    their class name, and everything else to the string ``'None'``.
    """
    # The checks run in the same order as an if/elif chain: str, set, Node.
    return (
        node if isinstance(node, str)
        else 'Modifier' if isinstance(node, set)
        else node.__class__.__name__ if isinstance(node, Node)
        else 'None'
    )


In [1]:
def get_token(node):
    """Return the label for one AST element.

    Args:
        node: a string, a set, a ``Node`` instance, or any other object.

    Returns:
        str: the string itself for ``str`` input, ``'Modifier'`` for a set,
        the class name for a ``Node`` instance, and the literal string
        ``'None'`` for anything else.
    """
    if isinstance(node, str):
        return node
    if isinstance(node, set):
        return 'Modifier'
    if isinstance(node, Node):
        return node.__class__.__name__
    return 'None'

SyntaxError: invalid character in identifier (<ipython-input-1-df3c08dde5b5>, line 1)

In [131]:
def python2tree(line):
    """Build an ``asttokens.ASTTokens`` for ``line`` with parsing enabled.

    Returns:
        tuple: ``(atok, tree)`` — the ASTTokens object and its ``tree``
        attribute.
    """
    tokenized = asttokens.ASTTokens(line, parse=True)
    tree_root = tokenized.tree
    return tokenized, tree_root

def traverse_python_tree(atok, root):
    """Flatten the AST under ``root`` into a dict of numbered node records.

    Every node yielded by ``asttokens.util.walk(root)`` gets a key made of
    ``Constants.NODE_FIX`` followed by a 1-based running index. Each key maps
    to ``{"node": <type name>, "children": [...], "parent": <key or None>}``.
    For a node that has children, ``children`` holds computed keys; for a
    leaf it holds the node's source text obtained from ``atok.get_text``.

    Args:
        atok: object providing ``get_text(node)`` — presumably the ASTTokens
            returned by ``python2tree``; confirm against the caller.
        root: AST node to start walking from.

    Returns:
        dict: the ``node_json`` mapping described above.

    NOTE(review): child keys are derived from a running child count, which
    only matches the walk indices if ``asttokens.util.walk`` yields nodes
    level by level — confirm the traversal order.
    NOTE(review): ``Constants`` is not defined in any visible cell.
    """
    iter_children = asttokens.util.iter_children_func(root)
    node_json = {}
    # For each node that has children: walk index -> running total of
    # (1 + number of children counted so far), used below as the index of
    # that node's last child.
    current_global = {}
    current_idx, global_idx = 1, 1
    # Pass 1: count children per node and record the running totals.
    for node in asttokens.util.walk(root):
        if not next(iter_children(node), None) is None:
            child_num = 0
            for child in iter_children(node):
                child_num += 1
            global_idx = global_idx + child_num
            current_global[current_idx] = global_idx
        current_idx += 1
    # Pass 2: walk again with the same numbering and build one record per node.
    current_idx = 1
    for node in asttokens.util.walk(root):
        node_json["%s%s"%(Constants.NODE_FIX, current_idx)] = {"node": type(node).__name__, "children": [], "parent": None}
        if not next(iter_children(node), None) is None:
            child_idx = 0
            for child in iter_children(node):
                child_idx += 1
                # Keys are generated from the last child index downwards and
                # inserted at the front, so the list ends up in ascending order.
                node_json["%s%s"%(Constants.NODE_FIX, current_idx)]['children'].insert(0, "%s%s"%(Constants.NODE_FIX, current_global[current_idx]-child_idx+1))
        else: # leaf node: store its source text instead of child keys
            node_json["%s%s"%(Constants.NODE_FIX, current_idx)]['children'].append(atok.get_text(node))

        current_idx += 1

    # Pass 3: fill in "parent" by following every child entry that looks like a key.
    # NOTE(review): a leaf whose source text starts with Constants.NODE_FIX would
    # also be treated as a key here — confirm this cannot occur.
    for k, node in node_json.items():
        children = [c for c in node['children'] if c.startswith(Constants.NODE_FIX)]
        if len(children):
            for c in children:
                node_json[c]['parent'] = k

    return node_json

In [183]:
# Python sample source used by the asttokens cells.
code = '''
def get_cpu_list(ip, user, passwd):
    cmd = 'statcpu -iter 1 -t'
    showcpu_list = run_ssh_thread(ip, user, passwd, cmd)
    cpu_list = []
    line_num = 0
    for line in showcpu_list:
        line_num += 1
        if (line_num >= 3):
            cpu_stats = line.split()
            if (len(cpu_stats) > 2):
                cpu_list.append(cpu_stats[0].split(',')[0])
    return cpu_list
'''

In [184]:
# Tokenize and parse the sample; `atok` is used afterwards to look up the source text of AST nodes.
# NOTE(review): `asttokens` is not imported in any visible cell — confirm an import exists elsewhere.
atok = asttokens.ASTTokens(code, parse=True)

In [185]:
# Dump the parsed tree for inspection.
# NOTE(review): `ast` is not imported in any visible cell — confirm an import exists elsewhere.
ast.dump(atok.tree)

"Module(body=[FunctionDef(name='get_cpu_list', args=arguments(args=[arg(arg='ip', annotation=None), arg(arg='user', annotation=None), arg(arg='passwd', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=[Assign(targets=[Name(id='cmd', ctx=Store())], value=Str(s='statcpu -iter 1 -t')), Assign(targets=[Name(id='showcpu_list', ctx=Store())], value=Call(func=Name(id='run_ssh_thread', ctx=Load()), args=[Name(id='ip', ctx=Load()), Name(id='user', ctx=Load()), Name(id='passwd', ctx=Load()), Name(id='cmd', ctx=Load())], keywords=[])), Assign(targets=[Name(id='cpu_list', ctx=Store())], value=List(elts=[], ctx=Load())), Assign(targets=[Name(id='line_num', ctx=Store())], value=Num(n=0)), For(target=Name(id='line', ctx=Store()), iter=Name(id='showcpu_list', ctx=Load()), body=[AugAssign(target=Name(id='line_num', ctx=Store()), op=Add(), value=Num(n=1)), If(test=Compare(left=Name(id='line_num', ctx=Load()), ops=[GtE()], comparators=[Num(n=3)]), body=[Assign

In [240]:
def visit(node,tokens):
  """Pre-order walk that prints every node and collects the leaves.

  A child is kept when its class name is not 'Load' and the module-level
  ``atok`` reports non-empty source text for it. Nodes with kept children are
  printed by class name only; nodes without any are printed together with
  their text and text range, and appended to ``tokens``.
  """
  kept = []
  for candidate in ast.iter_child_nodes(node):
    if candidate.__class__.__name__ == 'Load':
      continue
    if atok.get_text(candidate) == '':
      continue
    kept.append(candidate)

  label = node.__class__.__name__
  if kept:
    print(label)
  else:
    # Leaf: show what source text it covers and remember the node itself.
    print(label,' = ',atok.get_text(node),atok.get_text_range(node))
    tokens.append(node)

  for candidate in kept:
    visit(candidate,tokens)

In [241]:
# Start the traversal from the tree held by the ASTTokens object.
root = atok.tree

In [242]:
# Output list that visit() appends leaf nodes to.
tokens = []

In [243]:
# Walk the tree: prints each node and fills `tokens` with the leaves.
visit(root,tokens)

Module
FunctionDef
arguments
arg  =  ip (18, 20)
arg  =  user (22, 26)
arg  =  passwd (28, 34)
Assign
Name  =  cmd (41, 44)
Str  =  'statcpu -iter 1 -t' (47, 67)
Assign
Name  =  showcpu_list (72, 84)
Call
Name  =  run_ssh_thread (87, 101)
Name  =  ip (102, 104)
Name  =  user (106, 110)
Name  =  passwd (112, 118)
Name  =  cmd (120, 123)
Assign
Name  =  cpu_list (129, 137)
List  =  [] (140, 142)
Assign
Name  =  line_num (147, 155)
Num  =  0 (158, 159)
For
Name  =  line (168, 172)
Name  =  showcpu_list (176, 188)
AugAssign
Name  =  line_num (198, 206)
Num  =  1 (210, 211)
If
Compare
Name  =  line_num (224, 232)
Num  =  3 (236, 237)
Assign
Name  =  cpu_stats (252, 261)
Call
Attribute
Name  =  line (264, 268)
If
Compare
Call
Name  =  len (293, 296)
Name  =  cpu_stats (297, 306)
Num  =  2 (310, 311)
Expr
Call
Attribute
Name  =  cpu_list (330, 338)
Subscript
Call
Attribute
Subscript
Name  =  cpu_stats (346, 355)
Index
Num  =  0 (356, 357)
Str  =  ',' (365, 368)
Index
Num  =  0 (370, 371)
Retu

In [244]:
# Slice the source at a reported leaf range (385, 393) — presumably a check that the offsets index directly into `code`.
code[385:393]

'cpu_list'

In [245]:
# Number of leaves collected in `tokens`.
len(tokens)

32

In [248]:
# Map each collected leaf's "start-end" text range to its source text.
{f'{atok.get_text_range(i)[0]}-{atok.get_text_range(i)[1]}':atok.get_text(i) for i in tokens}

{'18-20': 'ip',
 '22-26': 'user',
 '28-34': 'passwd',
 '41-44': 'cmd',
 '47-67': "'statcpu -iter 1 -t'",
 '72-84': 'showcpu_list',
 '87-101': 'run_ssh_thread',
 '102-104': 'ip',
 '106-110': 'user',
 '112-118': 'passwd',
 '120-123': 'cmd',
 '129-137': 'cpu_list',
 '140-142': '[]',
 '147-155': 'line_num',
 '158-159': '0',
 '168-172': 'line',
 '176-188': 'showcpu_list',
 '198-206': 'line_num',
 '210-211': '1',
 '224-232': 'line_num',
 '236-237': '3',
 '252-261': 'cpu_stats',
 '264-268': 'line',
 '293-296': 'len',
 '297-306': 'cpu_stats',
 '310-311': '2',
 '330-338': 'cpu_list',
 '346-355': 'cpu_stats',
 '356-357': '0',
 '365-368': "','",
 '370-371': '0',
 '385-393': 'cpu_list'}

In [226]:
# Tokens asttokens reports for the tree rooted at `root`.
# NOTE(review): `code_tokens` is not used by any later visible cell.
code_tokens = atok.get_tokens(root)

In [232]:
# Map "startpos-endpos" to token text over the raw token stream, skipping tokens whose text is empty after stripping whitespace.
{f'{i.startpos}-{i.endpos}':i.string for i in atok.get_tokens(root) if i.string.strip().strip('\n')}

{'1-4': 'def',
 '5-17': 'get_cpu_list',
 '17-18': '(',
 '18-20': 'ip',
 '20-21': ',',
 '22-26': 'user',
 '26-27': ',',
 '28-34': 'passwd',
 '34-35': ')',
 '35-36': ':',
 '41-44': 'cmd',
 '45-46': '=',
 '47-67': "'statcpu -iter 1 -t'",
 '72-84': 'showcpu_list',
 '85-86': '=',
 '87-101': 'run_ssh_thread',
 '101-102': '(',
 '102-104': 'ip',
 '104-105': ',',
 '106-110': 'user',
 '110-111': ',',
 '112-118': 'passwd',
 '118-119': ',',
 '120-123': 'cmd',
 '123-124': ')',
 '129-137': 'cpu_list',
 '138-139': '=',
 '140-141': '[',
 '141-142': ']',
 '147-155': 'line_num',
 '156-157': '=',
 '158-159': '0',
 '164-167': 'for',
 '168-172': 'line',
 '173-175': 'in',
 '176-188': 'showcpu_list',
 '188-189': ':',
 '198-206': 'line_num',
 '207-209': '+=',
 '210-211': '1',
 '220-222': 'if',
 '223-224': '(',
 '224-232': 'line_num',
 '233-235': '>=',
 '236-237': '3',
 '237-238': ')',
 '238-239': ':',
 '252-261': 'cpu_stats',
 '262-263': '=',
 '264-268': 'line',
 '268-269': '.',
 '269-274': 'split',
 '274-275

In [136]:
# Inspect the collected leaves.
# NOTE(review): the recorded output shows plain strings, whereas visit() as defined in this notebook
# appends AST nodes — the output presumably comes from an earlier version; confirm.
tokens

['ip',
 'user',
 'passwd',
 'cmd',
 "'statcpu -iter 1 -t'",
 'showcpu_list',
 'run_ssh_thread',
 'ip',
 'user',
 'passwd',
 'cmd',
 'cpu_list',
 '[]',
 'line_num',
 '0',
 'line',
 'showcpu_list',
 'line_num',
 '1',
 'line_num',
 '3',
 'cpu_stats',
 'line',
 'len',
 'cpu_stats',
 '2',
 'cpu_list',
 'cpu_stats',
 '0',
 "','",
 '0',
 'cpu_list']

In [137]:
# Join the leaves into one space-separated string.
# NOTE(review): this needs `tokens` to hold str values; visit() as defined in this notebook appends
# AST nodes, so confirm which version this cell expects.
' '.join(tokens)

"ip user passwd cmd 'statcpu -iter 1 -t' showcpu_list run_ssh_thread ip user passwd cmd cpu_list [] line_num 0 line showcpu_list line_num 1 line_num 3 cpu_stats line len cpu_stats 2 cpu_list cpu_stats 0 ',' 0 cpu_list"

In [None]:
# Python sample source (same text as the earlier sample cell); re-running this rebinds `code`.
code = '''
def get_cpu_list(ip, user, passwd):
    cmd = 'statcpu -iter 1 -t'
    showcpu_list = run_ssh_thread(ip, user, passwd, cmd)
    cpu_list = []
    line_num = 0
    for line in showcpu_list:
        line_num += 1
        if (line_num >= 3):
            cpu_stats = line.split()
            if (len(cpu_stats) > 2):
                cpu_list.append(cpu_stats[0].split(',')[0])
    return cpu_list
'''

In [None]:
# Use javalang to build ASTs, then traverse them depth-first to generate the AST-node corpus.
def get_token(node):
    """Label an AST element for the node corpus.

    Returns the string itself for ``str`` input, ``'Modifier'`` for a set,
    the class name for a ``Node`` instance, and the string ``'None'`` for
    every other kind of value.
    """
    if isinstance(node, str):
        return node
    if isinstance(node, set):
        return 'Modifier'
    # Only genuine Node instances are labelled by class name.
    return node.__class__.__name__ if isinstance(node, Node) else 'None'


def get_child(root):
    """Collect the children of ``root`` as a flat list.

    ``root.children`` is used for ``Node`` instances and the set's elements
    for a set; any other input yields an empty list. Nested lists are
    flattened depth-first, and non-list entries that are falsy are skipped.
    """
    if isinstance(root, Node):
        raw = root.children
    elif isinstance(root, set):
        raw = list(root)
    else:
        raw = []

    collected = []

    def _collect(items):
        # Append truthy non-list entries in encounter order; recurse into lists.
        for entry in items:
            if isinstance(entry, list):
                _collect(entry)
            elif entry:
                collected.append(entry)

    _collect(raw)
    return collected


def get_sequence(node, sequence):
    """Append the token of ``node`` and of all its descendants to ``sequence``.

    Tokens are appended in depth-first pre-order: a node's own token comes
    first, followed by the tokens of each child subtree in child order.
    """
    # Explicit stack instead of recursion; children are pushed reversed so the
    # first child is processed first.
    stack = [node]
    while stack:
        current = stack.pop()
        label, offspring = get_token(current), get_child(current)
        sequence.append(label)
        stack.extend(reversed(offspring))


def parse_program(func):
    """Tokenize the Java source in ``func`` and parse it as a member declaration.

    Returns whatever ``javalang.parser.Parser.parse_member_declaration``
    produces for the token stream.
    """
    java_parser = javalang.parser.Parser(javalang.tokenizer.tokenize(func))
    return java_parser.parse_member_declaration()
