In [None]:
!pip install polars




In [None]:
import polars as pl

splits = {
    'train': 'hf://datasets/blastwind/deprecated-github-code-haskell-function/data/train-*-of-*.parquet',
    'test': 'hf://datasets/blastwind/deprecated-github-code-haskell-function/data/test-*-of-*.parquet',
    'valid': 'hf://datasets/blastwind/deprecated-github-code-haskell-function/data/valid-00000-of-00001-636cb804972d8982.parquet'
}


df = pl.read_parquet(splits['train'])



In [3]:
import pickle

class CustomTokenizer:
    def __init__(self, oov_token="<UNK>"):
        """
        Initializes the tokenizer with empty mappings and an OOV (out-of-vocabulary) token.
        """
        self.word_to_id = {}
        self.id_to_word = {}
        self.oov_token = oov_token

        self.word_to_id[oov_token] = 0
        self.word_to_id["<start>"] = 1;
        self.word_to_id["<end>"] = 2;
        self.id_to_word[0] = oov_token
        self.id_to_word[1] = "<start>"
        self.id_to_word[2] = "<end>"

    def load(self, word_list):
        """
        Dynamically loads a list of words into the tokenizer and assigns free IDs.

        Args:
            word_list (list of str): The list of words to add to the tokenizer.
        """
        # Ensure all words are lowercase
        word_list = [word.lower() for word in word_list]

        # Start assigning IDs from the current maximum ID + 1
        next_id = max(self.word_to_id.values()) + 1

        for word in word_list:
            if word not in self.word_to_id:
                self.word_to_id[word] = next_id
                self.id_to_word[next_id] = word
                next_id += 1

    def encode(self, text):
        """
        Encodes a string into a list of IDs.

        Args:
            text (str): The input text to encode.

        Returns:
            list of int: List of word IDs.
        """
        words = text.lower().split()  # Convert text to lowercase and split into words
        return [self.word_to_id.get(word, self.word_to_id[self.oov_token]) for word in words]

    def decode(self, ids):
        """
        Decodes a list of IDs into a string.

        Args:
            ids (list of int): List of word IDs to decode.

        Returns:
            str: The decoded string.
        """
        return ' '.join(self.id_to_word.get(i, self.oov_token) for i in ids)

    def export(self, filename):
        """
        Exports the word_to_id and id_to_word mappings to a binary file.

        Args:
            filename (str): The path of the file to save the mappings.
        """
        with open(filename, 'wb') as f:
            pickle.dump((self.word_to_id, self.id_to_word), f)

    def import_mappings(self, filename):
        """
        Loads word_to_id and id_to_word mappings from a binary file.

        Args:
            filename (str): The path of the file to load the mappings from.
        """
        with open(filename, 'rb') as f:
            self.word_to_id, self.id_to_word = pickle.load(f)


In [4]:
import pandas as pd
dataframe = pd.read_csv('data-small.csv')
tokenizer = CustomTokenizer()
dataframe = dataframe.head(10)
tokenizer = CustomTokenizer()
for comment in dataframe["comments"]:
  tokenizer.load(comment.lower().split())

In [37]:
import pandas as pd

dataframe = pd.read_json("hf://datasets/Bry14/PHagenlocher-HaskellYTv0.1/haskell_code_desc.json")
dataframe.rename(columns={dataframe.columns[0]: "function_only_code", dataframe.columns[1]: "comments"}, inplace=True)
dataframe = dataframe.head(40)
tokenizer = CustomTokenizer()
for comment in dataframe["comments"]:
  tokenizer.load(comment.lower().split())


In [38]:
print(dataframe.head())

                          function_only_code  \
0           name arg1 arg2 ... argn = <expr>   
1                    name arg1 arg2 ... argn   
2  in_range min max x = x >= min && x <= max   
3                          sum = foldr (+) 0   
4                                 sum [] = 0   

                                            comments  
0  Define a function named 'name' with 'n' argume...  
1      Calls the function 'name' with 'n' arguments.  
2  Checks if the value of x is within the range s...  
3  Calculate the sum of a list of numbers using a...  
4  Define a function 'sum' that takes an empty li...  


In [5]:
!pip install tree_sitter==0.23.0
!pip install tree_sitter_haskell

Collecting tree_sitter==0.23.0
  Downloading tree_sitter-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading tree_sitter-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (558 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.7/558.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree_sitter
Successfully installed tree_sitter-0.23.0
Collecting tree_sitter_haskell
  Downloading tree_sitter_haskell-0.23.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading tree_sitter_haskell-0.23.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (424 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m424.6/424.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree_sitter_haskell
Successfully installed tree_sitter_haskell-0.23.1


In [39]:
import tree_sitter_haskell as tshaskell
import numpy as np
from tree_sitter import Language, Parser


HS_LANGUAGE = Language(tshaskell.language())
parser = Parser(HS_LANGUAGE)


In [40]:
def extract_tree_features(code):
    try:
        example_bytes = code.encode()
        tree = parser.parse(example_bytes)
        root_node = tree.root_node

        def extract_features(node, parent_index=None, nodes=[], edges=[]):
            node_type = node.type
            start_pos = node.start_point
            end_pos = node.end_point
            code_value = code[node.start_byte:node.end_byte]
            tokenizer.load(node_type.lower().split())
            feature_vector = [node_type]
            node_index = len(nodes)
            nodes.append(feature_vector)

            if parent_index is not None:
                edges.append((parent_index, node_index))

            for child in node.children:
                extract_features(child, parent_index=node_index, nodes=nodes, edges=edges)

            return nodes, edges

        nodes, edge_list = extract_features(root_node)
        code_value = code[root_node.start_byte:root_node.end_byte]
        tokenizer.load(code_value.lower().split())
        num_nodes = len(nodes)
        adj_matrix = np.zeros((num_nodes, num_nodes), dtype=np.float32)
        for parent, child in edge_list:
            adj_matrix[parent, child] = 1

        return nodes, adj_matrix
    except Exception as e:
        print(f"Error parsing code: {e}")
        return None, None
dataframe["nodes"], dataframe["adjacency_matrix"] = zip(*dataframe["function_only_code"].apply(extract_tree_features))


In [None]:
#dataframe.to_csv('data-ast-small.csv', index=False)
#print(dataframe["nodes"].size)
#tokenizer.export("tokens.bin")
import pandas as pd
dataframe = pd.read_csv('data-ast-small.csv')
tokenizer = CustomTokenizer()


FileNotFoundError: [Errno 2] No such file or directory: 'data-ast-small.csv'

[1]


In [None]:
!pip install tensorflow
!pip install keras



In [41]:
import tensorflow as tf
from keras.layers import Layer
from keras import activations
import keras.backend as K

class GCNLayer(Layer):
    def __init__(self, units, activation='relu', initializer='glorot_uniform', sparse=False, use_bias=True, **kwargs):
        self.activation = activations.get(activation)
        self.output_dim = units
        self.initializer = initializer
        self.sparse = sparse
        self.use_bias = use_bias

        super(GCNLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                          shape=(input_shape[0][-1], self.output_dim),
                                          initializer=self.initializer,
                                          trainable=True)
        if self.use_bias:
            self.bias = self.add_weight(name='bias',
                                              shape=(self.output_dim,),
                                              initializer='zeros',
                                              trainable=True)
        else:
            self.bias = None

        super(GCNLayer, self).build(input_shape)

    def call(self, x):
        assert isinstance(x, list)
        nodes, edges = x
        identity = tf.eye(tf.shape(edges)[-1], batch_shape=[tf.shape(edges)[0]], dtype=edges.dtype)
        edges = edges + identity
        output = tf.matmul(edges,nodes)
        output = tf.matmul(output, self.kernel)

        if self.use_bias:
            output += self.bias

        return self.activation(output)

    def compute_output_shape(self, input_shape):
        assert isinstance(input_shape, list)
        return (None,input_shape[0][1], self.output_dim)

    def get_config(self):
        config = {
            'units': self.output_dim,
            'activation': activations.serialize(self.activation),
        }

        base_config = super(GCNLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [42]:
import keras
import keras.utils
from keras.layers import Input, Dense, Embedding, Activation, concatenate, Flatten, GRU, TimeDistributed, dot
from keras.models import Model

class CodeGNNGRU:
    def __init__(self, config):
        config['modeltype'] = 'codegnngru'

        self.config = config
        self.tdatvocabsize = config['tdatvocabsize']
        self.comvocabsize = config['comvocabsize']
        self.smlvocabsize = config['smlvocabsize']
        self.tdatlen = config['tdatlen']
        self.comlen = config['comlen']
        self.smllen = config['maxastnodes']

        self.config['batch_maker'] = 'graph_multi_1'

        self.embdims = 100
        self.smldims = 256
        self.recdims = 256
        self.tdddims = 256

    def create_model(self):

        tdat_input = Input(shape=(self.tdatlen,))
        com_input = Input(shape=(self.comlen,))
        node_input = Input(shape=(self.smllen,))
        edge_input = Input(shape=(self.smllen, self.smllen))

        tdel = Embedding(output_dim=self.embdims, input_dim=self.tdatvocabsize, mask_zero=False)
        tde = tdel(tdat_input)

        se = tdel(node_input)

        tenc = GRU(self.recdims, return_state=True, return_sequences=True)
        tencout, tstate_h = tenc(tde)

        de = Embedding(output_dim=self.embdims, input_dim=self.comvocabsize, mask_zero=False)(com_input)
        dec = GRU(self.recdims, return_sequences=True)
        decout = dec(de, initial_state=tstate_h)

        tattn = dot([decout, tencout], axes=[2, 2])
        tattn = Activation('softmax')(tattn)
        tcontext = dot([tattn, tencout], axes=[2, 1])

        astwork = se

        for i in range(self.config['asthops']):
            astwork = GCNLayer(100)([astwork, edge_input])

        astwork = GRU(self.recdims, return_sequences=True)(astwork, initial_state=tstate_h)

        aattn = dot([decout, astwork], axes=[2, 2])
        aattn = Activation('softmax')(aattn)
        acontext = dot([aattn, astwork], axes=[2, 1])

        context = concatenate([tcontext, decout, acontext])

        out = TimeDistributed(Dense(self.tdddims, activation="relu"))(context)

        out = Flatten()(out)
        out1 = Dense(self.comvocabsize, activation="softmax")(out)

        model = Model(inputs=[tdat_input, com_input, node_input, edge_input], outputs=out1)

        model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])
        return self.config, model


In [43]:

import numpy as np
import tensorflow as tf
from keras.utils import to_categorical
import ast

config = {
    'tdatvocabsize': 1000,
    'comvocabsize': 1000,
    'smlvocabsize':1000,
    'tdatlen': 200,
    'comlen': 500,
    'maxastnodes': 70,
    'asthops': 3,
}
model_instance = CodeGNNGRU(config)
config, model = model_instance.create_model()

tdat_input = np.zeros((len(dataframe) * config['comlen'], config['tdatlen']), dtype=np.float32)
com_input = np.zeros((len(dataframe) * config['comlen'], config['comlen']), dtype=np.float32)
node_input = np.zeros((len(dataframe) * config['comlen'], config['maxastnodes']), dtype=np.float32)
edge_input = np.zeros((len(dataframe) * config['comlen'], config['maxastnodes'], config['maxastnodes']), dtype=np.float32)
target_output = np.zeros((len(dataframe) * config['comlen'], config['comvocabsize']), dtype=np.float32)
sample_idx = 0

for i, (nodes, adj_matrix, tdat_text, com_text) in enumerate(zip(dataframe["nodes"], dataframe["adjacency_matrix"], dataframe["function_only_code"],dataframe["comments"])):
    tdat_encoded = tokenizer.encode(tdat_text)
    com_encoded = [1] + tokenizer.encode(com_text) + [2]
    num_nodes = min(len(nodes), int(config['maxastnodes']))
    node_encoded = []
    for node in nodes[:num_nodes]:
        encoded_node = tokenizer.encode(node[0])
        if len(encoded_node) > 1:
          node_encoded.append(encoded_node[0])

    for j in range(1,len(com_encoded)):
        target_output[sample_idx, com_encoded[j]] = 1
        tdat_input[sample_idx, :len(tdat_encoded)] = tdat_encoded[:config["tdatlen"]]
        com_input[sample_idx, :j-1] = com_encoded[:j-1]
        node_input[sample_idx, :len(node_encoded)] = node_encoded[:num_nodes]
        edge_input[sample_idx, :num_nodes, :num_nodes] = adj_matrix[:num_nodes, :num_nodes]
        sample_idx +=1

print(tdat_input.shape)
print(com_input.shape)
print(node_input.shape)
print(edge_input.shape)
print(target_output.shape)
print(tdat_input)
print(com_input)
print(node_input)
print(edge_input)
print(target_output)


optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

model.fit(
    [tdat_input, com_input, node_input, edge_input],
    target_output,
    epochs=5
)


(20000, 200)
(20000, 500)
(20000, 70)
(20000, 70, 70)
(20000, 1000)
[[162. 163. 164. ...   0.   0.   0.]
 [162. 163. 164. ...   0.   0.   0.]
 [162. 163. 164. ...   0.   0.   0.]
 ...
 [  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0.   0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 3. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. 

<keras.src.callbacks.History at 0x7d35682e2c50>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [45]:
model.save('code_gnn_model-better.keras')
print("Model saved to 'code_gnn_model.h5'")
model.weights

Model saved to 'code_gnn_model.h5'


[<tf.Variable 'embedding_8/embeddings:0' shape=(1000, 100) dtype=float32, numpy=
 array([[ 0.09197529, -0.14640653,  0.1047008 , ...,  0.06472567,
         -0.07449172,  0.04910017],
        [ 0.01602964, -0.04932421, -0.01084912, ...,  0.04255322,
          0.02699617,  0.02350846],
        [ 0.02330137, -0.01878303,  0.00139688, ...,  0.03589047,
          0.02151451,  0.03632164],
        ...,
        [ 0.04904309,  0.01221039, -0.00280874, ..., -0.02904597,
         -0.04618828,  0.02912838],
        [-0.01238581, -0.0098834 , -0.0329351 , ...,  0.04599004,
          0.01648423, -0.01166137],
        [ 0.00789423, -0.03085524,  0.04567391, ..., -0.04186819,
         -0.02856513, -0.02413399]], dtype=float32)>,
 <tf.Variable 'gcn_layer_12/kernel:0' shape=(100, 100) dtype=float32, numpy=
 array([[ 0.08438554,  0.03648412, -0.02219721, ...,  0.1521554 ,
         -0.14902729,  0.12448713],
        [-0.12413625,  0.10516208,  0.11596006, ..., -0.00973628,
          0.04415794, -0.032875

In [49]:
def predict_commentary_greedy(model, tokenizer, tdat_encoded, node_encoded, edge_matrix, config, max_length=20):
    """
    Predict commentary for given input in a greedy manner.

    Args:
        model: The trained CodeGNNGRU model.
        tokenizer: Tokenizer to encode/decode text.
        tdat_encoded: Encoded representation of the function code.
        node_encoded: Encoded representation of the AST nodes.
        edge_matrix: Adjacency matrix of the AST.
        config: Configuration dictionary containing lengths and vocab sizes.
        max_length: Maximum length of the generated commentary.

    Returns:
        Generated commentary string.
    """

    tdat_input = np.zeros((1, config['tdatlen']), dtype=np.float32)
    node_input = np.zeros((1, config['maxastnodes']), dtype=np.float32)
    edge_input = np.zeros((1, config['maxastnodes'], config['maxastnodes']), dtype=np.float32)
    com_input = np.zeros((1, config['comlen']), dtype=np.float32)

    tdat_input[0, :len(tdat_encoded)] = tdat_encoded[:config['tdatlen']]
    node_input[0, :len(node_encoded)] = node_encoded[:config['maxastnodes']]
    edge_input[0, :len(edge_matrix), :len(edge_matrix)] = edge_matrix[:config['maxastnodes'], :config['maxastnodes']]

    generated_commentary = [tokenizer.encode('<start>')[0]]
    print(tdat_input)
    print(node_input)
    print(edge_input)
    for _ in range(max_length):
        com_input[0, :len(generated_commentary)] = generated_commentary
        predictions = model.predict([tdat_input, com_input, node_input, edge_input], verbose=0)
        next_token = np.argmax(predictions)
        generated_commentary.append(next_token)
        decoded_commentary = tokenizer.decode(generated_commentary)
        if next_token == tokenizer.encode('<end>')[0]:
            break

    decoded_commentary = tokenizer.decode(generated_commentary)
    return decoded_commentary


In [54]:

code='''
factorial :: Integer -> Integer
factorial 0 = 1
factorial n = n * factorial (n - 1)

'''
tdat_encoded = tokenizer.encode(code)
nodes, edges = extract_tree_features(code)
num_nodes = min(len(nodes), int(config['maxastnodes']))
node_encoded = []
for node in nodes[:num_nodes]:
    encoded_node = tokenizer.encode(node[0])
    if len(encoded_node) > 1:
      node_encoded.append(encoded_node[0])
predict_commentary_greedy(model, tokenizer, tdat_encoded,node_encoded, edges, config)

[[228. 215.  94. 219.  94. 228. 182. 161. 133. 228. 229. 161. 229. 227.
  228. 230. 231. 232.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   

'<start> of of of of of of of of of of of of of of of of of of of of'

In [21]:

code='''
conferenceSolution
    :: ConferenceSolution
conferenceSolution =
  ConferenceSolution'
    {_csIconURI = Nothing, _csKey = Nothing, _csName = Nothing}","conferenceSolution
    :: ConferenceSolution
conferenceSolution =
  ConferenceSolution'
    {_csIconURI = Nothing, _csKey = Nothing, _csName = Nothing}","conferenceSolution =
  ConferenceSolution'
    {_csIconURI = Nothing, _csKey = Nothing, _csName = Nothing}","-- | Creates a value of 'ConferenceSolution' with the minimum fields required to make a request.

'''
tdat_encoded = tokenizer.encode(code)
nodes, edges = extract_tree_features(code)
num_nodes = min(len(nodes), int(config['maxastnodes']))
node_encoded = []
for node in nodes[:num_nodes]:
    encoded_node = tokenizer.encode(node[0])
    if len(encoded_node) > 1:
      node_encoded.append(encoded_node[0])
predict_commentary_greedy(model, tokenizer, tdat_encoded,node_encoded, edges, config)

'<start> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>'

In [52]:

code='''
y :: Bool
'''
tdat_encoded = tokenizer.encode(code)
nodes, edges = extract_tree_features(code)
num_nodes = min(len(nodes), int(config['maxastnodes']))
node_encoded = []
for node in nodes[:num_nodes]:
    encoded_node = tokenizer.encode(node[0])
    if len(encoded_node) > 1:
      node_encoded.append(encoded_node[0])
predict_commentary_greedy(model, tokenizer, tdat_encoded,node_encoded, edges, config)

[[197. 215. 217.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   

'<start> of of of of of of of of of of of of of of of of of of of of'

In [57]:

code='''
sum :: [Int] -> Int
'''
tdat_encoded = tokenizer.encode(code)
nodes, edges = extract_tree_features(code)
num_nodes = min(len(nodes), int(config['maxastnodes']))
node_encoded = []
for node in nodes[:num_nodes]:
    encoded_node = tokenizer.encode(node[0])
    if len(encoded_node) > 1:
      node_encoded.append(encoded_node[0])
predict_commentary_greedy(model, tokenizer, tdat_encoded,node_encoded, edges, config)

[[ 33. 215.   0. 219. 203.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   

'<start> of of of of of of of of of of of of of of of of of of of of'