In [1]:
!pip install polars




In [1]:
import polars as pl

# Parquet locations on the Hugging Face hub, keyed by split name
splits = dict(
    train='hf://datasets/blastwind/deprecated-github-code-haskell-function/data/train-*-of-*.parquet',
    test='hf://datasets/blastwind/deprecated-github-code-haskell-function/data/test-*-of-*.parquet',
    valid='hf://datasets/blastwind/deprecated-github-code-haskell-function/data/valid-00000-of-00001-636cb804972d8982.parquet',
)

# Only the training split is read into a Polars frame here
df = pl.read_parquet(splits['train'])



In [2]:
# Count the rows where BOTH "is_commented" and "is_signatured" are True
true_count = df.filter(pl.col("is_commented") == True).filter(pl.col("is_signatured") == True).height

# The label names both flags, since both filters contribute to the count
print(f"Number of rows with 'is_commented' and 'is_signatured' set to True: {true_count}")

dataframe = df.to_pandas()

# Upper bound (exclusive) on n_ast_nodes for the functions that are kept
AST_NODE_LIMIT = 50

# Keep commented functions with small ASTs, and only the three code columns used below
dataframe = dataframe.loc[
    (dataframe["is_commented"] == True) & (dataframe["n_ast_nodes"] < AST_NODE_LIMIT),
    ["full_code", "uncommented_code", "function_only_code"]
]

# The comment text is whatever remains of full_code once the uncommented code is removed.
# NOTE(review): str.replace only removes uncommented_code when it occurs verbatim inside
# full_code; otherwise the whole of full_code ends up in "comments" — verify on the data.
# Iterating the two columns directly avoids building a Series for every row (apply(axis=1)).
dataframe["comments"] = [
    full_code.replace(uncommented_code, "").strip()
    for full_code, uncommented_code in zip(dataframe["full_code"], dataframe["uncommented_code"])
]



Number of rows with 'is_commented' set to True: 473397


In [3]:
# DataFrame.size is rows × columns (the total number of cells), not the number of rows
dataframe.size

954616

In [4]:
!pip install tree_sitter
!pip install tree_sitter_haskell



In [5]:
import tree_sitter_haskell as tshaskell
import numpy as np
from tree_sitter import Language, Parser


# Wrap the grammar object exposed by tree_sitter_haskell in a tree-sitter Language
HS_LANGUAGE = Language(tshaskell.language())
# Module-level parser bound to that grammar; extract_tree_features reads it as a global
parser = Parser(HS_LANGUAGE)


In [6]:
def extract_tree_features(code):
    """Parse ``code`` with the module-level ``parser`` and flatten the syntax tree.

    Nodes are visited in pre-order (a parent before its children, children in
    their original order). A node's position in the returned list is also its
    row/column index in the adjacency matrix.

    Args:
        code: Source text to parse (``str``).

    Returns:
        ``(nodes, adj_matrix)`` where ``nodes`` is a list of
        ``[type, text, start_row, start_col, end_row, end_col]`` entries and
        ``adj_matrix`` is a square ``float32`` array with
        ``adj_matrix[parent, child] == 1`` for every parent -> child edge.
        ``(None, None)`` when anything fails; the error is printed, not raised.
    """
    try:
        source_bytes = code.encode()
        tree = parser.parse(source_bytes)

        nodes = []
        edges = []

        # Explicit stack instead of recursion: no reliance on mutable default
        # arguments and no recursion-depth limit on deeply nested trees.
        pending = [(tree.root_node, None)]
        while pending:
            node, parent_index = pending.pop()
            start_pos = node.start_point
            end_pos = node.end_point

            # start_byte/end_byte are offsets into the encoded source, so the
            # slice must be taken on the bytes; slicing the str would shift the
            # text as soon as the source contains a multi-byte character.
            code_value = source_bytes[node.start_byte:node.end_byte].decode(errors="replace")

            node_index = len(nodes)
            nodes.append([node.type, code_value, start_pos[0], start_pos[1], end_pos[0], end_pos[1]])

            if parent_index is not None:
                edges.append((parent_index, node_index))

            # Push children reversed so the first child is popped (visited) first,
            # which keeps the same pre-order numbering as a recursive walk.
            pending.extend((child, node_index) for child in reversed(node.children))

        num_nodes = len(nodes)
        adj_matrix = np.zeros((num_nodes, num_nodes), dtype=np.float32)
        for parent, child in edges:
            adj_matrix[parent, child] = 1

        return nodes, adj_matrix
    except Exception as e:
        # Best-effort by design: a bad sample is reported and yields (None, None)
        print(f"Error parsing code: {e}")
        return None, None
# One (nodes, adjacency_matrix) pair per function, in row order
tree_features = dataframe["function_only_code"].apply(extract_tree_features)
# Transpose the sequence of pairs into two parallel tuples and store each as a column
node_lists, adjacency_matrices = zip(*tree_features)
dataframe["nodes"] = node_lists
dataframe["adjacency_matrix"] = adjacency_matrices




In [7]:
# Series.size is the element count of the "nodes" column, i.e. one entry per row
print(dataframe["nodes"].size)

238654


In [38]:
!pip install tensorflow
!pip install keras



In [8]:
import tensorflow as tf
from keras.layers import Layer
from keras import activations
import keras.backend as K

class GCNLayer(Layer):
    """Graph-convolution layer computing ``activation((A + I) @ X @ W [+ b])``.

    The layer is called on a two-element ``[nodes, edges]`` pair. ``edges`` gets
    an identity matrix added (self-loops) before it is multiplied with ``nodes``
    and then with the trainable kernel.

    Args:
        units: Size of the last output dimension.
        activation: Activation name or callable, resolved via ``activations.get``.
        initializer: Initializer for the kernel weight.
        sparse: Stored on the instance and in the config; not used by this class.
        use_bias: Whether a zero-initialised bias vector is added.
        **kwargs: Forwarded to ``Layer.__init__``.
    """

    def __init__(self, units, activation='relu', initializer='glorot_uniform', sparse=False, use_bias=True, **kwargs):
        self.activation = activations.get(activation)
        self.output_dim = units
        self.initializer = initializer
        self.sparse = sparse
        self.use_bias = use_bias

        super(GCNLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the kernel (and optional bias); ``input_shape[0]`` is the nodes shape."""
        self.kernel = self.add_weight(name='kernel',
                                          shape=(input_shape[0][-1], self.output_dim),
                                          initializer=self.initializer,
                                          trainable=True)
        if self.use_bias:
            self.bias = self.add_weight(name='bias',
                                              shape=(self.output_dim,),
                                              initializer='zeros',
                                              trainable=True)
        else:
            self.bias = None

        super(GCNLayer, self).build(input_shape)

    def call(self, x):
        """Apply one propagation step to ``x = [nodes, edges]``.

        ``edges.shape[1]`` must be statically known, since it sizes the identity.
        """
        # Accept a tuple as well as a list: both unpack the same way.
        assert isinstance(x, (list, tuple))
        nodes, edges = x

        # Self-loops. tf.eye is used so the layer does not depend on the legacy
        # keras.backend.eye helper; the dtype follows the adjacency input.
        identity = tf.eye(int(edges.shape[1]), dtype=edges.dtype)
        output = tf.matmul(edges + identity, nodes)
        output = tf.matmul(output, self.kernel)

        if self.use_bias:
            output += self.bias

        return self.activation(output)

    def compute_output_shape(self, input_shape):
        """Return ``(None, nodes_shape[1], units)``."""
        assert isinstance(input_shape, (list, tuple))
        return (None, input_shape[0][1], self.output_dim)

    def get_config(self):
        """Return a config carrying every constructor argument so the layer round-trips."""
        # Local import: `initializers` is not among the names imported at file level.
        from keras import initializers

        config = {
            'units': self.output_dim,
            'activation': activations.serialize(self.activation),
            # Normalise through get() so a string name or an Initializer object both serialize
            'initializer': initializers.serialize(initializers.get(self.initializer)),
            'sparse': self.sparse,
            'use_bias': self.use_bias,
        }

        base_config = super(GCNLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [9]:
import keras
import keras.utils
from keras.layers import Input, Dense, Embedding, Activation, concatenate, Flatten, GRU, TimeDistributed, dot
from keras.models import Model

class CodeGNNGRU:
    """Factory for the GRU encoder/decoder model with GCN layers over the AST inputs.

    All sizes are read from ``config``. The same dict object is tagged with the
    model type and batch-maker name and is handed back by ``create_model``.
    """

    def __init__(self, config):
        # The caller's dict is tagged in place and stored, not copied.
        config['modeltype'] = 'codegnngru'
        self.config = config

        # (attribute name, config key it is read from)
        sized_from_config = (
            ('tdatvocabsize', 'tdatvocabsize'),
            ('comvocabsize', 'comvocabsize'),
            ('smlvocabsize', 'smlvocabsize'),
            ('tdatlen', 'tdatlen'),
            ('comlen', 'comlen'),
            ('smllen', 'maxastnodes'),
        )
        for attr_name, config_key in sized_from_config:
            setattr(self, attr_name, config[config_key])

        self.config['batch_maker'] = 'graph_multi_1'

        # Fixed layer widths
        self.embdims, self.smldims, self.recdims, self.tdddims = 100, 256, 256, 256

    def create_model(self):
        """Build and compile the model.

        Returns:
            ``(config, model)``: the stored config dict and the compiled Keras model
            taking ``[tdat, com, node, edge]`` inputs.
        """
        # Inputs: code tokens, comment tokens, AST node ids, AST adjacency
        token_in = Input(shape=(self.tdatlen,))
        comment_in = Input(shape=(self.comlen,))
        ast_node_in = Input(shape=(self.smllen,))
        ast_edge_in = Input(shape=(self.smllen, self.smllen))

        # A single embedding layer is applied to both the code tokens and the AST nodes
        shared_embedding = Embedding(output_dim=self.embdims, input_dim=self.tdatvocabsize, mask_zero=False)
        token_emb = shared_embedding(token_in)
        ast_node_emb = shared_embedding(ast_node_in)

        # Code-token encoder
        token_encoder = GRU(self.recdims, return_state=True, return_sequences=True)
        token_seq, token_state = token_encoder(token_emb)

        # Comment decoder, started from the encoder's final state
        comment_embedding = Embedding(output_dim=self.embdims, input_dim=self.comvocabsize, mask_zero=False)
        comment_emb = comment_embedding(comment_in)
        decoder = GRU(self.recdims, return_sequences=True)
        decoder_seq = decoder(comment_emb, initial_state=token_state)

        # attend decoder words to the encoded code tokens
        token_scores = dot([decoder_seq, token_seq], axes=[2, 2])
        token_weights = Activation('softmax')(token_scores)
        token_context = dot([token_weights, token_seq], axes=[2, 1])

        # one graph layer per hop: 1->2->3->N
        ast_features = ast_node_emb
        for _ in range(self.config['asthops']):
            ast_features = GCNLayer(100)([ast_features, ast_edge_in])

        ast_seq = GRU(self.recdims, return_sequences=True)(ast_features, initial_state=token_state)

        # attend decoder words to nodes in ast
        ast_scores = dot([decoder_seq, ast_seq], axes=[2, 2])
        ast_weights = Activation('softmax')(ast_scores)
        ast_context = dot([ast_weights, ast_seq], axes=[2, 1])

        merged = concatenate([token_context, decoder_seq, ast_context])

        hidden = TimeDistributed(Dense(self.tdddims, activation="relu"))(merged)
        flat = Flatten()(hidden)
        prediction = Dense(self.comvocabsize, activation="softmax")(flat)

        model = Model(inputs=[token_in, comment_in, ast_node_in, ast_edge_in], outputs=prediction)
        model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])

        return self.config, model


In [None]:
from keras.utils import to_categorical

# Example configuration dictionary
config = {
    'tdatvocabsize': 5000,
    'comvocabsize': 5000,
    'smlvocabsize': 5000,
    'tdatlen': 100,
    'comlen': 50,
    'maxastnodes': 70,
    'asthops': 3,
}

# Create the model
model_instance = CodeGNNGRU(config)
config, model = model_instance.create_model()

num_samples = len(dataframe)
max_nodes = config['maxastnodes']

# Placeholder token inputs (random ids; real tokenisation is not done in this notebook)
tdat_input = np.random.randint(0, config['tdatvocabsize'], (num_samples, config['tdatlen']))
com_input = np.random.randint(0, config['comvocabsize'], (num_samples, config['comlen']))

# The model's node input is a (maxastnodes,) vector of integer ids fed to the
# embedding layer, so nodes are encoded as node-type ids. Both arrays are
# allocated once, outside the loop, and zero-padded up to maxastnodes.
node_input = np.zeros((num_samples, max_nodes), dtype=np.int32)
edge_input = np.zeros((num_samples, max_nodes, max_nodes), dtype=np.float32)

# Node-type vocabulary: 0 is padding, 1 is out-of-vocabulary, real types start at 2.
# Ids must stay below tdatvocabsize because the node input shares the token embedding.
OOV_ID = 1
node_type_ids = {}

for i, (nodes, adj_matrix) in enumerate(zip(dataframe["nodes"], dataframe["adjacency_matrix"])):
    # extract_tree_features yields (None, None) for samples it could not parse;
    # those rows stay all-zero instead of crashing on len(None).
    if nodes is None or adj_matrix is None:
        continue

    num_nodes = min(len(nodes), max_nodes)
    for j in range(num_nodes):
        node_type = nodes[j][0]
        type_id = node_type_ids.get(node_type)
        if type_id is None:
            candidate_id = len(node_type_ids) + 2
            if candidate_id < config['tdatvocabsize']:
                type_id = node_type_ids[node_type] = candidate_id
            else:
                type_id = OOV_ID
        node_input[i, j] = type_id

    edge_input[i, :num_nodes, :num_nodes] = adj_matrix[:num_nodes, :num_nodes]

# Create target output (dummy example). num_classes is pinned to the vocabulary size
# so the one-hot width always matches the model's output layer.
target_output = to_categorical(
    np.random.randint(0, config['comvocabsize'], num_samples),
    num_classes=config['comvocabsize'],
)

# Fit the model
model.fit(
    [tdat_input, com_input, node_input, edge_input],
    target_output,
    epochs=10,
    batch_size=32,
)
