In [22]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_data(file_path):
    """Load (C++ line, pseudocode line) pairs from a tab-separated file.

    The file is read with seven positional columns: pseudocode, C++ code and
    five metadata columns. Rows missing either text field are dropped, and
    both text fields are wrapped in ``<sos>`` / ``<eos>`` markers.

    Args:
        file_path: Path to the TSV file.

    Returns:
        pandas.DataFrame with the columns ``cpp_code`` and ``pseudocode``
        (in that order), one row per usable pair.
    """
    columns = ['pseudocode', 'cpp_code', 'workerid', 'probid', 'subid', 'line', 'indent']

    # dtype=str keeps every field as text: pandas no longer infers dtypes
    # chunk by chunk (which produces a mixed-type warning on large files) and
    # the string concatenation below is valid even for numeric-looking lines.
    df = pd.read_csv(file_path, sep='\t', names=columns, dtype=str)

    # Because `names=` is passed, pandas treats the first line as data. If that
    # line is really a header (its metadata cells literally equal the column
    # names) it must not become a training pair, so drop it.
    metadata_cols = columns[2:]
    if not df.empty and df.iloc[0][metadata_cols].tolist() == metadata_cols:
        df = df.iloc[1:]

    # Drop incomplete pairs first, then add the sequence markers.
    pairs = df[['cpp_code', 'pseudocode']].dropna()
    return pairs.assign(
        cpp_code='<sos> ' + pairs['cpp_code'] + ' <eos>',
        pseudocode='<sos> ' + pairs['pseudocode'] + ' <eos>',
    )

# Training corpus: one row per (C++ line, pseudocode line) pair.
train_data = load_data('spoc-train-train.tsv')

# Vocabulary cap for both tokenizers and the fixed length every sequence is
# padded to.
num_words = 20000
max_len = 150


def _fit_tokenizer_and_encode(texts, lower):
    """Fit a Tokenizer on `texts` and return it with the padded id matrix.

    filters='' disables the tokenizer's character filtering; `lower` controls
    whether text is lower-cased before tokenizing.
    """
    tokenizer = Tokenizer(num_words=num_words, filters='', lower=lower)
    tokenizer.fit_on_texts(texts)
    encoded = tokenizer.texts_to_sequences(texts)
    return tokenizer, pad_sequences(encoded, maxlen=max_len, padding='post')


# Source side: C++ is case-sensitive, so the original casing is preserved.
cpp_tokenizer, X_train = _fit_tokenizer_and_encode(train_data['cpp_code'], lower=False)

# Target side: pseudocode is lower-cased.
pseudocode_tokenizer, y_train = _fit_tokenizer_and_encode(train_data['pseudocode'], lower=True)

  df = pd.read_csv(file_path, sep='\t', names=['pseudocode', 'cpp_code', 'workerid', 'probid', 'subid', 'line', 'indent'])


In [23]:

# Persist both fitted tokenizers so the inference cell can reuse the exact
# vocabularies the model was trained with.
for pkl_path, fitted_tokenizer in (
    ('cpp_tokenizer.pkl', cpp_tokenizer),
    ('pseudocode_tokenizer.pkl', pseudocode_tokenizer),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_tokenizer, f)


In [24]:

class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention (no masking).

    Queries, keys and values are each projected to ``d_model`` features by a
    Dense layer, split into ``num_heads`` heads of ``d_model // num_heads``
    features, attended per head, merged back to ``d_model`` features and
    passed through a final Dense layer.

    Args:
        d_model: Width of the projections and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``), forwarded to the base class.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, **kwargs):
        super().__init__(**kwargs)
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) to (batch, num_heads, seq, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v):
        """Attend from `q` over `k`/`v`; returns a (batch, seq_q, d_model) tensor."""
        batch_size = tf.shape(q)[0]
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        scores = tf.matmul(q, k, transpose_b=True)
        # Scale in the scores' own dtype so the division also works when the
        # layer computes in something other than float32.
        scale = tf.math.sqrt(tf.cast(self.depth, scores.dtype))
        weights = tf.nn.softmax(scores / scale)

        attn_output = tf.matmul(weights, v)
        # Back to (batch, seq, num_heads, depth) before merging the heads.
        attn_output = tf.transpose(attn_output, perm=[0, 2, 1, 3])
        return self.dense(tf.reshape(attn_output, (batch_size, -1, self.d_model)))

    def get_config(self):
        """Return the constructor arguments together with the base Layer config."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads})
        return config

class TransformerEncoder(tf.keras.layers.Layer):
    """Single encoder block: self-attention and a feed-forward network.

    Each sub-layer is followed by a residual connection and layer
    normalization.

    Args:
        d_model: Feature width of the block's input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``), forwarded to the base class.
    """

    def __init__(self, d_model, num_heads, dff, **kwargs):
        # Accepting **kwargs lets from_config(get_config()) restore the base
        # Layer settings that get_config now includes.
        super().__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x):
        """Apply self-attention then the feed-forward network to `x`."""
        attn_output = self.attention(x, x, x)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments together with the base Layer config."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads, "dff": self.dff})
        return config

class TransformerDecoder(tf.keras.layers.Layer):
    """Single decoder block: self-attention, encoder attention, feed-forward.

    Each of the three sub-layers is followed by a residual connection and
    layer normalization.

    NOTE(review): the self-attention here is called without any mask, so every
    position can attend to every other position (no look-ahead mask).

    Args:
        d_model: Feature width of the block's input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``), forwarded to the base class.
    """

    def __init__(self, d_model, num_heads, dff, **kwargs):
        # Accepting **kwargs lets from_config(get_config()) restore the base
        # Layer settings that get_config now includes.
        super().__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.attention1 = MultiHeadAttention(d_model, num_heads)
        self.attention2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, enc_output):
        """Run the block on `x`, attending over `enc_output` in the second sub-layer."""
        # 1) self-attention over the decoder input
        attn1 = self.attention1(x, x, x)
        out1 = self.layernorm1(x + attn1)
        # 2) attention with queries from the decoder, keys/values from the encoder
        attn2 = self.attention2(out1, enc_output, enc_output)
        out2 = self.layernorm2(out1 + attn2)
        # 3) position-wise feed-forward network
        ffn_output = self.ffn(out2)
        return self.layernorm3(out2 + ffn_output)

    def get_config(self):
        """Return the constructor arguments together with the base Layer config."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads, "dff": self.dff})
        return config

@tf.keras.utils.register_keras_serializable()
class Transformer(tf.keras.Model):
    """Token-id sequence to per-position vocabulary distribution.

    The input ids are embedded once; the embedding is passed through one
    encoder block, then through one decoder block that attends over the
    encoder output, and finally projected to a softmax over ``vocab_size``.

    NOTE(review): the decoder is fed the same embedded input as the encoder
    (there is no separate, shifted target sequence), and no positional
    encoding or attention mask is applied anywhere in the model.

    Args:
        vocab_size: Size of the embedding table and of the output softmax.
        d_model: Embedding / hidden width.
        num_heads: Number of attention heads in each block.
        dff: Hidden width of the feed-forward networks.
        max_len: Sequence length the model is used with. It is stored so it
            round-trips through get_config/from_config; the layers themselves
            do not depend on it.
    """

    def __init__(self, vocab_size, d_model, num_heads, dff, max_len):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.max_len = max_len

        # `input_length` is intentionally not passed: it is deprecated in
        # Keras 3 (it only triggers a warning) and is not needed to embed
        # sequences.
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.encoder = TransformerEncoder(d_model, num_heads, dff)
        self.decoder = TransformerDecoder(d_model, num_heads, dff)
        self.final_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        """Map integer ids (batch, seq) to probabilities (batch, seq, vocab_size)."""
        # Embed once and reuse: encoder and decoder consume the same tensor.
        embedded = self.embedding(inputs)
        enc_output = self.encoder(embedded)
        dec_output = self.decoder(embedded, enc_output)
        return self.final_layer(dec_output)

    def get_config(self):
        """Return the constructor arguments needed to rebuild the model."""
        return {
            "vocab_size": self.vocab_size,
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "dff": self.dff,
            "max_len": self.max_len,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild the model from the dict produced by get_config."""
        return cls(**config)



In [27]:
# Build the model; the vocabulary size matches the tokenizers' `num_words` cap.
transformer = Transformer(
    vocab_size=num_words,
    d_model=128,
    num_heads=4,
    dff=512,
    max_len=max_len,
)
transformer.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Hold out the last 10% of the pairs for validation.
transformer.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
)

# Save in the native Keras format so the inference cell can reload it.
transformer.save(filepath='transformer_model.keras')

Epoch 1/10
[1m2558/2558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 99ms/step - accuracy: 0.9556 - loss: 0.6735 - val_accuracy: 0.9662 - val_loss: 0.1964
Epoch 2/10
[1m2558/2558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 93ms/step - accuracy: 0.9685 - loss: 0.1766 - val_accuracy: 0.9666 - val_loss: 0.1877
Epoch 3/10
[1m2558/2558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 93ms/step - accuracy: 0.9697 - loss: 0.1583 - val_accuracy: 0.9666 - val_loss: 0.1866
Epoch 4/10
[1m2558/2558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 93ms/step - accuracy: 0.9702 - loss: 0.1493 - val_accuracy: 0.9667 - val_loss: 0.1871
Epoch 5/10
[1m2558/2558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 93ms/step - accuracy: 0.9708 - loss: 0.1435 - val_accuracy: 0.9666 - val_loss: 0.1879
Epoch 6/10
[1m2558/2558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 93ms/step - accuracy: 0.9712 - loss: 0.1397 - val_accuracy: 0.9665 - val_loss: 0.189

In [30]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _unpickle(path):
    """Read one pickled object from `path`.

    pickle can execute arbitrary code while loading, so only use this on
    files produced by this notebook.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the tokenizers written by the training cells.
cpp_tokenizer = _unpickle('cpp_tokenizer.pkl')
pseudocode_tokenizer = _unpickle('pseudocode_tokenizer.pkl')

# compile=False: the model is only used for prediction below, so the training
# configuration (optimizer, loss, metrics) is not restored.
transformer = tf.keras.models.load_model(filepath='transformer_model.keras', compile=False)

def generate_pseudocode(cpp_code, max_len=150):
    """Translate one line of C++ into pseudocode with the loaded model.

    The line is wrapped in ``<sos>`` / ``<eos>``, tokenized with
    ``cpp_tokenizer``, padded to ``max_len`` and passed to ``transformer`` in a
    single forward pass. The arg-max token at each position is decoded with
    ``pseudocode_tokenizer``.

    Args:
        cpp_code: The C++ source line, as plain text.
        max_len: Length the encoded input is padded to.

    Returns:
        The decoded pseudocode as a single space-separated string. Padding,
        ``<sos>`` and unknown ids are skipped and decoding stops at the first
        ``<eos>``.
    """
    input_seq = cpp_tokenizer.texts_to_sequences(["<sos> " + cpp_code + " <eos>"])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')

    # verbose=0: no progress bar for a single-sample prediction.
    pred_seq = transformer.predict(input_seq, verbose=0)
    pred_indices = np.argmax(pred_seq, axis=-1)[0]

    pseudo_tokens = []
    for idx in pred_indices:
        if idx <= 0:
            # id 0 is the padding value used by pad_sequences
            continue
        token = pseudocode_tokenizer.index_word.get(int(idx), '')
        if token == '<eos>':
            # Anything predicted after the end marker is not part of the output.
            break
        if token and token != '<sos>':
            pseudo_tokens.append(token)

    # Markers are filtered per token rather than with str.replace on the joined
    # text, so no double spaces remain and ordinary tokens that merely contain
    # the marker text are left intact.
    return ' '.join(pseudo_tokens)

# Smoke test: translate a single C++ declaration with the reloaded model.
cpp_example = "int x"
pseudo_output = generate_pseudocode(cpp_code=cpp_example)
print("Generated Pseudocode:\n", pseudo_output)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Generated Pseudocode:
 x integer x
