In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, MultiHeadAttention, Dense, Dropout, LayerNormalization, Add
from sklearn.model_selection import train_test_split
import pickle

# --- Load the parallel pseudocode / C++ corpus -------------------------------
data_path = 'cleaned_pseudocode_cpp.csv'
# Rows missing either side cannot be tokenized (NaN is a float, not a string),
# so they are dropped up front.
df = pd.read_csv(data_path).dropna(subset=['text', 'code']).reset_index(drop=True)

# Sequence markers the decoder is trained with. Inference seeds generation
# with '<start>' and stops on '<end>', so both must exist in the C++ vocabulary.
START_TOKEN = '<start>'
END_TOKEN = '<end>'


def _wrap_with_markers(code):
    """Return ``code`` with START_TOKEN/END_TOKEN around it (added only if absent)."""
    tokens = str(code).split()
    if not tokens or tokens[0] != START_TOKEN:
        tokens.insert(0, START_TOKEN)
    if tokens[-1] != END_TOKEN:
        tokens.append(END_TOKEN)
    return ' '.join(tokens)


pseudo_texts = df['text'].astype(str).tolist()
cpp_texts = df['code'].map(_wrap_with_markers).tolist()

# --- Fit the vocabularies ----------------------------------------------------
tokenizer_pseudo = Tokenizer()
# The default Tokenizer filters delete punctuation such as ; { } ( ) < > = and
# lowercase everything. That would erase most C++ syntax, mangle the
# '<start>'/'<end>' markers, and fold case-sensitive identifiers together.
# Only tabs/newlines are normalised here; tokens are split on whitespace.
# NOTE(review): assumes the 'code' column is already whitespace-tokenized —
# confirm against the CSV.
tokenizer_cpp = Tokenizer(filters='\t\n', lower=False)
tokenizer_pseudo.fit_on_texts(pseudo_texts)
tokenizer_cpp.fit_on_texts(cpp_texts)

pseudo_sequences = tokenizer_pseudo.texts_to_sequences(pseudo_texts)
cpp_sequences = tokenizer_cpp.texts_to_sequences(cpp_texts)

# One shared length for both sides: the longest sequence seen in either corpus.
max_seq_len = max(max(len(seq) for seq in pseudo_sequences), max(len(seq) for seq in cpp_sequences))

pseudo_padded = pad_sequences(pseudo_sequences, maxlen=max_seq_len, padding='post')
# One extra column so that slicing [:, :-1] (decoder input) and [:, 1:]
# (shifted target) each yield exactly max_seq_len positions.
cpp_padded = pad_sequences(cpp_sequences, maxlen=max_seq_len + 1, padding='post')

# Fixed seed so the train/validation split is reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    pseudo_padded, cpp_padded, test_size=0.2, random_state=42
)

In [None]:
def positional_encoding(seq_len, embed_dim):
    """Build a sinusoidal position table.

    Args:
        seq_len: number of positions (rows) in the table.
        embed_dim: number of channels (columns) in the table.

    Returns:
        A float32 tensor of shape (1, seq_len, embed_dim). Even-indexed
        channels hold the sine of the position angle, odd-indexed channels
        hold the cosine.
    """
    positions = np.expand_dims(np.arange(seq_len), axis=1)    # (seq_len, 1)
    channels = np.expand_dims(np.arange(embed_dim), axis=0)   # (1, embed_dim)

    # Channels 2k and 2k+1 share the same frequency exponent 2k / embed_dim.
    exponents = (2 * (channels // 2)) / np.float32(embed_dim)
    angles = positions * (1 / np.power(10000, exponents))

    # Pick sin for even channels and cos for odd channels.
    table = np.where(channels % 2 == 0, np.sin(angles), np.cos(angles))

    # Leading axis of size 1 so the table broadcasts over the batch dimension.
    return tf.convert_to_tensor(table, dtype=tf.float32)[tf.newaxis, ...]

def transformer_block(inputs, embed_dim, num_heads, ff_dim, dropout=0.1):
    """Apply one self-attention + feed-forward block to ``inputs``.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation. The self-attention here is unmasked.

    Args:
        inputs: symbolic tensor the block is applied to.
        embed_dim: per-head key size and width of the block's output.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward sub-layer.
        dropout: dropout rate used after both sub-layers.

    Returns:
        The block's output tensor.
    """
    # Sub-layer 1: self-attention (query and value are both ``inputs``).
    self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    attended = Dropout(dropout)(self_attention(inputs, inputs))
    attended_sum = Add()([inputs, attended])
    attended_norm = LayerNormalization(epsilon=1e-6)(attended_sum)

    # Sub-layer 2: position-wise feed-forward network.
    hidden = Dense(ff_dim, activation="relu")(attended_norm)
    projected = Dense(embed_dim)(hidden)
    projected = Dropout(dropout)(projected)
    ffn_sum = Add()([attended_norm, projected])
    return LayerNormalization(epsilon=1e-6)(ffn_sum)

from tensorflow.keras.layers import Lambda

def encoder(vocab_size, max_seq_len, embed_dim, num_heads, ff_dim, num_layers):
    """Build the encoder as a Keras ``Model``.

    Token ids are embedded, a fixed positional table is added, and the result
    passes through ``num_layers`` calls to ``transformer_block``.

    Args:
        vocab_size: size of the embedding table.
        max_seq_len: fixed length of the id sequences the model accepts.
        embed_dim: embedding width.
        num_heads: attention heads per block.
        ff_dim: feed-forward hidden width per block.
        num_layers: number of stacked blocks.

    Returns:
        A ``Model`` mapping the id input to the output of the last block.
    """
    token_ids = Input(shape=(max_seq_len,))
    embedded = Embedding(vocab_size, embed_dim)(token_ids)

    # NOTE(review): the lambda closes over ``pos_table``; confirm that a model
    # containing this Lambda layer can be reloaded after ``model.save``.
    pos_table = positional_encoding(max_seq_len, embed_dim)
    hidden = Lambda(lambda emb: emb + pos_table)(embedded)

    for _layer_index in range(num_layers):
        hidden = transformer_block(hidden, embed_dim, num_heads, ff_dim)

    return Model(inputs=token_ids, outputs=hidden)

def _causal_transformer_block(inputs, embed_dim, num_heads, ff_dim, dropout=0.1):
    """Self-attention + feed-forward block whose attention is causally masked."""
    # use_causal_mask=True stops position t from attending to positions > t.
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(
        inputs, inputs, use_causal_mask=True
    )
    attended = Dropout(dropout)(attended)
    attended = Add()([inputs, attended])
    attended = LayerNormalization(epsilon=1e-6)(attended)

    projected = Dense(ff_dim, activation="relu")(attended)
    projected = Dense(embed_dim)(projected)
    projected = Dropout(dropout)(projected)
    projected = Add()([attended, projected])
    return LayerNormalization(epsilon=1e-6)(projected)


def decoder(vocab_size, max_seq_len, embed_dim, num_heads, ff_dim, num_layers):
    """Build the decoder as a Keras ``Model`` with causally masked self-attention.

    The model is trained with teacher forcing: it receives the target sequence
    without its last token and must predict the same sequence shifted left by
    one. With unmasked self-attention every position could read the very token
    it is asked to predict, so training accuracy would be meaningless and
    step-by-step generation (where future tokens do not exist yet) would not
    match training. The causal mask removes that leak.

    Args:
        vocab_size: size of the embedding table.
        max_seq_len: fixed length of the id sequences the model accepts.
        embed_dim: embedding width.
        num_heads: attention heads per block.
        ff_dim: feed-forward hidden width per block.
        num_layers: number of stacked blocks.

    Returns:
        A single-input ``Model`` mapping target-side ids to the output of the
        last block.
    """
    inputs = Input(shape=(max_seq_len,))
    embedding_layer = Embedding(vocab_size, embed_dim)(inputs)

    # NOTE(review): the lambda closes over ``position_encoding``; confirm that
    # a model containing this Lambda layer can be reloaded after ``model.save``.
    position_encoding = positional_encoding(max_seq_len, embed_dim)
    x = Lambda(lambda emb: emb + position_encoding)(embedding_layer)

    for _ in range(num_layers):
        x = _causal_transformer_block(x, embed_dim, num_heads, ff_dim)

    return Model(inputs, x)

In [None]:


# --- Hyperparameters ---------------------------------------------------------
embed_dim = 256
num_heads = 8
ff_dim = 512
num_layers = 4
dropout_rate = 0.1
# +1 leaves room for id 0, which the padded sequences use as filler.
input_vocab_size = len(tokenizer_pseudo.word_index) + 1
output_vocab_size = len(tokenizer_cpp.word_index) + 1

# --- Sub-models --------------------------------------------------------------
encoder_model = encoder(input_vocab_size, max_seq_len, embed_dim, num_heads, ff_dim, num_layers)
decoder_model = decoder(output_vocab_size, max_seq_len, embed_dim, num_heads, ff_dim, num_layers)

encoder_inputs = Input(shape=(max_seq_len,))
decoder_inputs = Input(shape=(max_seq_len,))

encoder_outputs = encoder_model(encoder_inputs)
decoder_outputs = decoder_model(decoder_inputs)

# --- Fuse encoder and decoder ------------------------------------------------
# Without this stage ``encoder_outputs`` is never used, so the predictions do
# not depend on the pseudocode at all. Cross-attention lets every target
# position query the encoded pseudocode (query = decoder, key/value = encoder),
# followed by the usual residual + norm + feed-forward sub-layer.
cross_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(
    decoder_outputs, encoder_outputs
)
cross_attention = Dropout(dropout_rate)(cross_attention)
fused = LayerNormalization(epsilon=1e-6)(Add()([decoder_outputs, cross_attention]))

fused_ffn = Dense(ff_dim, activation="relu")(fused)
fused_ffn = Dense(embed_dim)(fused_ffn)
fused_ffn = Dropout(dropout_rate)(fused_ffn)
fused = LayerNormalization(epsilon=1e-6)(Add()([fused, fused_ffn]))

# Per-position distribution over the C++ vocabulary.
final_outputs = Dense(output_vocab_size, activation="softmax")(fused)

model = Model([encoder_inputs, decoder_inputs], final_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# --- Training (teacher forcing) ----------------------------------------------
# Decoder input is the target without its last column; the label is the target
# shifted left by one, so position t learns to predict token t + 1.
# NOTE(review): padded positions still count towards loss and accuracy.
epochs = 20
batch_size = 32
model.fit(
    [X_train, y_train[:, :-1]],
    y_train[:, 1:],
    epochs=epochs,
    batch_size=batch_size,
    validation_data=([X_test, y_test[:, :-1]], y_test[:, 1:]),
)



In [71]:
import pickle

# Persist both fitted tokenizers next to the notebook, then the trained model.
for pickle_path, fitted_tokenizer in (
    ('tokenizer_pseudo.pkl', tokenizer_pseudo),
    ('tokenizer_cpp.pkl', tokenizer_cpp),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_tokenizer, handle)

model.save('transformer_model.keras')


In [2]:
import numpy as np
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(security): unpickling runs arbitrary code embedded in the file; only
# load tokenizer files that this notebook produced itself.
tokenizer_pseudo = _read_pickle('tokenizer_pseudo.pkl')
tokenizer_cpp = _read_pickle('tokenizer_cpp.pkl')

# NOTE(review): the saved graph contains Lambda layers. Some Keras releases
# refuse to deserialize those by default — confirm this call succeeds on the
# installed version.
model = load_model('transformer_model.keras')

def generate_code(pseudo_text, max_seq_len=100):
    """Greedily decode C++ tokens for one pseudocode string.

    Uses the notebook-level ``model``, ``tokenizer_pseudo`` and
    ``tokenizer_cpp``.

    Args:
        pseudo_text: the pseudocode to translate.
        max_seq_len: upper bound on the number of tokens to generate. It is
            capped at the decoder's own input length; the padding widths
            always come from the model, never from this argument.

    Returns:
        The generated tokens joined by single spaces, without the
        ``<start>`` marker.

    Raises:
        KeyError: if the C++ tokenizer has no ``<start>`` token, i.e. the
            vocabulary was fitted without sequence markers.
    """
    # The model was built with fixed-length inputs, so feeding arrays padded
    # to any other width fails inside predict(). Read the widths from the
    # model; fall back to the argument only if a dimension is unspecified.
    encoder_dim = model.inputs[0].shape[1]
    decoder_dim = model.inputs[1].shape[1]
    encoder_len = max_seq_len if encoder_dim is None else int(encoder_dim)
    decoder_len = max_seq_len if decoder_dim is None else int(decoder_dim)

    cpp_vocab = tokenizer_cpp.word_index
    if '<start>' not in cpp_vocab:
        raise KeyError(
            "'<start>' is not in the C++ tokenizer vocabulary; fit the tokenizer "
            "on code wrapped in '<start>' ... '<end>' with filters that keep "
            "'<' and '>'"
        )
    start_id = cpp_vocab['<start>']
    # 0 is the padding id; it doubles as the stop id when '<end>' is unknown.
    end_id = cpp_vocab.get('<end>', 0)

    pseudo_seq = tokenizer_pseudo.texts_to_sequences([pseudo_text])
    pseudo_padded = pad_sequences(pseudo_seq, maxlen=encoder_len, padding='post')

    generated_tokens = [start_id]

    for _ in range(min(max_seq_len, decoder_len)):
        decoder_input = pad_sequences([generated_tokens], maxlen=decoder_len, padding='post')
        pred_probs = model.predict([pseudo_padded, decoder_input], verbose=0)
        # The output at the last filled position is the next-token distribution.
        next_token = int(np.argmax(pred_probs[0, len(generated_tokens) - 1]))

        # Stop on the end marker, or on padding, which can only follow the
        # end of a post-padded training sequence.
        if next_token == end_id or next_token == 0:
            break

        generated_tokens.append(next_token)

    # Skip the leading '<start>' marker: it is a control token, not code.
    cpp_code = ' '.join(
        tokenizer_cpp.index_word.get(token, '') for token in generated_tokens[1:] if token > 0
    )
    return cpp_code



In [None]:

# Interactive demo: read one pseudocode snippet from the user and show the
# model's translation.
ui = input("Enter pseudocode: ")
generated_cpp = generate_code(pseudo_text=ui)
print("Generated C++ Code:", generated_cpp, sep=" ")