In [1]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.43.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.0-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.43.0


In [2]:
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import time
import streamlit as st
from transformers import BertTokenizer

In [3]:


# --- Data Loading and Preprocessing ---

def load_spoc_data_reversed(file_path, code_col='code', text_col='text'):
    """Read a SPoC TSV file and return its C++ and pseudocode columns.

    Rows with a missing value in either column are dropped, so the two
    returned lists stay aligned element by element.

    Args:
        file_path: Path to a tab-separated file readable by ``pd.read_csv``.
        code_col: Name of the column holding the C++ lines (source side).
        text_col: Name of the column holding the pseudocode lines (target side).

    Returns:
        A ``(codes, texts)`` tuple of lists: code first, then text.
    """
    frame = pd.read_csv(file_path, sep='\t')

    # Keep only rows where both sides of the pair are present.
    required_columns = [text_col, code_col]
    complete_rows = frame.dropna(subset=required_columns)

    code_lines = complete_rows[code_col].tolist()
    text_lines = complete_rows[text_col].tolist()
    return code_lines, text_lines

# Load the pretrained 'bert-base-uncased' tokenizer. The same tokenizer object is
# later bound to both the C++ (source) and pseudocode (target) tokenizer names.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# String forms of the tokenizer's special tokens, exposed as module-level names.
PAD_TOKEN = bert_tokenizer.pad_token
UNK_TOKEN = bert_tokenizer.unk_token
BOS_TOKEN = bert_tokenizer.cls_token  # the CLS token doubles as the beginning-of-sequence marker
EOS_TOKEN = bert_tokenizer.sep_token  # the SEP token doubles as the end-of-sequence marker


def tokenize_and_numericalize_bert(texts, codes, tokenizer, max_len=128):
    """Encode two parallel lists of strings into fixed-width id tensors.

    Each string is wrapped as ``"<cls> string <sep>"`` using the special tokens
    of the *given* tokenizer, encoded with ``add_special_tokens=False``, then
    truncated and padded to ``max_len`` by ``tokenizer.encode``.

    The parameter names are historical: the function treats both lists the same
    way, and the first returned tensor always corresponds to ``texts``, the
    second to ``codes``. In this notebook the caller passes C++ lines as
    ``texts`` and pseudocode lines as ``codes``.

    NOTE(review): because the end marker is part of the encoded string,
    truncation of an over-long sequence presumably removes it — confirm whether
    that matters for your data before changing ``max_len``.

    Args:
        texts: Iterable of strings for the first side.
        codes: Iterable of strings for the second side; paired with ``texts``
            via ``zip``, so extra items on the longer side are ignored.
        tokenizer: Object exposing ``cls_token``, ``sep_token`` and an
            ``encode(text, add_special_tokens, max_length, truncation, padding)``
            method that returns a list of ids.
        max_len: Width of every encoded row.

    Returns:
        ``(text_ids, code_ids)``: two tensors with one row per pair. For empty
        input both are empty ``(0, max_len)`` long tensors.
    """
    # Use the markers of the tokenizer that was actually passed in, rather than
    # module-level constants that belong to one specific tokenizer instance.
    bos_token = tokenizer.cls_token
    eos_token = tokenizer.sep_token

    def _encode(sequence):
        """Encode one string into a 1-D tensor of ``max_len`` ids."""
        ids = tokenizer.encode(
            bos_token + " " + sequence + " " + eos_token,
            add_special_tokens=False,
            max_length=max_len,
            truncation=True,
            padding='max_length'
        )
        return torch.tensor(ids)

    text_rows = []
    code_rows = []
    for text, code in zip(texts, codes):
        text_rows.append(_encode(text))
        code_rows.append(_encode(code))

    # torch.stack rejects an empty list; return well-formed empty batches instead.
    if not text_rows:
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()

    return torch.stack(text_rows), torch.stack(code_rows)

class SPOCDataset(Dataset):
    """Pairs two index-aligned collections so a DataLoader can batch them.

    Item ``i`` is the tuple ``(text_sequences[i], code_sequences[i])``. The
    class attaches no meaning to the two names; the caller decides which side
    is the source and which the target.
    """

    def __init__(self, text_sequences, code_sequences):
        self.text_sequences, self.code_sequences = text_sequences, code_sequences

    def __len__(self):
        # The first collection defines the dataset size.
        return len(self.text_sequences)

    def __getitem__(self, idx):
        first_item = self.text_sequences[idx]
        second_item = self.code_sequences[idx]
        return first_item, second_item


# --- Main Data Preparation (C++ is the source, pseudocode is the target) ---
train_file_path = "/kaggle/input/spoc-train/spoc-train.tsv"
train_codes, train_texts = load_spoc_data_reversed(train_file_path) # returns (C++ lines, pseudocode lines)

# Encode both sides with the shared BERT tokenizer; rows are padded/truncated to MAX_SEQ_LEN ids.
MAX_SEQ_LEN = 128
train_code_numericalized, train_text_numericalized = tokenize_and_numericalize_bert( # first result = source ids, second = target ids
    train_codes, train_texts, bert_tokenizer, max_len=MAX_SEQ_LEN # C++ goes in the first slot, pseudocode in the second
)

# Wrap the two tensors so each dataset item is a (source ids, target ids) pair.
train_dataset = SPOCDataset(train_code_numericalized, train_text_numericalized) # source first, target second
BATCH_SIZE = 128
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)


# --- Vocabulary Sizes: both sides share the BERT vocabulary, so the two values are equal ---
CODE_VOCAB_SIZE = bert_tokenizer.vocab_size # C++ vocab size (source)
TEXT_VOCAB_SIZE = bert_tokenizer.vocab_size # Pseudocode vocab size (target)


print(f"C++ Vocabulary Size (BERT): {CODE_VOCAB_SIZE}")
print(f"Pseudocode Vocabulary Size (BERT): {TEXT_VOCAB_SIZE}")
print(f"Number of training examples: {len(train_dataset)}")

# Sanity check on one batch. Despite the names, 'text_batch' holds the C++ (source) ids and
# 'code_batch' the pseudocode (target) ids, because the dataset yields source first.
text_batch, code_batch = next(iter(train_dataloader))
print("Example C++ batch shape:", text_batch.shape)
print("Example pseudocode batch shape:", code_batch.shape)


# --- Role-named aliases for the shared tokenizer and vocabulary sizes ---
cpp_tokenizer = bert_tokenizer # C++ tokenizer (source)
ps_tokenizer = bert_tokenizer # Pseudocode tokenizer (target)
cpp_vocab_size = CODE_VOCAB_SIZE # Source vocab size
ps_vocab_size = TEXT_VOCAB_SIZE # Target vocab size


# --- Device Configuration: use the GPU when one is available, otherwise the CPU ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


# --- Transformer Model Implementation (No change - same model architecture) ---
class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target-vocabulary logits.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        trg_vocab_size: Number of rows in the target embedding table and width
            of the output projection.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per attention block.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the position-wise feed-forward blocks.
        dropout: Dropout probability applied to the embedded inputs and passed
            on to every layer.
        max_len: Number of positions in the positional-encoding table.
        src_pad_idx: Id treated as padding in the source. ``None`` (default)
            keeps the previous behaviour of reading
            ``bert_tokenizer.pad_token_id`` from module scope.
        trg_pad_idx: Same as ``src_pad_idx`` for the target side.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, d_model=256, num_heads=8,
                 num_layers=3, d_ff=512, dropout=0.1, max_len=128,
                 src_pad_idx=None, trg_pad_idx=None):
        super().__init__()

        # Sub-modules are registered in a fixed order so parameter ordering
        # (and therefore saved optimizer state) stays compatible.
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)  # source (C++) embedding
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)  # target (pseudocode) embedding
        self.dropout = nn.Dropout(dropout)

        # Sinusoidal positional table: sine on even feature columns, cosine on odd ones.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Registered as a buffer so it follows the module across devices and is
        # stored in the state dict under the key 'pe'.
        self.register_buffer('pe', pe.unsqueeze(0))

        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])

        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])

        # Projects decoder states onto the target vocabulary.
        self.output_layer = nn.Linear(d_model, trg_vocab_size)

        # Embeddings are multiplied by sqrt(d_model) before positions are added.
        self.scale = math.sqrt(d_model)

        # Explicit pad ids make the model usable without the module-level
        # tokenizer; the fallback preserves the original behaviour.
        self.src_pad_idx = bert_tokenizer.pad_token_id if src_pad_idx is None else src_pad_idx
        self.trg_pad_idx = bert_tokenizer.pad_token_id if trg_pad_idx is None else trg_pad_idx

    def make_src_mask(self, src):
        """Return a boolean (batch, 1, 1, src_len) mask that is False at padding positions."""
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """Return a boolean (batch, 1, trg_len, trg_len) mask combining padding and look-ahead masking."""
        trg_len = trg.shape[1]

        # Padding mask: False wherever the key position is a pad token.
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

        # Look-ahead mask: lower-triangular, so position i only sees positions <= i.
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=trg.device)).bool()
        trg_sub_mask = trg_sub_mask.unsqueeze(0).unsqueeze(0)

        return trg_pad_mask & trg_sub_mask

    def _embed(self, tokens, embedding):
        """Scale the token embeddings, add positional encodings, and apply dropout."""
        embedded = embedding(tokens) * self.scale
        embedded = embedded + self.pe[:, :embedded.size(1)]
        return self.dropout(embedded)

    def encode(self, src, src_mask=None):
        """Run the encoder stack on source ids and return its output states.

        ``src_mask`` defaults to ``make_src_mask(src)``.
        """
        if src_mask is None:
            src_mask = self.make_src_mask(src)
        enc_output = self._embed(src, self.src_embedding)
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        return enc_output

    def decode(self, trg, enc_output, trg_mask, src_mask):
        """Run the decoder stack on target ids and return its states (before the vocabulary projection)."""
        dec_output = self._embed(trg, self.trg_embedding)
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, trg_mask, src_mask)
        return dec_output

    def forward(self, src, trg):
        """Return logits of shape (batch, trg_len, trg_vocab_size) for source ids ``src`` and target ids ``trg``."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        enc_output = self.encode(src, src_mask)
        dec_output = self.decode(trg, enc_output, trg_mask, src_mask)

        return self.output_layer(dec_output)

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention over ``num_heads`` parallel heads.

    Args:
        d_model: Width of the query/key/value inputs and of the output.
        num_heads: Number of heads; must divide ``d_model`` evenly.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        self.fc = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

        # Non-persistent buffer: it moves with the module on .to(device), so no
        # per-forward device copy is needed, yet it is not written to the state
        # dict, so existing checkpoints still load.
        self.register_buffer(
            'scale', torch.sqrt(torch.FloatTensor([self.head_dim])), persistent=False
        )

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, head_dim)."""
        return projected.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: Tensors whose first dimension is the batch and last is ``d_model``.
            mask: Optional tensor broadcastable to the (batch, heads, q_len, k_len)
                score tensor; positions where it equals 0 are excluded.

        Returns:
            Tensor of shape (batch, q_len, d_model).
        """
        batch_size = q.shape[0]

        q = self._split_heads(self.q_linear(q), batch_size)
        k = self._split_heads(self.k_linear(k), batch_size)
        v = self._split_heads(self.v_linear(v), batch_size)

        # Raw attention scores, scaled by sqrt(head_dim).
        energy = torch.matmul(q, k.transpose(-2, -1)) / self.scale

        if mask is not None:
            # Use the most negative value the score dtype can hold: a fixed -1e10
            # cannot be represented in half precision.
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        attention = self.dropout(F.softmax(energy, dim=-1))
        output = torch.matmul(attention, v)

        # Merge the heads back into a single d_model-wide vector per position.
        output = output.permute(0, 2, 1, 3).contiguous()
        output = output.view(batch_size, -1, self.d_model)

        return self.fc(output)

class PositionwiseFeedforward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor with the same last dimension."""
        hidden = self.fc1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)


class EncoderLayer(nn.Module):
    """One encoder layer: self-attention, then a feed-forward block.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection), and the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Transform ``x`` using ``mask`` for self-attention; the output has the same shape as ``x``."""
        # Sub-layer 1: self-attention (queries, keys and values are all x).
        attended = self.dropout(self.self_attn(x, x, x, mask))
        after_attention = self.norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward.
        transformed = self.dropout(self.feed_forward(after_attention))
        return self.norm2(after_attention + transformed)

class DecoderLayer(nn.Module):
    """One decoder layer: masked self-attention, encoder attention, feed-forward.

    Every sub-layer output goes through dropout, is added to its input
    (residual connection), and the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.enc_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def _add_and_norm(self, norm, residual, sublayer_out):
        """Apply dropout to a sub-layer output, add the residual, and normalise."""
        return norm(residual + self.dropout(sublayer_out))

    def forward(self, x, enc_output, trg_mask, src_mask):
        """Transform target states ``x`` given the encoder output and both masks."""
        # 1) Self-attention over the target states, restricted by trg_mask.
        x = self._add_and_norm(self.norm1, x, self.self_attn(x, x, x, trg_mask))

        # 2) Attention from the target states over the encoder output, restricted by src_mask.
        x = self._add_and_norm(self.norm2, x, self.enc_attn(x, enc_output, enc_output, src_mask))

        # 3) Position-wise feed-forward.
        return self._add_and_norm(self.norm3, x, self.feed_forward(x))


# --- Model Initialization: C++ is the source vocabulary, pseudocode the target vocabulary ---
# NOTE: these hyperparameter literals are repeated when the checkpoint is saved;
# keep both places in sync when changing them.
model = Transformer(
    src_vocab_size=cpp_vocab_size, # C++ vocab size (source)
    trg_vocab_size=ps_vocab_size, # Pseudocode vocab size (target)
    d_model=256,
    num_heads=8,
    num_layers=3,
    d_ff=512,
    dropout=0.1
).to(device)
# Report the number of trainable parameters.
print(f'Model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad):,} parameters')


# --- Training Setup ---
LEARNING_RATE = 0.0001
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Padding positions in the target are excluded from the loss. cpp_tokenizer and
# ps_tokenizer are the same object, so either one gives the same pad id.
criterion = nn.CrossEntropyLoss(ignore_index=cpp_tokenizer.pad_token_id)

NUM_EPOCHS = 20
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Uses teacher forcing: the decoder receives the target without its last
    token and is scored against the target without its first token.

    Args:
        model: Callable as ``model(src, trg_input)`` returning logits whose last
            dimension is the vocabulary size.
        dataloader: Sized iterable of ``(src, trg)`` tensor pairs.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss taking ``(flat_logits, flat_targets)``.
        device: Device the batches are moved to.

    Returns:
        Sum of per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0

    for src, trg in dataloader:
        src = src.to(device)
        trg = trg.to(device)

        decoder_input = trg[:, :-1]   # everything except the final token
        expected_next = trg[:, 1:]    # everything except the first token

        optimizer.zero_grad()
        logits = model(src, decoder_input)

        # Flatten batch and time so the criterion sees one prediction per row.
        vocab_dim = logits.shape[-1]
        flat_logits = logits.contiguous().view(-1, vocab_dim)
        flat_targets = expected_next.contiguous().view(-1)

        batch_loss = criterion(flat_logits, flat_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

C++ Vocabulary Size (BERT): 30522
Pseudocode Vocabulary Size (BERT): 30522
Number of training examples: 216225
Example C++ batch shape: torch.Size([128, 128])
Example pseudocode batch shape: torch.Size([128, 128])
Using device: cuda
Model initialized with 27,425,082 parameters


# *Training Code*

In [4]:
def train(model, train_dataloader, optimizer, criterion, num_epochs, device):
    """Run ``num_epochs`` training epochs, printing timing and loss after each.

    Each epoch delegates to ``train_epoch`` and then reports the epoch number,
    the wall-clock duration in whole minutes and seconds, the mean training
    loss, and its exponential as perplexity. Nothing is returned.
    """
    for epoch_number in range(1, num_epochs + 1):
        started_at = time.time()
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device)
        elapsed = time.time() - started_at

        # Split the elapsed seconds into whole minutes and leftover whole seconds.
        epoch_mins = int(elapsed / 60)
        epoch_secs = int(elapsed - (epoch_mins * 60))

        print(f'Epoch: {epoch_number:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')


# --- Run Training ---
# Trains for NUM_EPOCHS epochs over train_dataloader, printing time, loss and perplexity per epoch.
# NOTE(review): the recorded output below ends at the "Epoch: 16" timing line although
# NUM_EPOCHS is 20 — confirm the run finished before relying on the "20epoch" artifacts.
train(model, train_dataloader, optimizer, criterion, NUM_EPOCHS, device)

Epoch: 01 | Time: 8m 37s
	Train Loss: 2.363 | Train PPL:  10.618
Epoch: 02 | Time: 8m 36s
	Train Loss: 1.225 | Train PPL:   3.405
Epoch: 03 | Time: 8m 36s
	Train Loss: 1.018 | Train PPL:   2.767
Epoch: 04 | Time: 8m 36s
	Train Loss: 0.904 | Train PPL:   2.469
Epoch: 05 | Time: 8m 36s
	Train Loss: 0.829 | Train PPL:   2.290
Epoch: 06 | Time: 8m 36s
	Train Loss: 0.775 | Train PPL:   2.170
Epoch: 07 | Time: 8m 36s
	Train Loss: 0.733 | Train PPL:   2.081
Epoch: 08 | Time: 8m 35s
	Train Loss: 0.701 | Train PPL:   2.016
Epoch: 09 | Time: 8m 36s
	Train Loss: 0.675 | Train PPL:   1.963
Epoch: 10 | Time: 8m 36s
	Train Loss: 0.652 | Train PPL:   1.919
Epoch: 11 | Time: 8m 35s
	Train Loss: 0.632 | Train PPL:   1.881
Epoch: 12 | Time: 8m 35s
	Train Loss: 0.615 | Train PPL:   1.850
Epoch: 13 | Time: 8m 35s
	Train Loss: 0.600 | Train PPL:   1.822
Epoch: 14 | Time: 8m 35s
	Train Loss: 0.587 | Train PPL:   1.798
Epoch: 15 | Time: 8m 35s
	Train Loss: 0.575 | Train PPL:   1.778
Epoch: 16 | Time: 8m 35s


In [5]:
# --- Save the trained C++ -> pseudocode model and its tokenizer metadata ---
MODEL_SAVE_PATH = 'cpp_to_pseudocode_model_bert_20epoch.pth' # checkpoint file
TOKENIZER_INFO_PATH = "tokenizer_info_cpp_to_ps_bert_20epoch.txt" # plain-text tokenizer description

# The checkpoint bundles the weights, the optimizer state, and the hyperparameters
# needed to rebuild the architecture at load time.
# NOTE: d_model / num_heads / num_layers / d_ff / dropout are repeated here as literals;
# they must match the values passed to Transformer(...) when the model was created.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'src_vocab_size': cpp_vocab_size, # C++ vocab size
    'trg_vocab_size': ps_vocab_size, # Pseudocode vocab size
    'max_length': MAX_SEQ_LEN,
    'd_model': 256,
    'num_heads': 8,
    'num_layers': 3,
    'd_ff': 512,
    'dropout': 0.1
}, MODEL_SAVE_PATH)

print(f"Model saved successfully to: {MODEL_SAVE_PATH}")
# Three "label: value" lines, in this order: C++ vocab size, pseudocode vocab size,
# tokenizer type. The loader reads them back by line position.
with open(TOKENIZER_INFO_PATH, "w") as f:
    f.write(f"C++ vocab size (BERT): {cpp_vocab_size}\n")
    f.write(f"Pseudocode vocab size (BERT): {ps_vocab_size}\n")
    f.write(f"Tokenizer type: bert-base-uncased\n")

print(f"Tokenizer information saved to: {TOKENIZER_INFO_PATH}")





# --- Streamlit Deployment (Will be addressed later) ---

Model saved successfully to: cpp_to_pseudocode_model_bert_20epoch.pth
Tokenizer information saved to: tokenizer_info_cpp_to_ps_bert_20epoch.txt


In [7]:
!pip install transformers



In [6]:
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import time
from transformers import BertTokenizer

# --- Testing Code ---

# --- Paths of the artifacts written by the save step (same file names as used there) ---
MODEL_SAVE_PATH = 'cpp_to_pseudocode_model_bert_20epoch.pth'
TOKENIZER_INFO_PATH = "tokenizer_info_cpp_to_ps_bert_20epoch.txt"

def load_model_and_tokenizer(model_path, tokenizer_info_path):
    """Rebuild the trained model and its tokenizer from the saved artifacts.

    Args:
        model_path: Checkpoint written with ``torch.save`` containing
            ``model_state_dict`` plus the hyperparameters ``src_vocab_size``,
            ``trg_vocab_size``, ``max_length``, ``d_model``, ``num_heads``,
            ``num_layers``, ``d_ff`` and ``dropout``.
        tokenizer_info_path: Text file with three ``label: value`` lines, in
            order: C++ vocab size, pseudocode vocab size, tokenizer type.

    Returns:
        ``(model, cpp_tokenizer, ps_tokenizer, max_length)``. Both tokenizer
        entries are the same object; the model is on the CPU.

    Raises:
        ValueError: If the vocabulary sizes in the info file disagree with the
            ones stored in the checkpoint.
    """
    # Load onto the CPU first; the caller moves the model to its device.
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all this checkpoint holds, and avoids executing arbitrary pickled code.
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'), weights_only=True)

    hyperparameter_keys = (
        'src_vocab_size', 'trg_vocab_size', 'max_length', 'd_model',
        'num_heads', 'num_layers', 'd_ff', 'dropout',
    )
    model_info = {key: checkpoint[key] for key in hyperparameter_keys}

    # Rebuild the architecture exactly as it was trained, then restore the weights.
    model = Transformer(
        src_vocab_size=model_info['src_vocab_size'],
        trg_vocab_size=model_info['trg_vocab_size'],
        d_model=model_info['d_model'],
        num_heads=model_info['num_heads'],
        num_layers=model_info['num_layers'],
        d_ff=model_info['d_ff'],
        dropout=model_info['dropout'],
        max_len=model_info['max_length']
    )
    model.load_state_dict(checkpoint['model_state_dict'])

    # Read the tokenizer description; split on the first ':' only so a value
    # containing a colon is not cut short.
    with open(tokenizer_info_path, "r") as f:
        tokenizer_lines = f.readlines()
    cpp_vocab_size_loaded = int(tokenizer_lines[0].split(":", 1)[1].strip())
    ps_vocab_size_loaded = int(tokenizer_lines[1].split(":", 1)[1].strip())
    tokenizer_type_loaded = tokenizer_lines[2].split(":", 1)[1].strip()

    # Actually perform the verification the info file exists for: a mismatch
    # means the checkpoint and the tokenizer description belong to different runs.
    recorded_sizes = (cpp_vocab_size_loaded, ps_vocab_size_loaded)
    checkpoint_sizes = (model_info['src_vocab_size'], model_info['trg_vocab_size'])
    if recorded_sizes != checkpoint_sizes:
        raise ValueError(
            f"Tokenizer info vocab sizes {recorded_sizes} do not match "
            f"checkpoint vocab sizes {checkpoint_sizes}"
        )

    # Load the tokenizer named in the info file so inference uses what training recorded.
    bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_type_loaded)

    print("Loaded Tokenizer Info:")
    print(f"C++ vocab size: {cpp_vocab_size_loaded}")
    print(f"Pseudocode vocab size: {ps_vocab_size_loaded}")
    print(f"Tokenizer type: {tokenizer_type_loaded}")

    # One shared tokenizer serves as both the C++ and the pseudocode tokenizer.
    return model, bert_tokenizer, bert_tokenizer, model_info['max_length']

# --- Tokenize and Numericalize for Testing ---
def tokenize_and_numericalize_test_bert(text, tokenizer, max_len=128):
    """Encode a single string into a ``(1, max_len)`` tensor of token ids.

    The string is wrapped as ``"<cls> text <sep>"`` using the special tokens of
    the tokenizer that is passed in, then encoded with
    ``add_special_tokens=False`` and truncated/padded to ``max_len`` by
    ``tokenizer.encode``.

    Args:
        text: The string to encode (a C++ line in this notebook).
        tokenizer: Object exposing ``cls_token``, ``sep_token`` and ``encode``.
        max_len: Width of the encoded row.

    Returns:
        A tensor with a leading batch dimension of size 1.
    """
    # Read the markers from the tokenizer argument so the function does not
    # depend on module-level BOS_TOKEN / EOS_TOKEN having been defined earlier.
    marked_text = tokenizer.cls_token + " " + text + " " + tokenizer.sep_token
    encoded_text = tokenizer.encode(
        marked_text,
        add_special_tokens=False,
        max_length=max_len,
        truncation=True,
        padding='max_length'
    )
    return torch.tensor(encoded_text).unsqueeze(0)  # add the batch dimension


# --- Translation / Decoding Function ---
def translate_code_to_pseudocode_bert(model, src_sequence, max_len, device, tokenizer):
    """Greedily decode pseudocode for one encoded C++ sequence.

    Decoding starts from the tokenizer's CLS id and repeatedly appends the
    highest-scoring next token until the SEP id is predicted or ``max_len``
    tokens have been generated. Every step calls ``model(src, trg_so_far)``, so
    the source is re-encoded on each step.

    Args:
        model: Module called as ``model(src, trg)`` returning logits of shape
            (batch, trg_len, vocab).
        src_sequence: Tensor of source ids with a batch dimension of 1.
        max_len: Maximum number of decoding steps.
        device: Device on which decoding runs.
        tokenizer: Object exposing ``cls_token_id``, ``sep_token_id``,
            ``convert_ids_to_tokens`` and ``convert_tokens_to_string``.

    Returns:
        The decoded string without the start and end markers.
    """
    model.eval()  # disable dropout for deterministic decoding
    src_sequence = src_sequence.to(device)

    bos_id = tokenizer.cls_token_id
    eos_id = tokenizer.sep_token_id

    # trg_indexes always begins with BOS and never contains EOS, so the final
    # slice below removes exactly the start marker. (Slicing both ends
    # unconditionally would drop a real token whenever no EOS was produced.)
    trg_indexes = [bos_id]
    with torch.no_grad():
        for _ in range(max_len):
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)  # add batch dim
            output = model(src_sequence, trg_tensor)
            pred_token = output.argmax(2)[:, -1].item()  # best token at the last position

            if pred_token == eos_id:
                break
            trg_indexes.append(pred_token)

    # Map the generated ids (minus BOS) back to text.
    trg_tokens = tokenizer.convert_ids_to_tokens(trg_indexes[1:])
    translated_text = tokenizer.convert_tokens_to_string(trg_tokens)

    return translated_text



In [32]:

# --- Example usage: load the saved model and translate sample C++ snippets ---
if __name__ == '__main__':
    # Use the GPU when one is available, otherwise the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device for testing: {device}")

    # Rebuild the model and tokenizers from the saved artifacts; the model comes back on the CPU.
    model, cpp_tokenizer, ps_tokenizer, max_len = load_model_and_tokenizer(MODEL_SAVE_PATH, TOKENIZER_INFO_PATH)
    model = model.to(device) # Move model to the device

    # C++ snippets to translate.
    # NOTE(review): the recorded output of this cell shows `cout<< i*i;` while the
    # literal below reads `cout<< ii;` — the source and the stored output are out of
    # sync; confirm which input was intended.
    test_cpp_examples = ["for(int i = 0; i < n; i++) { cout<< ii;}"]

    # Encode each snippet, decode its pseudocode, and print the pair.
    print("\n--- Testing Translations ---")
    for cpp_code in test_cpp_examples:
        numericalized_cpp = tokenize_and_numericalize_test_bert(cpp_code, cpp_tokenizer, max_len)
        predicted_pseudocode = translate_code_to_pseudocode_bert(model, numericalized_cpp, max_len, device, ps_tokenizer)

        print(f"\nC++ Code:\n`c++\n{cpp_code}\n`")
        print(f"Predicted Pseudocode:\n`pseudocode\n{predicted_pseudocode}\n`")
        print("-" * 50)

    print("\nTesting complete.")

Using device for testing: cuda


  checkpoint = torch.load(model_path, map_location=torch.device('cpu')) # Load to CPU first


Loaded Tokenizer Info:
C++ vocab size: 30522
Pseudocode vocab size: 30522
Tokenizer type: bert-base-uncased

--- Testing Translations ---

C++ Code:
`c++
for(int i = 0; i < n; i++) { cout<< i*i;}
`
Predicted Pseudocode:
`pseudocode
for i = 0 to n exclusive , print * i
`
--------------------------------------------------

Testing complete.
