**Attempt 2**

In [1]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.42.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.2-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m88.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.42.2


In [1]:
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import time
import streamlit as st
from transformers import BertTokenizer  # Import BERT tokenizer

ModuleNotFoundError: No module named 'pandas'

In [4]:


# --- Data Loading and Preprocessing ---
def load_spoc_data(file_path, text_col='text', code_col='code'):
    """Read a tab-separated SPoC file and return parallel pseudocode/code lists.

    Args:
        file_path: Location of the TSV file, passed straight to ``pd.read_csv``.
        text_col: Name of the column holding the pseudocode line.
        code_col: Name of the column holding the matching code line.

    Returns:
        A ``(texts, codes)`` tuple of two lists built from the rows in which
        neither column is missing, in file order.
    """
    wanted = [text_col, code_col]
    # A row with a missing value in either column cannot form a training pair.
    complete_rows = pd.read_csv(file_path, sep='\t').dropna(subset=wanted)
    texts, codes = (complete_rows[column].tolist() for column in wanted)
    return texts, codes

# Load the pretrained 'bert-base-uncased' WordPiece tokenizer. The same tokenizer
# object is used later in this notebook for both the pseudocode and the code side.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Special-token STRINGS taken from the tokenizer (the matching integer ids are the
# tokenizer's `*_token_id` attributes). They are concatenated with raw text below,
# so they must stay strings.
PAD_TOKEN = bert_tokenizer.pad_token
UNK_TOKEN = bert_tokenizer.unk_token
BOS_TOKEN = bert_tokenizer.cls_token  # BERT's CLS token doubles as the begin-of-sequence marker
EOS_TOKEN = bert_tokenizer.sep_token  # BERT's SEP token doubles as the end-of-sequence marker


def tokenize_and_numericalize_bert(texts, codes, tokenizer, max_len=128):
    """Encode parallel pseudocode/code strings into fixed-length id tensors.

    Every string is wrapped in the tokenizer's own begin/end markers
    (``[CLS]`` ... ``[SEP]`` for BERT), truncated to ``max_len`` and padded to
    exactly ``max_len`` ids.

    The markers are added by the tokenizer (``add_special_tokens=True``) rather
    than concatenated into the text: a manually appended end marker is ordinary
    content as far as truncation is concerned, so it is cut off for sequences
    longer than ``max_len``. For sequences that fit, the ids are the same as
    with the manual concatenation.

    Args:
        texts: Iterable of pseudocode strings.
        codes: Iterable of code strings, aligned with ``texts``. Pairs are
            formed with ``zip``, so extra items in the longer input are ignored.
        tokenizer: Object with a Hugging Face style ``encode`` method.
        max_len: Length every encoded sequence is truncated/padded to.

    Returns:
        ``(text_ids, code_ids)``: two ``torch.long`` tensors of shape
        ``(num_pairs, max_len)``. Both have shape ``(0, max_len)`` when there
        are no pairs.
    """
    def _encode(sentence):
        # One place for the encoding options so both sides are treated alike.
        return tokenizer.encode(
            sentence,
            add_special_tokens=True,  # tokenizer adds the begin/end markers and keeps them when truncating
            max_length=max_len,
            truncation=True,
            padding='max_length',  # pad to max_len so rows can form one tensor
        )

    text_ids, code_ids = [], []
    for text, code in zip(texts, codes):
        text_ids.append(_encode(text))
        code_ids.append(_encode(code))

    if not text_ids:
        # torch.tensor([]) would be 1-D float; keep the documented shape/dtype.
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()

    text_sequences_padded = torch.tensor(text_ids, dtype=torch.long)
    code_sequences_padded = torch.tensor(code_ids, dtype=torch.long)
    return text_sequences_padded, code_sequences_padded

class SPOCDataset(Dataset):
    """Map-style dataset pairing pseudocode sequences with code sequences.

    Item ``i`` is the tuple ``(text_sequences[i], code_sequences[i])``. The
    reported length is the length of ``text_sequences``.
    """

    def __init__(self, text_sequences, code_sequences):
        # Both containers are stored as given: no copy and no validation.
        self.text_sequences, self.code_sequences = text_sequences, code_sequences

    def __len__(self):
        sources = self.text_sequences
        return len(sources)

    def __getitem__(self, idx):
        source = self.text_sequences[idx]
        target = self.code_sequences[idx]
        return source, target


# --- Main Data Preparation ---
train_file_path = '/kaggle/input/spoc-train-tsv/spoc-train.tsv' # SPoC training TSV, read from the Kaggle input directory
train_texts, train_codes = load_spoc_data(train_file_path)


# Numericalize and pad the whole training set up front with the shared BERT tokenizer
MAX_SEQ_LEN = 128 # Length every sequence is truncated/padded to; equals the Transformer's default max_len (positional table size)
train_text_numericalized, train_code_numericalized = tokenize_and_numericalize_bert(
    train_texts, train_codes, bert_tokenizer, max_len=MAX_SEQ_LEN
)

# Wrap the two id tensors in a Dataset and serve them in shuffled mini-batches
train_dataset = SPOCDataset(train_text_numericalized, train_code_numericalized)
BATCH_SIZE = 128 # Number of (pseudocode, code) pairs per batch
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)


# --- Vocabulary Sizes ---
TEXT_VOCAB_SIZE = bert_tokenizer.vocab_size
CODE_VOCAB_SIZE = bert_tokenizer.vocab_size # Same tokenizer on both sides, so source and target vocab sizes are equal
# Sharing one natural-language tokenizer between pseudocode and code is a simplification; a separate code tokenizer could be substituted.

print(f"Text Vocabulary Size (BERT): {TEXT_VOCAB_SIZE}")
print(f"Code Vocabulary Size (BERT): {CODE_VOCAB_SIZE}")
print(f"Number of training examples: {len(train_dataset)}")

# Sanity check: pull one batch and show its shape
text_batch, code_batch = next(iter(train_dataloader))
print("Example text batch shape:", text_batch.shape) # [Batch_size, Seq_len]
print("Example code batch shape:", code_batch.shape) # [Batch_size, Seq_len]


# --- Aliases used by the model/training cells below ---
# NOTE(review): the `ar_`/`en_` prefixes do not describe this task (pseudocode -> C++);
# presumably they were carried over from reference translation code. Here `ar_*` is the
# source (pseudocode) side and `en_*` is the target (code) side.
ar_tokenizer = bert_tokenizer # Source side: pseudocode
en_tokenizer = bert_tokenizer # Target side: C++ code (same tokenizer object)
ar_vocab_size = TEXT_VOCAB_SIZE
en_vocab_size = CODE_VOCAB_SIZE


# --- Device Configuration ---
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


# --- Transformer Model Implementation (Reference Code - Corrected _init_ to __init__) ---
class Transformer(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target-vocabulary logits.

    Token embeddings are scaled by ``sqrt(d_model)``, summed with a fixed
    sinusoidal positional table and passed through ``num_layers`` encoder
    layers and ``num_layers`` decoder layers, followed by a linear projection
    onto the target vocabulary.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        trg_vocab_size: Number of rows in the target embedding table and size
            of the output projection.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the position-wise feed-forward blocks.
        dropout: Dropout probability.
        max_len: Number of positions in the positional table, i.e. the longest
            sequence ``forward`` accepts.
        src_pad_idx: Padding id of the source side. ``None`` (default) keeps the
            previous behaviour of reading the module-level
            ``bert_tokenizer.pad_token_id``.
        trg_pad_idx: Padding id of the target side; same default rule.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, d_model=256, num_heads=8,
                 num_layers=3, d_ff=512, dropout=0.1, max_len=128,
                 src_pad_idx=None, trg_pad_idx=None):
        super(Transformer, self).__init__()

        # Embedding layers
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.dropout = nn.Dropout(dropout)

        # Sinusoidal positional encoding: sine on even feature indices, cosine on odd ones.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, d_model], broadcast over the batch
        # A buffer follows the module across .to(device) and is saved in the state_dict.
        self.register_buffer('pe', pe)

        # Encoder and decoder layers
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])

        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])

        # Output layer
        self.output_layer = nn.Linear(d_model, trg_vocab_size)

        # Other parameters
        self.scale = math.sqrt(d_model)
        # Explicit pad ids win; otherwise fall back to the notebook's shared BERT tokenizer.
        self.src_pad_idx = bert_tokenizer.pad_token_id if src_pad_idx is None else src_pad_idx
        self.trg_pad_idx = bert_tokenizer.pad_token_id if trg_pad_idx is None else trg_pad_idx

    def make_src_mask(self, src):
        """Return a bool mask of shape [batch, 1, 1, src_len]: True for tokens, False for padding."""
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """Return a bool mask of shape [batch, 1, trg_len, trg_len].

        Entry ``[b, 0, i, j]`` is True when key position ``j`` is not padding
        and ``j <= i`` (no look-ahead).
        """
        trg_len = trg.shape[1]

        # Padding mask over key positions (True for tokens, False for padding)
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

        # Look-ahead mask (lower triangular matrix)
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=trg.device)).bool()
        trg_sub_mask = trg_sub_mask.unsqueeze(0).unsqueeze(0)

        return trg_pad_mask & trg_sub_mask

    def _embed(self, tokens, embedding):
        """Embed ``tokens`` [batch, seq_len], add positional encodings and apply dropout."""
        seq_len = tokens.size(1)
        max_positions = self.pe.size(1)
        if seq_len > max_positions:
            # Without this check the addition below fails with an opaque broadcast error.
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional table size {max_positions}"
            )
        embedded = embedding(tokens) * self.scale
        return self.dropout(embedded + self.pe[:, :seq_len])

    def forward(self, src, trg):
        """Compute target-vocabulary logits.

        Args:
            src: Source token ids, [batch, src_len].
            trg: Decoder input token ids, [batch, trg_len].

        Returns:
            Logits of shape [batch, trg_len, trg_vocab_size].

        Raises:
            ValueError: If ``src`` or ``trg`` is longer than ``max_len``.
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        # Encoder
        enc_output = self._embed(src, self.src_embedding)
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # Decoder
        dec_output = self._embed(trg, self.trg_embedding)
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, trg_mask, src_mask)

        return self.output_layer(dec_output)

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split over ``num_heads`` heads.

    Args:
        d_model: Width of the inputs and of the output; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        self.fc = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        # Plain Python float: a CPU tensor here would have to be copied to the
        # input's device on every forward call, and registering a buffer would
        # add a key to the state_dict and break existing checkpoints.
        self.scale = math.sqrt(self.head_dim)

    def _split_heads(self, projected, batch_size):
        """Reshape [batch, seq, d_model] to [batch, heads, seq, head_dim]."""
        return projected.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k``/``v``.

        Args:
            q: Queries, [batch, q_len, d_model].
            k: Keys, [batch, k_len, d_model].
            v: Values, [batch, k_len, d_model].
            mask: Optional tensor broadcastable to [batch, heads, q_len, k_len];
                positions where it equals 0 receive no attention weight.

        Returns:
            Tensor of shape [batch, q_len, d_model].
        """
        batch_size = q.shape[0]

        # Linear projections and reshape
        q = self._split_heads(self.q_linear(q), batch_size)
        k = self._split_heads(self.k_linear(k), batch_size)
        v = self._split_heads(self.v_linear(v), batch_size)

        # Attention scores, [batch, heads, q_len, k_len]
        energy = torch.matmul(q, k.transpose(-2, -1)) / self.scale

        if mask is not None:
            # Most negative finite value of the score dtype: acts like -inf in the
            # softmax and, unlike a literal -1e10, is representable in float16.
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        attention = self.dropout(F.softmax(energy, dim=-1))
        output = torch.matmul(attention, v)

        # Merge the heads back: [batch, q_len, d_model]
        output = output.permute(0, 2, 1, 3).contiguous()
        output = output.view(batch_size, -1, self.d_model)

        # Final linear projection
        return self.fc(output)

# Position-wise Feed Forward Network
class PositionwiseFeedforward(nn.Module):
    """Two-layer MLP applied to the last dimension.

    Computes ``fc2(dropout(relu(fc1(x))))`` with ``fc1: d_model -> d_ff`` and
    ``fc2: d_ff -> d_model``.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expand
        self.fc2 = nn.Linear(d_ff, d_model)   # project back
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

# Encoder Layer
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x``; ``mask`` is handed to the self-attention."""
        # Sub-layer 1: self-attention, residual connection, layer norm.
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.norm1(x + attended)

        # Sub-layer 2: feed-forward, residual connection, layer norm.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm2(x + transformed)

# Decoder Layer
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder attention, feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.enc_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, trg_mask, src_mask):
        """Apply the block.

        ``trg_mask`` is handed to the self-attention over ``x``; ``src_mask``
        is handed to the attention whose keys and values are ``enc_output``.
        """
        # Sub-layer 1: self-attention over the decoder states.
        own_context = self.dropout(self.self_attn(x, x, x, trg_mask))
        x = self.norm1(x + own_context)

        # Sub-layer 2: queries from the decoder, keys/values from the encoder.
        memory_context = self.dropout(self.enc_attn(x, enc_output, enc_output, src_mask))
        x = self.norm2(x + memory_context)

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm3(x + transformed)


# --- Model Initialization ---
# Build the model on the selected device. The hyperparameters are spelled out here
# and equal the Transformer constructor defaults, which the reload cell relies on.
model = Transformer(
    src_vocab_size=ar_vocab_size,
    trg_vocab_size=en_vocab_size,
    d_model=256,
    num_heads=8,
    num_layers=3,
    d_ff=512,
    dropout=0.1
).to(device)

# Count trainable parameters only
print(f'Model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad):,} parameters')
# --- Training Setup ---
LEARNING_RATE = 0.0001  # Adam step size; tune as needed
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=ar_tokenizer.pad_token_id) # Padding targets add nothing to the loss; ar_/en_ tokenizers are the same object

NUM_EPOCHS = 20 # Number of full passes over the training data

def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one teacher-forced training pass and return the mean batch loss.

    For every ``(src, trg)`` batch the decoder is fed ``trg[:, :-1]`` and is
    trained to predict ``trg[:, 1:]`` (the same sequence shifted left by one).

    Args:
        model: Callable as ``model(src, decoder_input)`` returning logits of
            shape [batch, trg_len - 1, vocab].
        dataloader: Iterable of ``(src, trg)`` id-tensor batches. It does not
            need to support ``len()``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(logits [N, vocab], targets [N])``.
        device: Device the batches are moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches (float).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()  # enable dropout etc.
    total_loss = 0.0
    num_batches = 0

    for src, trg in dataloader:
        src, trg = src.to(device), trg.to(device)

        # Drop the last position for the decoder input and the first position
        # (BOS) for the targets, so position t is trained to predict token t+1.
        decoder_input = trg[:, :-1]
        expected = trg[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)

        # Flatten to [batch * (trg_len - 1), vocab] and [batch * (trg_len - 1)].
        loss = criterion(logits.reshape(-1, logits.shape[-1]), expected.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches; cannot compute an average loss")

    return total_loss / num_batches

def train(model, train_dataloader, optimizer, criterion, num_epochs, device):
    """Train for ``num_epochs`` epochs, printing timing and loss after each one.

    Every epoch is delegated to ``train_epoch``. The report shows the
    wall-clock time as whole minutes and seconds, the mean training loss and
    its exponential (perplexity). Nothing is returned.
    """
    for epoch in range(num_epochs):
        started_at = time.time()
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device)
        elapsed = time.time() - started_at

        # Split the elapsed seconds into whole minutes plus leftover whole seconds.
        epoch_mins = int(elapsed / 60)
        epoch_secs = int(elapsed - (epoch_mins * 60))

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')



Text Vocabulary Size (BERT): 30522
Code Vocabulary Size (BERT): 30522
Number of training examples: 216225
Example text batch shape: torch.Size([128, 128])
Example code batch shape: torch.Size([128, 128])
Using device: cuda
Model initialized with 27,425,082 parameters


In [5]:
# --- Run Training ---
# Long-running cell: the recorded output below shows roughly 8.5 minutes per epoch.
train(model, train_dataloader, optimizer, criterion, NUM_EPOCHS, device)

Epoch: 01 | Time: 8m 37s
	Train Loss: 1.610 | Train PPL:   5.002
Epoch: 02 | Time: 8m 38s
	Train Loss: 0.670 | Train PPL:   1.955
Epoch: 03 | Time: 8m 39s
	Train Loss: 0.528 | Train PPL:   1.695
Epoch: 04 | Time: 8m 39s
	Train Loss: 0.449 | Train PPL:   1.566
Epoch: 05 | Time: 8m 39s
	Train Loss: 0.396 | Train PPL:   1.486
Epoch: 06 | Time: 8m 39s
	Train Loss: 0.358 | Train PPL:   1.430
Epoch: 07 | Time: 8m 38s
	Train Loss: 0.329 | Train PPL:   1.390
Epoch: 08 | Time: 8m 38s
	Train Loss: 0.307 | Train PPL:   1.360
Epoch: 09 | Time: 8m 38s
	Train Loss: 0.289 | Train PPL:   1.336
Epoch: 10 | Time: 8m 38s
	Train Loss: 0.274 | Train PPL:   1.315
Epoch: 11 | Time: 8m 39s
	Train Loss: 0.261 | Train PPL:   1.298
Epoch: 12 | Time: 8m 38s
	Train Loss: 0.250 | Train PPL:   1.284
Epoch: 13 | Time: 8m 37s
	Train Loss: 0.241 | Train PPL:   1.272
Epoch: 14 | Time: 8m 37s
	Train Loss: 0.232 | Train PPL:   1.262
Epoch: 15 | Time: 8m 37s
	Train Loss: 0.225 | Train PPL:   1.253
Epoch: 16 | Time: 8m 38s


In [6]:
# Function to save the trained model
def save_model(model, path="spoc_transformer_1.pth"):
    """Write ``model``'s state_dict (not the whole module) to ``path`` and report it."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved at {path}")

# Persist the trained weights to the default path once training has finished
save_model(model)


Model saved at spoc_transformer_1.pth


In [7]:
# Function to load the trained model
def load_model(model, path="spoc_transformer_1.pth"):
    """Load a saved state_dict from ``path`` into ``model`` and switch it to eval mode.

    The checkpoint tensors are mapped onto the device ``model`` already lives
    on, so a checkpoint written on a GPU can be loaded on a CPU-only machine.
    ``weights_only=True`` restricts unpickling to tensors and plain containers,
    which is all a state_dict holds, and avoids running arbitrary pickled code.

    Args:
        model: Module whose architecture matches the checkpoint.
        path: Checkpoint file written by ``torch.save(model.state_dict(), path)``.

    Returns:
        The same ``model`` object, with the weights loaded and in eval mode.
    """
    first_param = next(model.parameters(), None)
    # A parameter-less module has no device of its own; fall back to the CPU.
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    state_dict = torch.load(path, map_location=target_device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    print(f"Model loaded from {path}")
    return model

# Rebuild the architecture and load the saved weights into it.
# Only the vocabulary sizes are passed, so every other hyperparameter comes from the
# Transformer constructor defaults; these must equal the values used for training or
# load_state_dict will reject the checkpoint.
model = Transformer(src_vocab_size=TEXT_VOCAB_SIZE, trg_vocab_size=CODE_VOCAB_SIZE)
model.to(device)
model = load_model(model)


Model loaded from spoc_transformer_1.pth


  model.load_state_dict(torch.load(path))


In [8]:
def generate_code(model, tokenizer, input_text, max_len=128):
    """Greedily decode code for one pseudocode string.

    The input is encoded with the tokenizer's own begin/end markers, padded to
    ``max_len`` and fed to ``model`` together with the tokens generated so far.
    At every step the highest-scoring next token is appended, until the
    tokenizer's SEP token is produced or ``max_len`` tokens have been generated.

    The function depends only on its arguments: tensors are created on the
    device of ``model``'s parameters, and the special tokens come from
    ``tokenizer``, not from module-level globals.

    Args:
        model: Callable as ``model(src_ids, trg_ids)`` returning logits of
            shape [batch, trg_len, vocab]. It is switched to eval mode.
        tokenizer: Hugging Face style tokenizer providing ``encode``,
            ``decode``, ``cls_token_id`` and ``sep_token_id``.
        input_text: Pseudocode to translate.
        max_len: Encoder input length and maximum number of generated tokens.
            It must not exceed the number of positions the model supports.

    Returns:
        The decoded string of generated tokens, without special tokens.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    # A parameter-less model has no device of its own; fall back to the CPU.
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        # The tokenizer adds the begin/end markers itself, so the end marker
        # survives truncation of an over-long input.
        input_ids = tokenizer.encode(input_text,
                                     add_special_tokens=True,
                                     max_length=max_len,
                                     truncation=True,
                                     padding='max_length')
        input_tensor = torch.tensor(input_ids, dtype=torch.long, device=run_device).unsqueeze(0)

        # The decoder starts from the begin marker (BERT's CLS token).
        output_tokens = [tokenizer.cls_token_id]
        for _ in range(max_len):
            output_tensor = torch.tensor(output_tokens, dtype=torch.long, device=run_device).unsqueeze(0)
            predictions = model(input_tensor, output_tensor)
            # Only the logits of the last position decide the next token.
            next_token = predictions[0, -1].argmax(dim=-1).item()

            if next_token == tokenizer.sep_token_id:  # end marker: stop generating
                break

            output_tokens.append(next_token)

        # Drop the leading begin marker before turning ids back into text.
        return tokenizer.decode(output_tokens[1:], skip_special_tokens=True)




In [66]:
# Example: a counting loop that prints the square of each index
test_pseudocode = "for i from 1 to n: print i*i"
generated_cpp_code = generate_code(model, bert_tokenizer, test_pseudocode)
print("Generated C++ Code:\n", generated_cpp_code)

Generated C++ Code:
 for ( int i = 1 ; i < = n ; i + + ) cout < < i * i < < endl ;


In [51]:
# Example: a loop that prints array elements by index
test_pseudocode = "for i from 1 to 10 print name array i index"
generated_cpp_code = generate_code(model, bert_tokenizer, test_pseudocode)
print("Generated C++ Code:\n", generated_cpp_code)

Generated C++ Code:
 for ( int i = 1 ; i < = 10 ; i + + ) { cout < < name [ i ] < < endl ; }


In [75]:
# Example: a whole function definition. This is harder than the single-statement
# examples; the recorded output for it stops after the function header.
test_pseudocode = "function max(a, b): if a > b then return a else return b"
generated_cpp_code = generate_code(model, bert_tokenizer, test_pseudocode)
print("Generated C++ Code:\n", generated_cpp_code)

Generated C++ Code:
 void max ( int a, b ) {
