<div style="text-align: center; font-size:20px; color:Green;">
    RNN-Based model
</div>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CommitBench dataset
commitbench_df = pd.read_csv('C:/Users/salij/Desktop/THESIS/commitbench.csv')

# Extract necessary columns for training ('diff' as input, 'message' as target)
commitbench_df = commitbench_df[['diff', 'message']]

# Use 1/4 of the dataset
commitbench_df = commitbench_df.sample(frac=0.125, random_state=42)

# Split into training and validation sets
train_bench, val_bench = train_test_split(commitbench_df, test_size=0.1, random_state=42)

# Save preprocessed data
train_bench.to_csv('train_bench.csv', index=False)
val_bench.to_csv('val_bench.csv', index=False)


In [2]:
from collections import Counter
import torch

# Define tokenizer
def tokenize(text):
    return text.split()

# Build vocabulary
def build_vocab(text_data):
    counter = Counter()
    for sentence in text_data:
        counter.update(tokenize(sentence))
    vocab = {word: idx for idx, (word, _) in enumerate(counter.most_common())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

# Build vocabularies
diff_vocab = build_vocab(train_bench['diff'])
message_vocab = build_vocab(train_bench['message'])

# Tokenize and numericalize
def numericalize(vocab, text):
    return torch.tensor([vocab.get(token, vocab['<unk>']) for token in tokenize(text)], dtype=torch.long)

# Apply to training and validation data
train_bench['diff'] = train_bench['diff'].apply(lambda x: numericalize(diff_vocab, x))
train_bench['message'] = train_bench['message'].apply(lambda x: numericalize(message_vocab, x))
val_bench['diff'] = val_bench['diff'].apply(lambda x: numericalize(diff_vocab, x))
val_bench['message'] = val_bench['message'].apply(lambda x: numericalize(message_vocab, x))

# Save preprocessed data
train_bench.to_csv('train_bench.csv', index=False)
val_bench.to_csv('val_bench.csv', index=False)


In [3]:
from torch.utils.data import Dataset, DataLoader

class CommitDataset(Dataset):
    def __init__(self, diffs, messages):
        self.diffs = diffs
        self.messages = messages

    def __len__(self):
        return len(self.diffs)

    def __getitem__(self, idx):
        return self.diffs[idx], self.messages[idx]

def collate_batch(batch):
    diffs, messages = zip(*batch)
    diff_lengths = [len(diff) for diff in diffs]
    message_lengths = [len(message) for message in messages]
    max_diff_length = max(diff_lengths)
    max_message_length = max(message_lengths)

    diff_padded = torch.zeros((len(diffs), max_diff_length), dtype=torch.long)
    message_padded = torch.zeros((len(messages), max_message_length), dtype=torch.long)

    for i, (diff, message) in enumerate(zip(diffs, messages)):
        diff_padded[i, :len(diff)] = diff
        message_padded[i, :len(message)] = message

    return diff_padded, message_padded

# Convert lists of tensors to Dataset
train_dataset = CommitDataset(train_bench['diff'].tolist(), train_bench['message'].tolist())
val_dataset = CommitDataset(val_bench['diff'].tolist(), val_bench['message'].tolist())

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)


In [4]:
import torch
import torch.nn as nn

class Seq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, hidden_dim):
        super(Seq2Seq, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, trg):
        embedded_src = self.embedding(src)
        encoder_output, (hidden, cell) = self.encoder(embedded_src)
        embedded_trg = self.embedding(trg)
        decoder_output, _ = self.decoder(embedded_trg, (hidden, cell))
        output = self.fc(decoder_output)
        return output

# Define model parameters
input_dim = len(diff_vocab)  # Vocabulary size for 'diff'
output_dim = len(message_vocab)  # Vocabulary size for 'message'
emb_dim = 256
hidden_dim = 512

# Instantiate the model
model = Seq2Seq(input_dim, output_dim, emb_dim, hidden_dim)


In [5]:
import torch.optim as optim
import torch.nn.functional as F

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=diff_vocab['<pad>'])  # Use appropriate ignore_index

def train_model(model, train_loader, val_loader, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for src, trg in train_loader:
            optimizer.zero_grad()
            output = model(src, trg[:, :-1])
            output_dim = output.shape[-1]
            trg = trg[:, 1:].contiguous().view(-1)
            output = output.view(-1, output_dim)
            loss = criterion(output, trg)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')

    print("Training complete")

train_model(model, train_loader, val_loader, num_epochs=10)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1598945280 bytes.