<div style="text-align: center; font-size:20px; color:Green;">
    Transformer-Based Model
</div>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CommitBench dataset, reading only the columns needed for training
# ('diff' as input, 'message' as target) instead of the whole file.
# NOTE(review): absolute, machine-specific path — point this at a configurable
# data directory before sharing the notebook.
commitbench_df = pd.read_csv(
    'C:/Users/salij/Desktop/THESIS/commitbench.csv',
    usecols=['diff', 'message'],
)[['diff', 'message']]  # usecols keeps file order; re-select to fix column order

# Drop rows with a missing diff or message: the dataset class converts values
# with str(), which would otherwise turn NaN into the literal text "nan".
commitbench_df = commitbench_df.dropna(subset=['diff', 'message'])

# Reduce the dataset to 1/8 of the original size (seeded for reproducibility)
commitbench_df = commitbench_df.sample(frac=0.125, random_state=42)

# Split into training and validation sets (10% validation)
train_bench, val_bench = train_test_split(commitbench_df, test_size=0.1, random_state=42)

# Save preprocessed data
train_bench.to_csv('train_bench.csv', index=False)
val_bench.to_csv('val_bench.csv', index=False)


In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset

In [3]:
class CommitMessageDataset(Dataset):
    """Map-style dataset pairing commit diffs (inputs) with commit messages (targets).

    Items are tokenized lazily on access and padded/truncated to fixed lengths,
    so every item has the same tensor shapes and can be batched directly.

    Args:
        data: Table-like object with 'diff' and 'message' columns, each
            supporting ``.tolist()`` (e.g. a pandas DataFrame).
        tokenizer: Callable tokenizer that accepts a list of strings plus
            ``max_length``/``padding``/``truncation``/``return_tensors`` and
            returns a mapping with 'input_ids' and 'attention_mask'.
        source_max_len: Number of tokens each diff is padded/truncated to.
        target_max_len: Number of tokens each message is padded/truncated to.
    """

    def __init__(self, data, tokenizer, source_max_len, target_max_len):
        self.diff = data['diff'].tolist()  # Extract 'diff' column as a list
        self.message = data['message'].tolist()  # Extract 'message' column as a list
        self.tokenizer = tokenizer
        self.source_max_len = source_max_len
        self.target_max_len = target_max_len

    def __len__(self):
        """Return the number of (diff, message) pairs."""
        return len(self.diff)

    def _encode(self, text, max_length):
        """Tokenize one string into a batch of size 1 with exactly ``max_length`` tokens."""
        # Calling the tokenizer directly replaces the deprecated batch_encode_plus.
        return self.tokenizer(
            [text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return the tokenized pair at ``index``.

        Returns:
            dict with 1-D tensors 'source_ids' and 'source_mask' (length
            ``source_max_len``) and 'target_ids' (length ``target_max_len``).
        """
        # str() guards against non-string cell values
        source = self._encode(str(self.diff[index]), self.source_max_len)
        target = self._encode(str(self.message[index]), self.target_max_len)

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # drop the sequence axis whenever a max length is 1.
        return {
            'source_ids': source['input_ids'].squeeze(0),
            'source_mask': source['attention_mask'].squeeze(0),
            'target_ids': target['input_ids'].squeeze(0),
        }


In [4]:
# Token budgets: both the source and the target sequence are capped at 128 tokens
source_max_len = target_max_len = 128

# Pretrained checkpoint shared by the seq2seq model and its tokenizer
model_name = 't5-small'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)

In [5]:
# Wrap each split in a CommitMessageDataset (same tokenizer and length caps for both)
train_dataset = CommitMessageDataset(
    data=train_bench,
    tokenizer=tokenizer,
    source_max_len=source_max_len,
    target_max_len=target_max_len,
)
val_dataset = CommitMessageDataset(
    data=val_bench,
    tokenizer=tokenizer,
    source_max_len=source_max_len,
    target_max_len=target_max_len,
)

# Batch both splits; only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)

In [6]:
# transformers' own AdamW is deprecated (and missing from recent releases);
# import the PyTorch implementation under the same name instead.
from torch.optim import AdamW

# Set up optimizer. eps and weight_decay are pinned to the defaults of the
# former transformers implementation, because torch's defaults differ
# (eps=1e-8, weight_decay=0.01) and would silently change the training setup.
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Training function
def train_model(model, train_loader, val_loader, epochs=3):
    """Fine-tune ``model`` on ``train_loader`` and print the mean loss per epoch.

    Parameter updates use the module-level ``optimizer``. The model is moved
    to the GPU when one is available, otherwise it stays on the CPU.

    Args:
        model: Seq2seq model whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute. If ``model.config.pad_token_id`` is set, label
            positions holding that id are excluded from the loss.
        train_loader: Sized iterable of batches; each batch is a dict with
            'source_ids', 'source_mask' and 'target_ids' tensors.
        val_loader: Optional sized iterable of batches in the same format.
            When given and non-empty, the mean validation loss is printed
            after every epoch.
        epochs: Number of passes over ``train_loader``.
    """
    # Resolve the device once and make sure the model lives on it; sending
    # only the batches to the GPU would fail with a device mismatch.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    pad_token_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    def batch_loss(batch):
        """Run one forward pass and return the loss tensor for ``batch``."""
        input_ids = batch['source_ids'].to(device)
        attention_mask = batch['source_mask'].to(device)
        labels = batch['target_ids'].to(device)
        if pad_token_id is not None:
            # -100 is ignored by the cross-entropy loss, so padding does not
            # contribute; masked_fill returns a copy and leaves the batch intact.
            labels = labels.masked_fill(labels == pad_token_id, -100)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss

    for epoch in range(epochs):
        model.train()  # re-enable training mode after any validation pass
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()

            # Forward pass
            loss = batch_loss(batch)

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}")

        if val_loader is not None and len(val_loader) > 0:
            model.eval()
            with torch.no_grad():
                val_loss = sum(batch_loss(batch).item() for batch in val_loader)
            model.train()
            print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss / len(val_loader)}")




In [None]:
# Kick off fine-tuning for three epochs
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=3,
)