In [2]:
pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Obtaining dependency information for sentencepiece from https://files.pythonhosted.org/packages/fb/12/2f5c8d4764b00033cf1c935b702d3bb878d10be9f0b87f0253495832d85f/sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
import pandas as pd  # Add this line to import pandas
# Define your CSV file path
csv_file = "/student/mjr175/commentGeneration/Train_4_Lang/Java1000Train.csv"


# Create a custom dataset class
class SentenceConversionDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        gptComment, groundTruth = self.data.loc[idx, "gptComment"], self.data.loc[idx, "groundTruth"]
        inputs = self.tokenizer.encode_plus(
            f"convert gptComment to groundTruth: {gptComment}",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        labels = self.tokenizer.encode(
            groundTruth,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels.squeeze()
        }

# Load T5 model and tokenizer
model_name = "t5-small"  # or "t5-large", "t5-3b", "t5-11b", "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Define your batch size and maximum sequence length
batch_size = 8
max_length = 256

# Create DataLoader for training
train_dataset = SentenceConversionDataset(csv_file, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=1e-4)

# Fine-tuning loop
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
torch.cuda.empty_cache()
model.train()

for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}, Batch loss: {loss.item()}")


# Save the fine-tuned model
model.save_pretrained("fine_tuned_t5")
tokenizer.save_pretrained("fine_tuned_t5")




Epoch 1, Batch loss: 19.879329681396484
Epoch 1, Batch loss: 15.554410934448242
Epoch 1, Batch loss: 13.111258506774902
Epoch 1, Batch loss: 9.686079978942871
Epoch 1, Batch loss: 10.359190940856934
Epoch 1, Batch loss: 9.0573148727417
Epoch 1, Batch loss: 5.73008918762207
Epoch 1, Batch loss: 5.000347137451172
Epoch 1, Batch loss: 4.214096546173096
Epoch 1, Batch loss: 3.1553401947021484
Epoch 1, Batch loss: 2.944056272506714
Epoch 1, Batch loss: 2.4298644065856934
Epoch 1, Batch loss: 2.574411630630493
Epoch 1, Batch loss: 1.813261866569519
Epoch 1, Batch loss: 1.8675488233566284
Epoch 1, Batch loss: 1.7692523002624512
Epoch 1, Batch loss: 1.605228304862976
Epoch 1, Batch loss: 1.7078768014907837
Epoch 1, Batch loss: 1.843479871749878
Epoch 1, Batch loss: 1.6410244703292847
Epoch 1, Batch loss: 1.4822709560394287
Epoch 1, Batch loss: 1.6922506093978882
Epoch 1, Batch loss: 1.5480772256851196
Epoch 1, Batch loss: 1.287927508354187
Epoch 1, Batch loss: 1.2570878267288208
Epoch 1, Batch

('fine_tuned_t5/tokenizer_config.json',
 'fine_tuned_t5/special_tokens_map.json',
 'fine_tuned_t5/spiece.model',
 'fine_tuned_t5/added_tokens.json')

In [3]:
# Load the fine-tuned model and tokenizer
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("fine_tuned_t5")
fine_tuned_tokenizer = T5Tokenizer.from_pretrained("fine_tuned_t5")

# Test the model
test_sentence = "This function sends an HTTP DELETE request to the specified URI, retrieves the response, and returns it, ensuring proper resource cleanup after execution."
inputs = fine_tuned_tokenizer.encode_plus(
    f"convert gptComment to humanLikeComment: {test_sentence}",
    return_tensors="pt"
)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Ensure tensors are on the same device as the model (cuda or cpu)
fine_tuned_model.to(device)

output_ids = fine_tuned_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=50,  # Adjust the max_length as needed
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=True
)

decoded_output = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Input sentence:", test_sentence)
print("Generated humanLike comment:", decoded_output)


Input sentence: This function sends an HTTP DELETE request to the specified URI, retrieves the response, and returns it, ensuring proper resource cleanup after execution.
Generated humanLike comment: In this case, the HTTP DELETE request is sent to the specified URI.
