In [2]:
pip install rouge-score

Defaulting to user installation because normal site-packages is not writeable
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=189c965516eebf3968644e8f09694a4ae699ed5c64617ded324c0f9e2225be2e
  Stored in directory: /student/mjr175/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer

# Load the dataset
dataset_path = "/student/mjr175/commentGeneration/Train_4_Lang/Java1000Train.csv"
df = pd.read_csv(dataset_path)

# Split dataset into train and validation sets
train_size = int(len(df) * 0.8)  # 80% of the data for training
val_size = len(df) - train_size  # Remaining 20% for validation
train_dataset, val_dataset = random_split(df, [train_size, val_size])
train_df = pd.DataFrame(train_dataset.dataset)
val_df = pd.DataFrame(val_dataset.dataset)

# Define T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

2024-03-08 11:59:00.113512: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-08 11:59:00.160413: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 11:59:00.160466: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 11:59:00.162293: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-08 11:59:00.172515: I tensorflow/core/platform/cpu_feature_guar

In [2]:
len(val_df)

1000

In [39]:


# Define dataset class
class GPT2HumanDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        gpt_comment = self.data.iloc[idx]["gptComment"]
        human_comment = self.data.iloc[idx]["groundTruth"]

        input_text = f"generate human like comment from gptComment: {gpt_comment}"
        target_text = human_comment

        input_ids = self.tokenizer.encode(input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        labels = self.tokenizer.encode(target_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")

        return {
            "input_ids": input_ids.squeeze(),
            "attention_mask": input_ids != tokenizer.pad_token_id,
            "labels": labels.squeeze()
        }

def evaluate_rouge(model, val_loader):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    num_samples = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Generate predictions
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4, early_stopping=True)
            predicted_sentences = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

            # Compute ROUGE scores
            for ref, hyp in zip(labels, predicted_sentences):
                scores = scorer.score(hyp, tokenizer.decode(ref.cpu(), skip_special_tokens=True, clean_up_tokenization_spaces=True))
                for rouge_type in rouge_scores.keys():
                    rouge_scores[rouge_type] += scores[rouge_type].fmeasure
                num_samples += 1

    # Calculate average ROUGE scores
    for rouge_type in rouge_scores.keys():
        rouge_scores[rouge_type] /= num_samples

    return rouge_scores

# Hyperparameters
batch_size = 8
max_length = 128
learning_rate = 1e-4
num_epochs = 15

# DataLoader
train_loader = DataLoader(GPT2HumanDataset(train_df, tokenizer, max_length), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(GPT2HumanDataset(val_df, tokenizer, max_length), batch_size=batch_size, shuffle=False)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"epoch {epoch+1}, Batch loss: {loss.item()}")
    # Evaluate on validation set
rouge_scores = evaluate_rouge(model, val_loader)
print(f"Epoch {epoch+1}/{num_epochs}, Rouge Score: {rouge_scores}, Batch loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("codeNarrator_T5")
tokenizer.save_pretrained("codeNarrator_T5")


epoch 1, Batch loss: 0.46594423055648804
epoch 1, Batch loss: 0.44376102089881897
epoch 1, Batch loss: 0.47994065284729004
epoch 1, Batch loss: 0.6689395904541016
epoch 1, Batch loss: 0.5314613580703735
epoch 1, Batch loss: 0.36245205998420715
epoch 1, Batch loss: 0.4528369605541229
epoch 1, Batch loss: 0.5547585487365723
epoch 1, Batch loss: 0.4454065263271332
epoch 1, Batch loss: 0.3706100881099701
epoch 1, Batch loss: 0.4585029184818268
epoch 1, Batch loss: 0.3009800910949707
epoch 1, Batch loss: 0.5574549436569214
epoch 1, Batch loss: 0.8279182314872742
epoch 1, Batch loss: 0.5838012099266052
epoch 1, Batch loss: 0.3659141957759857
epoch 1, Batch loss: 0.23990868031978607
epoch 1, Batch loss: 0.40476977825164795
epoch 1, Batch loss: 0.803613007068634
epoch 1, Batch loss: 0.47445544600486755
epoch 1, Batch loss: 0.3535246253013611
epoch 1, Batch loss: 0.37251442670822144
epoch 1, Batch loss: 0.3674231171607971
epoch 1, Batch loss: 0.7227097153663635
epoch 1, Batch loss: 0.4441815018

('codeNarrator_T5/tokenizer_config.json',
 'codeNarrator_T5/special_tokens_map.json',
 'codeNarrator_T5/spiece.model',
 'codeNarrator_T5/added_tokens.json')

In [31]:

test_csv_file = "/student/mjr175/commentGeneration/Test_From_Asap/outputJava250 (copy).csv"
output_csv_file = "/student/mjr175/commentGeneration/Test_From_Asap/outputJava250_predicted.csv"

# Load the fine-tuned model and tokenizer
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("fine_tuned_t5")
fine_tuned_tokenizer = T5Tokenizer.from_pretrained("codeNarrator_T5")

def generateOutput(test_sentence):
    inputs = fine_tuned_tokenizer.encode_plus(
        test_sentence,
        return_tensors="pt"
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    fine_tuned_model.to(device)
    output_ids = fine_tuned_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,  # Adjust the max_length as needed
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    return fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)




# Load test data from CSV file
test_data = pd.read_csv(test_csv_file)

# Generate outputs and save in another CSV file
predicted_comments = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
for index, row in test_data.iterrows():
    input_comment = row["gptComment"]
    output_comment = generateOutput(input_comment)
    predicted_comments.append(output_comment)
    reference_comment = row["groundTruth"]
    scores = scorer.score(reference_comment, output_comment)
    for metric, score in scores.items():
        rouge_scores[metric].append(score.fmeasure)
# Add predicted comments to the test data and save to output CSV file
average_rouge_scores = {metric: sum(scores) / len(scores) for metric, scores in rouge_scores.items()}

test_data["Predicted_Comment"] = predicted_comments
print(average_rouge_scores) 
test_data.to_csv(output_csv_file, index=False)


{'rouge1': 0.0909090909090909, 'rouge2': 0.0, 'rougeL': 0.06060606060606061}
