In [1]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any.

In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the evaluation cells load the
# fine-tuned model from a path underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [5]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [9]:
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer
from rouge import Rouge
import random

# Evaluation settings for this cell.
NUM_EVAL_SAMPLES = 100      # number of shuffled test examples that are scored
MAX_INPUT_TOKENS = 512      # inputs longer than this are truncated by the tokenizer
MAX_SUMMARY_TOKENS = 128    # upper bound on the length of a generated summary
NUM_BEAMS = 4               # beam width used for generation

# Load test dataset
test_dataset = load_dataset("code_x_glue_ct_code_to_text", "python", split="test")

# Shuffle with a fixed seed so the same subset is drawn on every run, then keep
# the first NUM_EVAL_SAMPLES examples (never more than the dataset contains).
test_dataset = test_dataset.shuffle(seed=42)
test_dataset = test_dataset.select(range(min(NUM_EVAL_SAMPLES, len(test_dataset))))

# Load fine-tuned model and tokenizer
# NOTE(review): the tokenizer comes from the base checkpoint, not from the
# fine-tuned directory — confirm the vocabulary was not changed during fine-tuning.
model = BartForConditionalGeneration.from_pretrained("/content/drive/MyDrive/finetuned_model")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Initialize Rouge
rouge = Rouge()

# Evaluate on the test dataset
references = []
predictions = []
skipped = 0  # pairs that cannot be scored (see the check inside the loop)

# NOTE(review): the recorded run of this cell reports a ROUGE precision of about
# 0.98, which is unusually high for summarization. That would be consistent with
# the reference docstring still being present inside the `code` field that is fed
# to the model — verify before relying on these numbers.
for example in test_dataset:
    code = example["code"]
    docstring = example["docstring"]

    # Tokenize input for this particular example
    inputs = tokenizer(
        f"code: {code}",
        return_tensors="pt",
        max_length=MAX_INPUT_TOKENS,
        truncation=True,
    )

    # Generate summary. The attention mask is passed explicitly so generation
    # does not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs["input_ids"].to(model.device),
        attention_mask=inputs["attention_mask"].to(model.device),
        num_beams=NUM_BEAMS,
        max_length=MAX_SUMMARY_TOKENS,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # The `rouge` package raises ValueError for an empty hypothesis or reference,
    # and it splits text on ".", so a string made only of periods counts as empty.
    # Skip such pairs (and count them) instead of losing the whole run at the end.
    if not summary.strip(".") or not docstring.strip("."):
        skipped += 1
        continue

    # Append reference and prediction for ROUGE calculation
    references.append(docstring)
    predictions.append(summary)

if skipped:
    print(f"Skipped {skipped} example(s) with an empty prediction or reference.")
if not predictions:
    raise ValueError("No scorable prediction/reference pairs were produced.")

# Calculate ROUGE scores (averaged over all scored pairs)
rouge_scores = rouge.get_scores(predictions, references, avg=True)

# Format and print ROUGE scores
formatted_scores = "\n".join([f"{metric}: {score}" for metric, score in rouge_scores.items()])
print("ROUGE scores:")
print(formatted_scores)


ROUGE scores:
rouge-1: {'r': 0.658703392198503, 'p': 0.9824118011618013, 'f': 0.72996800280963}
rouge-2: {'r': 0.6242197470439058, 'p': 0.9766361273135468, 'f': 0.6907215548955585}
rouge-l: {'r': 0.658703392198503, 'p': 0.9824118011618013, 'f': 0.72996800280963}


In [12]:
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer
from rouge import Rouge
import random

# Evaluation settings for this cell.
NUM_EVAL_SAMPLES = 100      # number of shuffled test examples that are scored
MAX_INPUT_TOKENS = 512      # inputs longer than this are truncated by the tokenizer
MAX_SUMMARY_TOKENS = 128    # upper bound on the length of a generated summary
NUM_BEAMS = 4               # beam width used for generation

# Load test dataset
test_dataset = load_dataset("code_search_net", "python", split="test")

# Shuffle with a fixed seed so the same subset is drawn on every run, then keep
# the first NUM_EVAL_SAMPLES examples (never more than the dataset contains).
test_dataset = test_dataset.shuffle(seed=42)
test_dataset = test_dataset.select(range(min(NUM_EVAL_SAMPLES, len(test_dataset))))

# Load fine-tuned model and tokenizer
# NOTE(review): the tokenizer comes from the base checkpoint, not from the
# fine-tuned directory — confirm the vocabulary was not changed during fine-tuning.
model = BartForConditionalGeneration.from_pretrained("/content/drive/MyDrive/finetuned_model")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Initialize Rouge
rouge = Rouge()

# Evaluate on the test dataset
references = []
predictions = []
skipped = 0  # pairs that cannot be scored (see the check inside the loop)

# NOTE(review): the recorded run of this cell reports a ROUGE precision of about
# 0.98, which is unusually high for summarization. That would be consistent with
# the reference docstring still being present inside `func_code_string`, which is
# fed to the model — verify before relying on these numbers.
for example in test_dataset:
    code = example["func_code_string"]
    docstring = example["func_documentation_string"]

    # Tokenize input for this particular example
    inputs = tokenizer(
        f"code: {code}",
        return_tensors="pt",
        max_length=MAX_INPUT_TOKENS,
        truncation=True,
    )

    # Generate summary. The attention mask is passed explicitly so generation
    # does not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs["input_ids"].to(model.device),
        attention_mask=inputs["attention_mask"].to(model.device),
        num_beams=NUM_BEAMS,
        max_length=MAX_SUMMARY_TOKENS,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # The `rouge` package raises ValueError for an empty hypothesis or reference,
    # and it splits text on ".", so a string made only of periods counts as empty.
    # Skip such pairs (and count them) instead of losing the whole run at the end.
    if not summary.strip(".") or not docstring.strip("."):
        skipped += 1
        continue

    # Append reference and prediction for ROUGE calculation
    references.append(docstring)
    predictions.append(summary)

if skipped:
    print(f"Skipped {skipped} example(s) with an empty prediction or reference.")
if not predictions:
    raise ValueError("No scorable prediction/reference pairs were produced.")

# Calculate ROUGE scores (averaged over all scored pairs)
rouge_scores = rouge.get_scores(predictions, references, avg=True)

# Format and print ROUGE scores
formatted_scores = "\n".join([f"{metric}: {score}" for metric, score in rouge_scores.items()])
print("ROUGE scores:")
print(formatted_scores)


ROUGE scores:
rouge-1: {'r': 0.6238417906335681, 'p': 0.9763567364534065, 'f': 0.697041148411199}
rouge-2: {'r': 0.5893947903515329, 'p': 0.968195302289356, 'f': 0.6583109449495957}
rouge-l: {'r': 0.6238417906335681, 'p': 0.9763567364534065, 'f': 0.697041148411199}
