Load the dataset and inspect properties

In [1]:
!pip install datasets torch transformers accelerate evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting accelerate
  Downloading accelerate-1.2.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.1.0.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets

In [2]:
! export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512'

### Dataset metadata

In [3]:
from datasets import load_dataset_builder
ds_builder = load_dataset_builder("google/code_x_glue_ct_code_to_text", "python")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
ds_builder.info.features

{'id': Value(dtype='int32', id=None),
 'repo': Value(dtype='string', id=None),
 'path': Value(dtype='string', id=None),
 'func_name': Value(dtype='string', id=None),
 'original_string': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'code': Value(dtype='string', id=None),
 'code_tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'docstring': Value(dtype='string', id=None),
 'docstring_tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'sha': Value(dtype='string', id=None),
 'url': Value(dtype='string', id=None)}

## Load the dataset

Skip examples to make training time feasable on CSU dept machines.

In [5]:
from datasets import load_dataset

dataset = load_dataset("google/code_x_glue_ct_code_to_text", "python")
dataset['train'] = dataset['train'].skip(180000)
dataset['test'] = dataset['test'].skip(7800)

Generating train split: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 251820/251820 [00:09<00:00, 27704.92 examples/s]
Generating validation split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13914/13914 [00:00<00:00, 24658.34 examples/s]
Generating test split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [6]:
dataset.shape

{'train': (71820, 12), 'validation': (13914, 12), 'test': (7118, 12)}

In [7]:
dataset.column_names

{'train': ['id',
  'repo',
  'path',
  'func_name',
  'original_string',
  'language',
  'code',
  'code_tokens',
  'docstring',
  'docstring_tokens',
  'sha',
  'url'],
 'validation': ['id',
  'repo',
  'path',
  'func_name',
  'original_string',
  'language',
  'code',
  'code_tokens',
  'docstring',
  'docstring_tokens',
  'sha',
  'url'],
 'test': ['id',
  'repo',
  'path',
  'func_name',
  'original_string',
  'language',
  'code',
  'code_tokens',
  'docstring',
  'docstring_tokens',
  'sha',
  'url']}

In [8]:
dataset["train"][0]

{'id': 180000,
 'repo': 'Tanganelli/CoAPthon3',
 'path': 'coapthon/messages/option.py',
 'func_name': 'Option.length',
 'original_string': 'def length(self):\n        """\n        Return the value length\n\n        :rtype : int\n        """\n        if isinstance(self._value, int):\n            return byte_len(self._value)\n        if self._value is None:\n            return 0\n        return len(self._value)',
 'language': 'python',
 'code': 'def length(self):\n        """\n        Return the value length\n\n        :rtype : int\n        """\n        if isinstance(self._value, int):\n            return byte_len(self._value)\n        if self._value is None:\n            return 0\n        return len(self._value)',
 'code_tokens': ['def',
  'length',
  '(',
  'self',
  ')',
  ':',
  'if',
  'isinstance',
  '(',
  'self',
  '.',
  '_value',
  ',',
  'int',
  ')',
  ':',
  'return',
  'byte_len',
  '(',
  'self',
  '.',
  '_value',
  ')',
  'if',
  'self',
  '.',
  '_value',
  'is',
  'Non

## Create Tokenizer

Roberta Tokenizer creates byte-level Byte-Pair-Encoding, suitable for our base model

In [9]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
2024-12-06 04:38:04.381554: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


## Preprocessing data

Prefix given to match one of the applications of the pretrained model and improve performance. 
code_tokens used instead of 'code' because the code_tokens do not contain the docstring that the model is trying to generate. The code tokens are instead combined into a string in this step suitable for the model's tokenization process. Use -100 for label on padding tokens so that they do not cause reduced model performance.

In [10]:
prefix = "Summarize Python: "
def preprocess(samples):
    codestrings = samples['code_tokens']
    
    
    docstrings = samples['docstring']

    inputs = []
    for codestring in codestrings:
        codestring = ' '.join(codestring)
        inputs.append(prefix + codestring)
        
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True)
    labels = tokenizer(docstrings, padding="max_length", truncation=True).input_ids

    labels_with_ignore_index = []
    for labels_example in labels:
        labels_example = [label if label != 0 else -100 for label in labels_example]
        labels_with_ignore_index.append(labels_example)

    model_inputs["labels"] = labels_with_ignore_index

    return model_inputs

In [11]:
dataset = dataset.map(preprocess, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71820/71820 [00:47<00:00, 1512.51 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13914/13914 [00:10<00:00, 1322.24 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

## Dataset loaders
batch size is sized to not run out of memory on CSU dept machines

In [12]:
from torch.utils.data import DataLoader

dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=16)
valid_dataloader = DataLoader(dataset['validation'], batch_size=8)
test_dataloader = DataLoader(dataset['test'], batch_size=8)

## Training columns
attention_mask is included so that extra padding input_id's are not mistaken as actual input by model

In [13]:
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


## Example of decoding producing original docstring

In [14]:
labels = batch['labels'][0]
tokenizer.decode([label for label in labels if label != -100])

'<s>Remove a relationship from one user to another, with the same caveats\n        and behavior as adding a relationship.</s>'

## Base pre-trained model

In [15]:
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small")

In [16]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

## Hypr-parameters
increasing these, particularly num_epochs results in extremely long training time on dept machines.

In [17]:
from transformers import get_scheduler
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

## Backend processing device
Ensure CUDA is used.

In [18]:
import torch
from accelerate.test_utils.testing import get_backend

device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
model.to(device)


T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

## Training loop

In [19]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|                                                                                                                                                                                                                                                                                                                                                                                                          | 0/4489 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

## Save the trained model for future use and testing

In [25]:
save_directory = "."
model.save_pretrained(save_directory)

## Example of loading in dataset and using model for prediction

In [26]:
dataset = load_dataset("google/code_x_glue_ct_code_to_text", "python")
print(dataset['validation'])

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
    num_rows: 13914
})


In [31]:
test_example = dataset['validation'][32]
print("Code:", test_example['code_tokens'])

Code: ['def', 'wrap_deepmind', '(', 'env', ',', 'episode_life', '=', 'True', ',', 'clip_rewards', '=', 'True', ',', 'frame_stack', '=', 'False', ',', 'scale', '=', 'False', ')', ':', 'if', 'episode_life', ':', 'env', '=', 'EpisodicLifeEnv', '(', 'env', ')', 'if', "'FIRE'", 'in', 'env', '.', 'unwrapped', '.', 'get_action_meanings', '(', ')', ':', 'env', '=', 'FireResetEnv', '(', 'env', ')', 'env', '=', 'WarpFrame', '(', 'env', ')', 'if', 'scale', ':', 'env', '=', 'ScaledFloatFrame', '(', 'env', ')', 'if', 'clip_rewards', ':', 'env', '=', 'ClipRewardEnv', '(', 'env', ')', 'if', 'frame_stack', ':', 'env', '=', 'FrameStack', '(', 'env', ',', '4', ')', 'return', 'env']


In [32]:
model = T5ForConditionalGeneration.from_pretrained(save_directory)

### Ensure that the sample input is the joined tokens

In [34]:
test_ex = ' '.join(test_example['code_tokens'])
input_ids = tokenizer(test_ex, return_tensors='pt').input_ids

outputs = model.generate(input_ids)
print("Generated docstring:", tokenizer.decode(outputs[0], skip_special_tokens=True))

Generated docstring: Wrap an environment into a deepmind environment.

    :param env: The environment


In [35]:
print("True docstring: ", test_example['docstring'])

True docstring:  Configure environment for DeepMind-style Atari.


Simple Evaluation

In [None]:
def simple_evaluation(model, tokenizer, validation_dataset, num_samples=100):
    """
    Accuracy is measured using token overlap it will create a token for each word for the reference the test and then tokenize
    the generated and then comparing each other to get the score. 

    as well it will generate a extact one.


    Evaluate the model's performance using token overlap metrics.
    Args:
    - model: Trained T5 model.
    - tokenizer: Tokenizer for the model.
    - validation_dataset: Validation dataset containing 'code_tokens' and 'docstring'.
    - num_samples: Number of samples to evaluate (default is 100).

    Returns:
    - accuracy: Proportion of exact matches between predictions and references.
    - avg_overlap: Average token overlap percentage.
    """

    total_overlap = 0
    exact_matches = 0
    total_samples = min(num_samples, len(validation_dataset))

    for i in range(total_samples):
        example = validation_dataset[i]

        # Prepare input
        input_text = " ".join(example["code_tokens"])
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids

        # Generate prediction
        outputs = model.generate(input_ids)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Get reference
        reference = example["docstring"]

        # Token overlap
        pred_tokens = set(generated_text.split())
        ref_tokens = set(reference.split())
        overlap = len(pred_tokens & ref_tokens) / len(ref_tokens) if ref_tokens else 0
        total_overlap += overlap

        # Exact match
        if generated_text.strip() == reference.strip():
            exact_matches += 1

    accuracy = exact_matches / total_samples
    avg_overlap = total_overlap / total_samples

    return accuracy, avg_overlap



validation_dataset = dataset["validation"][:100]  # Load a subset for faster evaluation
accuracy, avg_overlap = simple_evaluation(model, tokenizer, validation_dataset)
print(f"Accuracy (Exact Matches): {accuracy * 100:.2f}%")
print(f"Average Token Overlap: {avg_overlap * 100:.2f}%")

Adding Rouge Evaluation

In [None]:
from datasets import load_metric
from transformers import T5Tokenizer

# Load ROUGE metric
rouge_metric = load_metric("rouge")

def evaluate_rouge(model, tokenizer, validation_dataset, num_samples=100):
    """

    Accuracy: Measures overlap of n-grams and longest common subsequences (LCS) between generated and reference texts.
    Variants like ROUGE-1, ROUGE-2, and ROUGE-L capture different levels of granularity:
    ROUGE-1: Unigram overlap.
    ROUGE-2: Bigram overlap.
    ROUGE-L: Longest common subsequence, which captures sentence structure.
    ROUGE-L will reward sequences even if the prediction misses some intermediate words.

    Evaluate the model using ROUGE scores.

    Args:
    - model: Trained T5 model.
    - tokenizer: Tokenizer for the model.
    - validation_dataset: Dataset containing 'code_tokens' and 'docstring'.
    - num_samples: Number of samples to evaluate.

    Returns:
    - rouge_scores: Dictionary with average ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L).
    """
    predictions = []
    references = []

    # Evaluate on a subset of the validation data
    total_samples = min(num_samples, len(validation_dataset))
    for i in range(total_samples):
        example = validation_dataset[i]

        # Prepare input text
        input_text = " ".join(example["code_tokens"])
        input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).input_ids

        # Generate prediction
        outputs = model.generate(input_ids)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Append results
        predictions.append(generated_text)
        references.append(example["docstring"])

    # Compute ROUGE
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)

    # Extract relevant ROUGE scores
    rouge_scores = {
        "ROUGE-1": rouge_results["rouge1"].mid.fmeasure,
        "ROUGE-2": rouge_results["rouge2"].mid.fmeasure,
        "ROUGE-L": rouge_results["rougeL"].mid.fmeasure
    }

    return rouge_scores

validation_dataset = dataset["validation"][:100] 
rouge_scores = evaluate_rouge(model, tokenizer, validation_dataset)
print("ROUGE Evaluation Scores:")
print(f"ROUGE-1: {rouge_scores['ROUGE-1']:.4f}")
print(f"ROUGE-2: {rouge_scores['ROUGE-2']:.4f}")
print(f"ROUGE-L: {rouge_scores['ROUGE-L']:.4f}")


adding Bleu Evalutation Method


In [None]:
from datasets import load_metric
from transformers import T5Tokenizer

# Load BLEU metric
bleu_metric = load_metric("bleu")

def evaluate_bleu(model, tokenizer, validation_dataset, num_samples=100):
    """
    Evaluate the model using BLEU scores. 

    Args:
    - model: Trained T5 model.
    - tokenizer: Tokenizer for the model.
    - validation_dataset: Dataset containing 'code_tokens' and 'docstring'.
    - num_samples: Number of samples to evaluate.

    Returns:
    - bleu_score: Average BLEU score for the evaluated samples.
    """
    predictions = []
    references = []

    total_samples = min(num_samples, len(validation_dataset))
    for i in range(total_samples):
        example = validation_dataset[i]

        # Prepare input text
        input_text = " ".join(example["code_tokens"])
        input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).input_ids

        # Generate prediction
        outputs = model.generate(input_ids)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Append results
        predictions.append(generated_text.split())  # Tokenize the generated text
        references.append([example["docstring"].split()])  # BLEU expects list of references

    # Compute BLEU
    bleu_results = bleu_metric.compute(predictions=predictions, references=references)

    return bleu_results["bleu"]

validation_dataset = dataset["validation"][:100]  # Use a subset for faster evaluation
bleu_score = evaluate_bleu(model, tokenizer, validation_dataset)
print(f"BLEU Score: {bleu_score:.4f}")
