In [None]:
# Colab-only: mount Google Drive so the '/content/drive/...' paths used by
# later cells (CSV inputs, training output directory) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets transformers
!pip install contractions

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd
import re
import contractions
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, TrainingArguments, Trainer
import torch

In [None]:
# === Data Cleaning Functions ===
def clean_text(text):
    """Normalize text to lowercase alphanumerics separated by single spaces.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted (not replaced by a space), runs of whitespace collapse to one
    space, and leading/trailing whitespace is trimmed.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip().lower()

def remove_urls(text):
    """Delete URL-like tokens and @mentions from the text.

    Any run of non-whitespace characters starting at 'http', 'www' or '@' is
    removed; the surrounding whitespace is left as is.
    """
    url_or_mention = r'http\S+|www\S+|@\S+'
    return re.sub(url_or_mention, '', text)

def expand_contractions(text):
    """Return the text with contractions expanded by `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

In [None]:
import pandas as pd

train_path = '/content/drive/MyDrive/train.csv'
test_path = '/content/drive/MyDrive/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Shown before cleaning, so this is the raw text.
print(train_df.head())

# Text columns that get the cleaning pipeline in both splits.
TEXT_COLUMNS = ('Body_question', 'Body_answer')


def _clean_body(text):
    """Expand contractions, strip URLs/@mentions, then normalize one cell.

    Non-string cells (e.g. the NaN pandas produces for an empty CSV field)
    are returned unchanged instead of raising inside the cleaning helpers.
    """
    if not isinstance(text, str):
        return text
    return clean_text(remove_urls(expand_contractions(text)))


# One pipeline for every text column of both splits (replaces four
# copy-pasted lambdas).
# NOTE(review): the later `load_dataset('csv', ...)` cell reads the raw CSV
# files again, so these cleaned frames do not appear to feed training —
# confirm whether that is intended.
for frame in (train_df, test_df):
    for column in TEXT_COLUMNS:
        frame[column] = frame[column].apply(_clean_body)

   Unnamed: 0                                      Body_question  \
0        6604  I'm new to machine learning and I try to creat...   
1         106  I'm using Neural Networks to solve different M...   
2        2993  I have a training set composed of images havin...   
3       10766  I have encountered a strange situation where t...   
4        4315  I have trained a CNN model and I have applied ...   

                                         Body_answer  
0  You have two separate problems going on.\n\nUs...  
1  UPDATE: the landscape has changed quite a bit ...  
2  I have a suggestion for you. Maybe not complet...  
3  If I am getting you right, you are trying to p...  
4  Do you only have one single model? If you were...  


In [None]:
# Build the train/test DatasetDict straight from the CSV files on Drive.
# NOTE(review): this reads the files from disk, not the `train_df` / `test_df`
# frames cleaned in the previous cell, so the cleaning does not appear to reach
# the data used for training — confirm whether that is intended.
csv_splits = {
    'train': '/content/drive/MyDrive/train.csv',
    'test': '/content/drive/MyDrive/test.csv',
}
dataset = load_dataset('csv', data_files=csv_splits)
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Body_question', 'Body_answer'],
        num_rows: 9751
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Body_question', 'Body_answer'],
        num_rows: 2438
    })
})


In [None]:
# load tokenizer
# Same checkpoint name as the model loaded further down with
# T5ForConditionalGeneration.from_pretrained; keep the two in sync.
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

In [None]:
# preprocess function
def preprocess_batch(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: Batch mapping with 'Body_question' and 'Body_answer'
            sequences of strings (as passed by `dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the "question: "-prefixed questions, plus a
        'labels' key holding the answer token ids. Padding positions in the
        labels are set to -100 so the loss skips them.
    """
    inputs = ["question: " + question for question in examples['Body_question']]
    targets = examples['Body_answer']
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)
    labels = tokenizer(targets, truncation=True, padding="max_length", max_length=512)
    # The answers are padded to 512 with real pad token ids. Left as is, the
    # loss (and label smoothing) would also be computed on that padding, so
    # mask every padded position with the ignore index -100. The attention mask
    # is used instead of comparing ids so a genuine token that happens to share
    # the pad id is never masked.
    model_inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(labels['input_ids'], labels['attention_mask'])
    ]
    return model_inputs

# Tokenize both splits in batches and drop the original CSV columns so only
# the fields returned by preprocess_batch remain.
raw_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(
    preprocess_batch,
    batched=True,
    remove_columns=raw_columns,
)

Map:   0%|          | 0/9751 [00:00<?, ? examples/s]

Map:   0%|          | 0/2438 [00:00<?, ? examples/s]

In [None]:
# Pick the GPU when one is visible, then load the pretrained checkpoint onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base').to(device)

# Seq2seq collator configured to pad every batch to a fixed length of 512.
collator_options = {"padding": "max_length", "max_length": 512}
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, **collator_options)

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
from transformers import EarlyStoppingCallback

# NOTE(review): this path uses 'My Drive' while the CSV paths above use
# 'MyDrive' — confirm both resolve to the same Drive folder.
output_dir = '/content/drive/My Drive/DSAN6600_final'

# Training configuration; keyword arguments are grouped by concern.
training_args = TrainingArguments(
    # --- output locations / logging ---
    output_dir=output_dir,
    logging_dir=f'{output_dir}/logs',
    logging_steps=500,
    # --- evaluation & checkpointing (once per epoch) ---
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # NOTE(review): per the TrainingArguments docs eval_steps applies to the
    # step-based strategy, so it likely has no effect here — confirm.
    eval_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- optimization ---
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    label_smoothing_factor=0.1,
    # --- memory / speed ---
    fp16=True,
    gradient_checkpointing=True,
)

# The 'test' split doubles as the evaluation set for early stopping.
train_split = tokenized_datasets['train'].with_format('torch')
eval_split = tokenized_datasets['test'].with_format('torch')
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping],
)

trainer.train()

# Final evaluation on the evaluation split.
results = trainer.evaluate()
print("Evaluation Results:", results)

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,2.8401,2.777918
2,2.782,2.732677
3,2.7097,2.708926
4,2.661,2.692534
5,2.6411,2.682976
6,2.6027,2.677666
7,2.5974,2.674316
8,2.5634,2.67257
9,2.5622,2.671779
10,2.5391,2.671485


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluation Results: {'eval_loss': 2.6714847087860107, 'eval_runtime': 33.1917, 'eval_samples_per_second': 73.452, 'eval_steps_per_second': 18.378, 'epoch': 10.0}


In [None]:
# generation
def generate_output(prompt, model, tokenizer):
    """Sample one answer from the model for a single prompt.

    Args:
        prompt: Question text; it is prefixed with "question: " in the same
            way as the training inputs.
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.

    Returns:
        The first generated sequence decoded to text with special tokens
        removed.
    """
    # Encode onto the device of the model that was passed in (rather than a
    # notebook-level global) and truncate to the 512-token length used for the
    # training inputs.
    encoded = tokenizer(
        "question: " + prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)

    # Top-k + nucleus sampling with repetition controls. The attention mask is
    # passed explicitly so generation does not have to infer it.
    outputs = model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=256,
        do_sample=True,
        top_k=30,
        top_p=0.9,
        temperature=0.7,
        repetition_penalty=1.5,
        no_repeat_ngram_size=3,
    )

    # Decode only the first returned sequence.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try the fine-tuned model on one example prompt and show the result.
prompt = "Write a Python function to add two numbers."
generated_code = generate_output(prompt, model, tokenizer)
print("Generated Code:", generated_code, sep="\n")

Generated Code:
In Python, you can do the following:
from scipy import optimize as np
import numpy as np # This will return a single integer with 0 and 1. If not there is any mathematical relationship between two numbers then it's possible that both are positive or negative (or something like this) to be represented by an int32 value. 

This would give us how much of your function works?  It may make sense to use python functions for adding values in some way but insteadof using strings directly from NumPy arrays which have more than 2 integers per iteration - e., g.: if len(a &gt;2): print('Adding': ',add_1), add_2([a, b]) else : pass

