In [None]:
pip install datasets transformers accelerate pandas gradio

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting gradio
  Downloading gradio-5.9.0-py3-none-any.whl.metadata (16 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecti

In [None]:
from datasets import load_dataset
import pandas as pd
import re

# Fetch the "main" configuration of GSM8K, then materialise each split as a
# pandas DataFrame so the text columns can be edited with ordinary Series ops.
dataset = load_dataset("gsm8k", "main")
train_data, test_data = (
    dataset[split_name].to_pandas() for split_name in ("train", "test")
)

# Define a function to clean text
def clean_text(text):
    """Strip unexpected characters from a string and normalise its whitespace.

    Letters, digits, whitespace, basic punctuation (``. , ! ? ; : ( )``) and
    the symbols needed to write arithmetic (``+ - * / = < > $ % # '``) are
    kept; every other character is dropped. Runs of whitespace (including
    newlines) are then collapsed to a single space and the result is trimmed.

    Args:
        text (str): Raw question or answer text.

    Returns:
        str: The cleaned text.
    """
    # The math symbols must survive cleaning: dropping them turns "48/2" into
    # "482" and erases GSM8K's "<<48/2=24>>" calculator annotations and the
    # "#### <answer>" final-answer marker (format per the GSM8K dataset card).
    text = re.sub(r"[^a-zA-Z0-9\s.,!?;:()+\-*/=<>$%#']+", "", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run clean_text over both text columns of each split (train first, then
# test; "question" before "answer"), overwriting the columns in place.
for _frame in (train_data, test_data):
    for _column in ("question", "answer"):
        _frame[_column] = _frame[_column].apply(clean_text)

# Optional: Handle class imbalance if specific labels dominate
# For GSM8K, no strict labels, but you can oversample specific cases if needed.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the pretrained tokenizer for the "t5-base" checkpoint. The same
# `tokenizer` object is used below for training-data encoding and for
# inference in the Gradio callback.
tokenizer = T5Tokenizer.from_pretrained("t5-base")


def preprocess_function(examples):
    """Tokenize a batch of GSM8K rows for sequence-to-sequence training.

    Args:
        examples: Batch mapping with "question" and "answer" entries, each a
            list of strings (the form passed by ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output for the prompts, with an added "labels" entry
        holding the tokenized answers. Padding positions in the labels are
        set to -100 so the loss ignores them.
    """
    # Concatenate "Solve: " to each question in the batch
    inputs = ["Solve: " + question for question in examples["question"]]
    # Tokenize inputs and targets, both padded/truncated to 512 tokens
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    label_ids = tokenizer(examples["answer"], max_length=512, truncation=True, padding="max_length")["input_ids"]
    # Answers are far shorter than 512 tokens, so most label positions are
    # padding. Leaving the pad id in place makes the model train on (and the
    # reported loss be dominated by) predicting padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in label_ids
    ]
    return model_inputs


# Tokenize both raw splits. These statements must sit at cell level: indented
# under the function they would follow its `return` and never execute.
# NOTE(review): a later cell rebinds train_dataset/test_dataset to MathDataset
# instances, and those are the objects handed to the Trainer.
train_dataset = dataset["train"].map(preprocess_function, batched=True)
test_dataset = dataset["test"].map(preprocess_function, batched=True)



The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
import torch
from torch.utils.data import Dataset

class MathDataset(Dataset):
    """Torch dataset that tokenizes question/answer rows on access.

    Args:
        data: pandas DataFrame with "question" and "answer" string columns.
        tokenizer: Callable tokenizer that accepts ``max_length``,
            ``truncation``, ``padding`` and ``return_tensors`` and exposes
            ``pad_token_id``.
        max_length (int): Length every encoded sequence is padded or
            truncated to. Defaults to 512.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the row at position ``idx``.

        Returns:
            dict: 1-D tensors of length ``max_length`` under "input_ids",
            "attention_mask" and "labels". Padding positions in "labels"
            are -100 so the loss skips them.
        """
        row = self.data.iloc[idx]
        question = "Solve: " + row["question"]
        answer = row["answer"]

        # Tokenize question and answer
        inputs = self.tokenizer(question, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        labels = self.tokenizer(answer, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")["input_ids"].squeeze(0)

        # Replace pad ids with -100 (the index the cross-entropy loss ignores);
        # otherwise most of the loss comes from predicting padding.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels,
        }

# Wrap the train and test DataFrames in MathDataset, sharing one tokenizer
# and the default max_length.
train_dataset = MathDataset(data=train_data, tokenizer=tokenizer)
test_dataset = MathDataset(data=test_data, tokenizer=tokenizer)


In [None]:
from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration

# Load the pretrained "t5-base" checkpoint as a conditional-generation
# model. This is the object the Trainer fine-tunes and the Gradio callback
# later calls `generate` on.
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Define training arguments: evaluate and checkpoint every 500 steps, log
# every 100, keep only the two most recent checkpoints.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated (needs transformers >= 4.41).
    eval_strategy="steps",
    save_steps=500,
    eval_steps=500,
    per_device_train_batch_size=4,  # Reduce batch size if memory is an issue
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    # Mixed precision reduces memory usage but is only valid on CUDA;
    # requesting it on a CPU-only runtime makes TrainingArguments raise.
    fp16=torch.cuda.is_available(),
    push_to_hub=False
)

# Initialize Trainer with the torch datasets built above; the test split
# doubles as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer.__init__ (needs transformers >= 4.46); the tokenizer is saved
    # alongside checkpoints exactly as before.
    processing_class=tokenizer
)



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


In [None]:

# Start training. As the last expression in the cell, the returned
# TrainOutput (global step, mean training loss, runtime metrics) is shown
# as the cell result.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,0.3959,0.35701
1000,0.3632,0.325177
1500,0.3482,0.307672
2000,0.3176,0.296221
2500,0.2913,0.289105
3000,0.3113,0.282434
3500,0.2877,0.279065
4000,0.2728,0.276222
4500,0.301,0.273524
5000,0.2875,0.272071


Step,Training Loss,Validation Loss
500,0.3959,0.35701
1000,0.3632,0.325177
1500,0.3482,0.307672
2000,0.3176,0.296221
2500,0.2913,0.289105
3000,0.3113,0.282434
3500,0.2877,0.279065
4000,0.2728,0.276222
4500,0.301,0.273524
5000,0.2875,0.272071


TrainOutput(global_step=5607, training_loss=0.3560001044343962, metrics={'train_runtime': 5378.7139, 'train_samples_per_second': 4.168, 'train_steps_per_second': 1.042, 'total_flos': 1.365222694846464e+16, 'train_loss': 0.3560001044343962, 'epoch': 3.0})

In [None]:
import gradio as gr

# Switch the fine-tuned model to evaluation mode. Nothing is loaded from
# disk here: `model` is the in-memory object that trainer.train() updated.
model.eval()

def solve_math_problem(question):
    """Generate the model's answer to a single math word problem.

    Args:
        question (str): Problem text as entered in the Gradio textbox.

    Returns:
        str: The decoded model output with special tokens removed.
    """
    # Mirror the training pipeline: the questions the model was trained on
    # went through clean_text and were prefixed with "Solve: ".
    input_text = "Solve: " + clean_text(question)
    # Truncate to the 512-token limit used for the training inputs.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    # Training can leave the model on an accelerator; the inputs must be on
    # the same device or generate() fails with a device-mismatch error.
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=100,
        )
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer

# Build the web UI: a five-line textbox feeds solve_math_problem and the
# returned string is rendered as plain text.
interface = gr.Interface(
    title="Grade School Math Solver",
    description="Fine-tuned T5 model for solving grade school math problems.",
    fn=solve_math_problem,
    inputs=gr.Textbox(placeholder="Enter a math problem here...", lines=5),
    outputs="text",
)

# Start serving the interface
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://38fc0e0e50d90b7b97.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


