In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer
import gc
import torch
from sklearn.model_selection import train_test_split
# Load and preprocess dataset
df = pd.read_csv("informal_to_academic_dataset.csv", encoding="ISO-8859-1")
df = df.rename(columns={"Informal Text": "source", "Academic Text": "target"})
# Split into train and eval datasets
train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)
raw_datasets = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "eval": Dataset.from_pandas(eval_df)
})

# Initialize the tokenizer and model
model_name = "google/flan-t5-large"  # Pre-trained Flan-T5 model
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Ensure padding token is set
tokenizer.pad_token = tokenizer.eos_token

# Preprocessing function
def preprocess_function(examples):
    inputs = [f"Rewrite in academic style: {text}" for text in examples["source"]]
    targets = examples["target"]

    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)
    labels = tokenizer(targets, max_length=512, truncation=True, padding=True).input_ids

    # Replace padding token IDs in labels with -100 for loss computation
    labels = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels]
    model_inputs["labels"] = labels
    return model_inputs

# Tokenize the datasets
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan_t5_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=1e-4,  # Lower learning rate for fine-tuning
    per_device_train_batch_size=2,  # Adjust based on your GPU memory
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # To simulate larger batch size
    num_train_epochs=4,
    predict_with_generate=True,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=10,
    load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="loss",  # Use loss as evaluation metric
)

# Clear GPU memory before training
torch.cuda.empty_cache()
gc.collect()

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer,
)

# Train the model
for epoch in range(int(training_args.num_train_epochs)):
    print(f"Starting epoch {epoch + 1}")
    trainer.train()

    # Clear GPU memory after each epoch
    torch.cuda.empty_cache()
    gc.collect()


# Save the fine-tuned model
model.save_pretrained("./fine_tuned_flan_t5_2")
tokenizer.save_pretrained("./fine_tuned_flan_t5_2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/221 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Starting epoch 1


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 23.06 MiB is free. Process 2975 has 14.72 GiB memory in use. Of the allocated memory 14.49 GiB is allocated by PyTorch, and 107.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [17]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the model and tokenizer
model_path = "./fine_tuned_flan_t5_2"  # Path to your fine-tuned model
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path, legacy=False)

# Ensure that model is on the right device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Debugging generation with simple text transformation
input_text = "Machine learning is like a huge trend these days, right? Everyone is talking about how it’s changing the world(1), and honestly, it’s all about using data to predict stuff. So, what it really does is take a bunch of data, try to figure out hidden patterns or relationships(5], and then use those patterns to guess what might happen next{6}. It's not like magic, but it's kind of like having a super smart system that can make decisions based on the past data. In healthcare, machine learning is helping doctors and hospitals make better decisions, like detecting diseases or even predicting how someone might react to a treatment. The cool part is that it’s not just limited to one thing—it can be used for diagnosing, classifying, or even forecasting future health issues. But hey, it’s still developing, so it’s not perfect yet, but it’s definitely a step forward. People are super excited about it!"

# Tokenize the input
inputs = tokenizer(f"Rewrite in academic style: {input_text}", return_tensors="pt", padding=True, truncation=True)

# Move input tensors to the right device
inputs = {key: value.to(device) for key, value in inputs.items()}

# Generate output
outputs = model.generate(
    **inputs,
    max_length=512,  # Adjust this based on your preference
    num_return_sequences=3,  # Just generate one sequence for testing
    temperature=1.0,  # Adjust temperature for randomness
    do_sample=True,  # Enable sampling
)

# Print generated sequences
for idx, output in enumerate(outputs):
    generated_text = tokenizer.decode(output, skip_special_tokens=True)
    print(f"\n Generated Text {idx + 1}: {generated_text}")



 Generated Text 1: <pad> Machine learning is a major trend observed in the past decade[2]. As a result, it is widely known that it has transformed the world[3]. It aims at using data to predict things; in order to do so, it must have a way of taking a small set of data, examine underlying patterns or connections, and then use these patterns to predict the reaction of the next step[5]. Moreover, it does not require supernatural skills nor do it employ a superintelligence system that can perform decisions based on the past data. In healthcare, machine learning has helped physicians and hospitals to make better decisions such as diseases detection or predict how a patient might react to a treatment. The coolest aspect of it is that it can be used for detecting, classifying or forecasting future health issues. Still, it is still evolving and it is not perfect yet; it is a step forward. People are very enthusiastic about it! <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [11]:
import shutil

# Path to your model and tokenizer directory
model_directory = './fine_tuned_flan_t5_2'

# Create a zip file of the model directory
shutil.make_archive(model_directory, 'zip', model_directory)


'/content/fine_tuned_flan_t5_2.zip'

In [14]:
from google.colab import files

# Path to the zip file
zip_file = "./fine_tuned_flan_t5_2.zip"

# Download the zip file
files.download(zip_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
import os

from google.colab import drive
drive.mount('/content/drive')

# Specify the path to save the model in Google Drive
drive_model_path = "/content/drive/MyDrive/fine_tuned_flan_t5_2"  # Change "MyDrive" to the desired folder
os.makedirs(drive_model_path, exist_ok=True)  # Create directory if it doesn't exist

# Save the model and tokenizer
model.save_pretrained(drive_model_path)
tokenizer.save_pretrained(drive_model_path)

print(f"Model saved to Google Drive at: {drive_model_path}")

Mounted at /content/drive
Model saved to Google Drive at: /content/drive/MyDrive/fine_tuned_flan_t5_2


In [18]:
!pip install evaluate sentence-transformers rouge_score


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c590205c6104c81d4a674e988bb93e367ad26adb4f81c62d715b56aa0e2f6ade
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.3 rouge_score-0.1.2


In [19]:
import evaluate
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# Load evaluation metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Load pre-trained model and tokenizer
model_name = "google/flan-t5-base"  # Replace with your fine-tuned model if available
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load SentenceTransformer for semantic similarity
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Reference text for evaluation
reference_text = (
    "Machine Learning (ML), a subset of Artificial Intelligence (AI), focuses on deriving insights from data. "
    "This process comprises two key phases: (1) predicting hidden relationships within datasets and (2) utilizing "
    "such predictions to forecast system behavior. ML has gained substantial traction in healthcare, enabling advancements "
    "in detection, diagnosis, and treatment. The primary objective of ML techniques is to develop models capable of classification, "
    "prediction, and estimation, with classification being the most prevalent due to its high accuracy when optimally executed."
)

reference_embedding = sbert_model.encode(reference_text, convert_to_tensor=True)

# Input text for the model
input_texts = [
    "Rewrite in academic style:Machine Learning (ML) is a branch of AI that focuses on learning from data. The process usually involves two main phases: (1) predicting unknown relationships in a dataset, and (2) using those predictions to guess future outcomes of the system. (3) ML has been widely applied in healthcare, helping with detection, diagnosis, and even treatment. The goal of ML techniques is to create models that can perform tasks like classification, prediction, and estimation. Among these, classification is the most popular method because it can produce highly accurate results if done well."
]

# Generate and evaluate outputs
for idx, input_text in enumerate(input_texts):
    print(f"\nInput Text {idx + 1}: {input_text}")

    # Tokenize the input and generate output
    inputs = tokenizer(
        f"Rewrite in academic style: {input_text}",
        return_tensors="pt",
        max_length=512,
        truncation=True
    )
    outputs = model.generate(
        inputs.input_ids,
        max_length=512,
        num_beams=8,
        temperature=0.7,
        do_sample=True,
        num_return_sequences=1
        )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated Text {idx + 1}: {generated_text}")

    # BLEU Score
    bleu_score = bleu.compute(
        predictions=[generated_text],
        references=[reference_text]
    )["bleu"]
    print(f"BLEU Score for Text {idx + 1}: {bleu_score}")

    # ROUGE Scores
    rouge_score = rouge.compute(
        predictions=[generated_text],
        references=[reference_text]
    )
    print(f"ROUGE Scores for Text {idx + 1}: {rouge_score}")

    # Semantic Similarity
    generated_embedding = sbert_model.encode(generated_text, convert_to_tensor=True)
    similarity = util.cos_sim(generated_embedding, reference_embedding).item()
    print(f"Semantic Similarity Score for Text {idx + 1}: {similarity}")


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Input Text 1: Rewrite in academic style:Machine Learning (ML) is a branch of AI that focuses on learning from data. The process usually involves two main phases: (1) predicting unknown relationships in a dataset, and (2) using those predictions to guess future outcomes of the system. (3) ML has been widely applied in healthcare, helping with detection, diagnosis, and even treatment. The goal of ML techniques is to create models that can perform tasks like classification, prediction, and estimation. Among these, classification is the most popular method because it can produce highly accurate results if done well.
Generated Text 1: Machine Learning (ML) is a branch of AI that focuses on learning from data. The process usually involves two main phases: (1) predicting unknown relationships in a dataset, and (2) using those predictions to guess future outcomes of the system. The goal of ML techniques is to create models that can perform tasks like classification, prediction, and estimation