In [None]:
pip install datasets evaluate transformers rouge-score nltk

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eva

In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget for this session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Then you need to install Git-LFS by running the following cell:

In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version:

In [None]:
import transformers

# Print the installed Transformers version so it can be compared with the minimum stated above.
print(transformers.__version__)

4.44.2


In [None]:
# Name of the pretrained checkpoint; the tokenizer below is loaded from this value.
model_checkpoint = "t5-small"

This notebook is built to run with any model checkpoint from the [Model Hub](https://huggingface.co/models) as long as that model has a sequence-to-sequence version in the Transformers library. Here we picked the [`t5-small`](https://huggingface.co/t5-small) checkpoint.

## Loading the dataset

In [None]:
import pandas as pd
from evaluate import load
# Read the project spreadsheet into a DataFrame.
# NOTE(review): absolute `/content/...` path — presumably a Colab upload location; confirm before running elsewhere.
data = pd.read_excel('/content/PROJECT_DATA.xlsx')

# Load the ROUGE metric
metric = load("rouge")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Preview the first two rows to sanity-check the loaded columns and values.
data.head(2)

Unnamed: 0,SNO.,MCV,MCHC,HB,RBC,WBC,PLT,RDWCV,NEUTRO,LYMPHO,SUMMARY
0,Report1,87.7,30.1,7.3,2.77,10.0,189.0,11.4,50.1,43.2,Your report shows a hemoglobin level of 7.3 g/...
1,Report2,88.2,20.2,7.3,2.84,10.0,180.0,11.4,52.3,42.4,"In your CBC report, your hemoglobin level is 7..."


In [None]:
# Check column names.
# The printed list shows that several headers ('MCV ', 'MCHC ', 'HB ', 'RBC ', 'WBC ', 'SUMMARY ')
# end with a space; the preprocessing cell indexes the batch with those exact strings.
print(list(data.columns))


['SNO.', 'MCV ', 'MCHC ', 'HB ', 'RBC ', 'WBC ', 'PLT', 'RDWCV', 'NEUTRO', 'LYMPHO', 'SUMMARY ']


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint named in `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# The original T5 checkpoints get the "summarize: " task prefix; any other checkpoint gets none.
# ("t5-large" was previously misspelled as "t5-larg", so that checkpoint never received the prefix.)
# NOTE(review): `prefix` is not referenced by the visible preprocessing cell — confirm whether
# it is meant to be prepended to the model inputs.
T5_CHECKPOINTS = ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
prefix = "summarize: " if model_checkpoint in T5_CHECKPOINTS else ""

In [None]:
from datasets import Dataset
# Wrap the pandas DataFrame in a Hugging Face `Dataset` so it can be processed with `.map` below.
dataset = Dataset.from_pandas(data)


In [None]:
def preprocess_function(examples):
    """Tokenize one batch of CBC rows into model inputs and target labels.

    The nine numeric report columns are stringified and joined with single
    spaces to form the source text; the ``'SUMMARY '`` column is the target.

    Sequences are truncated but deliberately NOT padded here. Padding every
    example to a fixed length fills the labels with ``pad_token_id``, which the
    loss then treats as real targets, and it inflates short inputs to 1024
    tokens. Leaving the sequences unpadded lets ``DataCollatorForSeq2Seq`` pad
    each batch to its longest member and fill label padding with ``-100`` so
    that padding is ignored by the loss.

    Args:
        examples: Batch mapping of column name -> list of values, as supplied
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the (unpadded) target token ids.
    """
    # Column headers are used exactly as they appear in the spreadsheet,
    # including the trailing space some of them carry.
    feature_columns = ['MCV ', 'MCHC ', 'HB ', 'RBC ', 'WBC ', 'PLT', 'RDWCV', 'NEUTRO', 'LYMPHO']
    batch_size = len(examples[feature_columns[0]])

    # One space-separated line of values per row, in the column order above.
    input_texts = [
        " ".join(str(examples[column][row]) for column in feature_columns)
        for row in range(batch_size)
    ]

    # Tokenize the inputs (truncate only; padding is applied per batch by the collator).
    model_inputs = tokenizer(input_texts, max_length=1024, truncation=True)

    # Tokenize the target summaries the same way.
    labels = tokenizer(examples['SUMMARY '], max_length=200, truncation=True)

    # Attach the target ids under the key the model expects.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
# Display the column index again to confirm the exact header strings used as keys in `preprocess_function`.
print(data.columns)

Index(['SNO.', 'MCV ', 'MCHC ', 'HB ', 'RBC ', 'WBC ', 'PLT', 'RDWCV',
       'NEUTRO', 'LYMPHO', 'SUMMARY '],
      dtype='object')


In [None]:
# Apply preprocessing
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Train/Test Split using Hugging Face's `train_test_split`
train_test_split_datasets = tokenized_datasets.train_test_split(test_size=0.2)

# Extract train and test datasets
train_dataset = train_test_split_datasets['train']
test_dataset = train_test_split_datasets['test']


Map:   0%|          | 0/282 [00:00<?, ? examples/s]

In [None]:
from transformers import T5ForConditionalGeneration

# Load the pretrained weights for the checkpoint selected in `model_checkpoint`.
# The variable is intentionally not reassigned here: the tokenizer was loaded from the same
# variable, and redefining it in this cell could silently pair a model with the wrong tokenizer.
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
!pip install evaluate
import nltk
import numpy as np
import evaluate

# Load ROUGE metric
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may arrive wrapped in a tuple, in which case the first
            element is used.

    Returns:
        Dict with every ROUGE score scaled to 0-100 plus ``"gen_len"`` (mean
        number of non-padding tokens per prediction), all rounded to 4 places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 marks ignored/padded positions and cannot be decoded; map it back to the
    # pad token in BOTH arrays (leaving it in the predictions would also inflate gen_len).
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line before scoring, for both predictions and references.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)

    # Report scores as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Compute the average generated length (non-padding tokens per prediction)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}




In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
import nltk
import pickle

# Sentence-tokenizer data required by `nltk.sent_tokenize` in `compute_metrics`.
# Only the Punkt resources are fetched (newer NLTK releases look for `punkt_tab`);
# downloading the entire `all` collection is unnecessary for this notebook.
nltk.download('punkt')
nltk.download('punkt_tab')

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Checkpoints are written here
    # NOTE(review): newer Transformers releases name this argument `eval_strategy` — confirm
    # against the installed version before upgrading.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    predict_with_generate=True,  # Evaluate on generated text so ROUGE can be computed
    # Let evaluation generate up to the 200-token cap used for the labels; the library's
    # default generation length is much shorter and would cut every summary off early.
    generation_max_length=200,
    fp16=True,
    save_total_limit=3
)

# Data collator for Seq2Seq tasks (pads each batch; label padding is filled with -100)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # ROUGE + generated length, computed once per evaluation
)

# Start training
trainer.train()

# Save the trained model and tokenizer
model.save_pretrained("./results/trained_model")  # Save model in HuggingFace format
tokenizer.save_pretrained("./results/trained_model")  # Save tokenizer

# Optional: Save model as a .pkl file (this is not standard for Hugging Face models)
with open("./results/trained_model/model.pkl", "wb") as f:
    pickle.dump(model, f)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downlo

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,4.672013,3.2611,1.0811,2.9197,3.3014,4.8772
2,No log,3.2381,2.953,1.0473,2.5488,2.9681,3.5789
3,No log,2.770674,2.2575,0.8383,1.966,2.3656,3.0
4,5.153500,2.391119,0.7553,0.3299,0.6131,0.7553,0.6667
5,5.153500,2.157582,0.0,0.0,0.0,0.0,0.0
6,5.153500,1.965529,0.0,0.0,0.0,0.0,0.0
7,2.634700,1.785369,0.0,0.0,0.0,0.0,0.0
8,2.634700,1.635868,0.0484,0.0,0.0484,0.0484,0.6667
9,2.634700,1.525814,0.0,0.0,0.0,0.0,1.0
10,2.634700,1.439612,0.5259,0.1542,0.3598,0.4026,3.0




In [None]:
# Save the model locally after training
model.save_pretrained('C:\\Users\\sunil\\Desktop\\MODEL3')

# Save the tokenizer (if needed)
tokenizer.save_pretrained("C:\\Users\\sunil\\Desktop\\MODEL3")


('C:\\Users\\sunil\\Desktop\\MODEL3/tokenizer_config.json',
 'C:\\Users\\sunil\\Desktop\\MODEL3/special_tokens_map.json',
 'C:\\Users\\sunil\\Desktop\\MODEL3/spiece.model',
 'C:\\Users\\sunil\\Desktop\\MODEL3/added_tokens.json',
 'C:\\Users\\sunil\\Desktop\\MODEL3/tokenizer.json')

In [None]:
import os
import torch
import pickle

# Define your save path
save_path = "C:\\Users\\sunil\\Desktop\\MODEL3"

# Save the Hugging Face model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Save the model as a .pkl file.
# `os.path.join` uses the running platform's separator, so the file lands inside `save_path`
# on every OS; a hand-written "\\" is just part of the file name on POSIX systems.
pkl_save_path = os.path.join(save_path, "model.pkl")
with open(pkl_save_path, "wb") as f:
    pickle.dump(model, f)

# Report where the artifacts were written
print(f"Model saved in Hugging Face format at: {save_path}")
print(f"Model also saved as .pkl file at: {pkl_save_path}")


Model saved in Hugging Face format at: C:\Users\sunil\Desktop\MODEL3
Model also saved as .pkl file at: C:\Users\sunil\Desktop\MODEL3\model.pkl


In [None]:
import os
import torch
import pickle

# Define your save path
save_path = "C:\\Users\\sunil\\Desktop\\MODEL3"

# Save the Hugging Face model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Save the model's state_dict as a .pt file.
# `os.path.join` uses the running platform's separator, so the file lands inside `save_path`
# on every OS; a hand-written "\\" is just part of the file name on POSIX systems.
pt_save_path = os.path.join(save_path, "model.pt")
torch.save(model.state_dict(), pt_save_path)

# Optionally, also pickle the whole model object (note: this is the model, not the tokenizer)
pkl_save_path = os.path.join(save_path, "model.pkl")
with open(pkl_save_path, "wb") as f:
    pickle.dump(model, f)

# Report where the artifacts were written
print(f"Model saved in Hugging Face format at: {save_path}")
print(f"Model saved as .pt file at: {pt_save_path}")
print(f"Model also saved as .pkl file at: {pkl_save_path}")


Model saved in Hugging Face format at: C:\Users\sunil\Desktop\MODEL3
Model saved as .pt file at: C:\Users\sunil\Desktop\MODEL3\model.pt
Model also saved as .pkl file at: C:\Users\sunil\Desktop\MODEL3\model.pkl


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load trained model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained('C:\\Users\\sunil\\Desktop\\MODEL3')
tokenizer = AutoTokenizer.from_pretrained('C:\\Users\\sunil\\Desktop\\MODEL3')

# Sample report, in the training column order:
# MCV, MCHC, HB, RBC, WBC, PLT, RDWCV, NEUTRO, LYMPHO.
# The text is built exactly like the training inputs (str() of each value, joined by single
# spaces) so the model sees the format it was fine-tuned on.
sample_values = [86.5, 30.1, 1.8, 4.15, 10.9, 158.0, 12.8, 71.9, 18.1]
test_data = [" ".join(str(value) for value in sample_values)]

# Tokenize test data
inputs = tokenizer(test_data, return_tensors="pt", padding=True, truncation=True)

# Generate the summary. `**inputs` forwards both `input_ids` and `attention_mask`.
# Only `max_new_tokens` is set: combining it with `max_length` triggers a warning and
# `max_new_tokens` takes precedence anyway.
generated_output = model.generate(
    **inputs,
    max_new_tokens=400,  # Upper bound on the number of generated tokens
    num_beams=4,  # Beam search for higher quality output
    early_stopping=True  # Stop once enough finished beams are available
)

# Decode the generated output
generated_text = tokenizer.decode(generated_output[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)


Both `max_new_tokens` (=400) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


The report shows a hemoglobin level of 1.8 g/dL and an RBC count of 4.15 million cells/L, which is normal, with neutrophils at 71.9% and lymphocytes at 11%. The platelet count of 10.9 thousand cells/L is normal, with neutrophils at 71.9% and lymphocytes at 11%. The platelet count of 158 thousand cells/L is normal, with neutrophils at 71.9% and lymphocytes at 10.9 reflects a balanced immune system. Overall, this profile suggests mild anemia.
