**Author:** Pratik Vyas

**Classification:** Binary classification ( Cancer , non-Cancer)

**Usecase:** Finetuning LLM (SequenceClassification) 'distilbert-base-uncased'

**Finetuning Metrics comparison:** [Metrics comparison](https://github.com/Git-PratikVyas/Finetuning-LORA/blob/main/CancerClassification/distilbert_Analyse_Cancer_Classifier_finetuning_Result.md)

In [30]:
# Transformers installation
!pip3 install transformers datasets
!pip3 install evaluate
!pip3 install -q -U bitsandbytes


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Load Cancer Data

In [None]:
import zipfile
import pandas as pd
import io
import os


def load_text_files_from_zip_to_dataframe(zip_path, directory_in_zip, **pandas_kwargs):
    """
    Loads text files from a specific directory inside a ZIP archive into a Pandas DataFrame.
    """
    data = []
    filenames = []
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            for filename in zip_ref.namelist():
                if filename.startswith(directory_in_zip) and filename.endswith(
                    ".txt"
                ):  # Only process text files
                    try:
                        with zip_ref.open(filename) as text_file:
                            content = (
                                text_file.read()
                                .decode("utf-8", errors="ignore")
                                .strip()
                            )  # Read file contents, decode from bytes
                            data.append(content)
                            # Extract the filename without the directory path and extension
                            file_no_path = os.path.basename(filename)
                            file_no_ext, _ = os.path.splitext(file_no_path)
                            filenames.append(file_no_ext)
                    except Exception as e:
                        print(f"Error reading file {filename}: {e}")
                        continue  # Skip to the next file

        if not data:
            print(f"No text files found in directory: {directory_in_zip}")
            return None

        df = pd.DataFrame({"texts": data, "filename": filenames})  # Create DataFrame
        return df

    except FileNotFoundError:
        print(f"Error: ZIP file not found: {zip_path}")
        return None
    except zipfile.BadZipFile as e:
        print(f"Error: Invalid ZIP file: {e}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
zip_path = "Dataset.zip"
cancer_df_row = load_text_files_from_zip_to_dataframe(zip_path, "Dataset/Cancer")
cancer_df_row["label"] = 1

non_cancer_df_row = load_text_files_from_zip_to_dataframe(
    zip_path, "Dataset/Non-Cancer"
)
non_cancer_df_row["label"] = 0

display(cancer_df_row.head())
display(non_cancer_df_row.head())

Unnamed: 0,texts,filename,label
0,<ID:31055803>\nTitle: [Analysis of age-specifi...,31055803,1
1,<ID:31164412>\nTitle: T-Cell Deletion of MyD88...,31164412,1
2,<ID:31094905>\nTitle: MYCN Amplified Relapse F...,31094905,1
3,<ID:31498304>\nTitle: In Vivo Inhibition of Mi...,31498304,1
4,<ID:30897768>\nTitle: Breast Cancer and miR-SN...,30897768,1


Unnamed: 0,texts,filename,label
0,<ID:25486933>\nTitle: Strategy for identifying...,25486933,0
1,<ID:28699658>\nTitle: Telomere length and soma...,28699658,0
2,<ID:31026806>\nTitle: Transcription Factor STA...,31026806,0
3,"<ID:30255984>\nTitle: Genetics, genomics, and ...",30255984,0
4,<ID:26706013>\nTitle: Genotype/Phenotype Corre...,26706013,0


concat cancer_df_row, non_cancer_df_row

In [142]:
data_df = pd.concat([cancer_df_row, non_cancer_df_row], ignore_index=True)
display(data_df)

Unnamed: 0,texts,filename,label
0,<ID:31055803>\nTitle: [Analysis of age-specifi...,31055803,1
1,<ID:31164412>\nTitle: T-Cell Deletion of MyD88...,31164412,1
2,<ID:31094905>\nTitle: MYCN Amplified Relapse F...,31094905,1
3,<ID:31498304>\nTitle: In Vivo Inhibition of Mi...,31498304,1
4,<ID:30897768>\nTitle: Breast Cancer and miR-SN...,30897768,1
...,...,...,...
995,<ID:26095439>\nTitle: Urinary 11beta-PGF2alpha...,26095439,0
996,<ID:24850616>\nTitle: A limited form of proteu...,24850616,0
997,<ID:24402730>\nTitle: Benign mast cell hyperpl...,24402730,0
998,<ID:26513044>\nTitle: Nevus anemicus associate...,26513044,0


split data into train,test,val

In [None]:
from sklearn.model_selection import train_test_split


def split_train_test_val(
    df, train_size=0.6, val_size=0.2, test_size=0.2, random_state=42
):
    """
    Splits a DataFrame into train, test, and validation sets with specified ratios.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a Pandas DataFrame.")

    if sum([train_size, test_size, val_size]) != 1.0:
        raise ValueError("Train, test, and validation sizes must sum to 1.")

    # First split into training and remaining data
    train_df, remaining_df = train_test_split(
        df, train_size=train_size, random_state=random_state
    )

    # Calculate the proportion for test and validation sets from the remaining data
    remaining_proportion = 1 - train_size
    test_proportion = test_size / remaining_proportion
    val_proportion = val_size / remaining_proportion

    # Split the remaining data into test and validation sets
    test_df, val_df = train_test_split(
        remaining_df, test_size=test_proportion, random_state=random_state
    )

    return train_df, test_df, val_df

In [None]:
train_df, test_df, val_df = split_train_test_val(
    data_df, train_size=0.6, val_size=0.2, test_size=0.2
)


## train_df reset index
train_df.rename(
    columns={"filename": "ID"}, inplace=True
)  # 1. Rename 'filename' column to 'ID'
train_df = train_df.set_index(
    "ID"
)  # 2. Set 'ID' as the index (this will shift current index to the side)
train_df = (
    train_df.reset_index()
)  # 3. Reset the index to remove the old index (the original integer index)

## test_df reset index
test_df.rename(
    columns={"filename": "ID"}, inplace=True
)  # 1. Rename 'filename' column to 'ID'
test_df = test_df.set_index(
    "ID"
)  # 2. Set 'ID' as the index (this will shift current index to the side)
test_df = (
    test_df.reset_index()
)  # 3. Reset the index to remove the old index (the original integer index)

## val_df reset index
val_df.rename(
    columns={"filename": "ID"}, inplace=True
)  # 1. Rename 'filename' column to 'ID'
val_df = val_df.set_index(
    "ID"
)  # 2. Set 'ID' as the index (this will shift current index to the side)
val_df = (
    val_df.reset_index()
)  # 3. Reset the index to remove the old index (the original integer index)

print("Cancer Train DataFrame:")
display(train_df.head())
print(f"Train shape: {train_df.shape}")

print("\nCancer Test DataFrame:")
display(test_df.head())
print(f"Test shape: {test_df.shape}")

print("\nCancer Validation DataFrame:")
display(val_df.head())
print(f"Validation shape: {val_df.shape}")

Cancer Train DataFrame:


Unnamed: 0,ID,texts,label
0,31186051,<ID:31186051>\nTitle: Diverse regulatory manne...,1
1,31681608,<ID:31681608>\nTitle: Epigenetic Targets in Sy...,1
2,27558955,<ID:27558955>\nTitle: Molecular hydrogen suppr...,0
3,29930110,<ID:29930110>\nTitle: Analysis of shared herit...,0
4,36928589,<ID:36928589>\nTitle: Synthetic multiantigen M...,0


Train shape: (600, 3)

Cancer Test DataFrame:


Unnamed: 0,ID,texts,label
0,29728436,<ID:29728436>\nTitle: Splenic cyst and its man...,0
1,24906339,<ID:24906339>\nTitle: Tracking histone variant...,0
2,31602389,<ID:31602389>\nTitle: Molecular Approaches to ...,1
3,31244296,<ID:31244296>\nTitle: Identification of FLT3 a...,1
4,36610490,<ID:36610490>\nTitle: Phase I Study: Safety an...,1


Test shape: (200, 3)

Cancer Validation DataFrame:


Unnamed: 0,ID,texts,label
0,38345107,<ID:38345107>\nTitle: Similarities and differe...,0
1,24777453,<ID:24777453>\nTitle: Rare and low-frequency c...,0
2,30973434,<ID:30973434>\nTitle: The Effect of Inhaler Ar...,1
3,31177124,<ID:31177124>\nTitle: Paraoxonase 1 (PON1) Q19...,1
4,31347028,<ID:31347028>\nTitle: Clinical Evaluation of I...,1


Validation shape: (200, 3)


preprocess text

In [None]:
# Function to perform the text processing

import re


def process_text(row, isTestSetOrValSet):
    # Compile the regex pattern (case-insensitive)
    abstract_pattern = re.compile(
        r"^\s*Abstract:\s*", re.IGNORECASE | re.MULTILINE
    )  # Precompile regex

    if not bool(abstract_pattern.search(row["texts"])):
        text = "Missing Abstract"
        return text

    # Remove identifiers such as 'ID:', 'Title:', 'Abstract:'
    # normalized_text = re.sub(r"(ID:|Title:|Abstract:)", "", row['texts'], flags=re.IGNORECASE)
    normalized_text = re.sub(
        r"<(?:ID:\d+)>|Title:|Abstract:", "", row["texts"], flags=re.IGNORECASE
    )

    # Standardize common abbreviations
    normalized_text = re.sub(
        r"\bet al\.\b", "and others", normalized_text, flags=re.IGNORECASE
    )
    normalized_text = re.sub(
        r"\bvol\.\b", "volume", normalized_text, flags=re.IGNORECASE
    )

    # Remove common punctuation marks, but leave periods at the end of sentences
    normalized_text = re.sub(r"[,;@#$%^&*()]", "", normalized_text)

    # Ensure consistent spacing, replaces multiple spaces with single ones, and adds space after periods.
    normalized_text = " ".join(normalized_text.split())
    normalized_text = re.sub(r"\.(?=[A-Z])", ". ", normalized_text)

    return normalized_text

In [None]:
# Apply the processing function with the param isTestSetOrValSet
train_df["text"] = train_df.apply(
    lambda row: process_text(row, isTestSetOrValSet=False), axis=1
)
train_df.drop(columns=["texts"], inplace=True)  # Drop the original 'texts' column

test_df["text"] = test_df.apply(
    lambda row: process_text(row, isTestSetOrValSet=True), axis=1
)
test_df.drop(columns=["texts"], inplace=True)  # Drop the original 'texts' column

val_df["text"] = val_df.apply(
    lambda row: process_text(row, isTestSetOrValSet=True), axis=1
)
val_df.drop(columns=["texts"], inplace=True)  # Drop the original 'texts' column

In [147]:
train_df.head()

Unnamed: 0,ID,label,text
0,31186051,1,Diverse regulatory manners of human telomerase...
1,31681608,1,Epigenetic Targets in Synovial Sarcoma: A Mini...
2,27558955,0,Molecular hydrogen suppresses activated Wnt/be...
3,29930110,0,Analysis of shared heritability in common diso...
4,36928589,0,Synthetic multiantigen MVA vaccine COH04S1 and...


convert into dictionary

In [None]:
train_df_dict = train_df.to_dict(orient="list")
test_df_dict = test_df.to_dict(orient="list")
val_df_dict = val_df.to_dict(orient="list")

In [None]:
from datasets import Dataset
import pandas as pd

# Dataset with values as dictionary of col , val
training_dataset = Dataset.from_pandas(pd.DataFrame(train_df_dict))
test_dataset = Dataset.from_pandas(pd.DataFrame(test_df_dict))
val_dataset = Dataset.from_pandas(pd.DataFrame(val_df_dict))

# Removes the ID
training_dataset = training_dataset.remove_columns(["ID"])  # Removes the ID
test_dataset = test_dataset.remove_columns(["ID"])  # Removes the ID
val_dataset = val_dataset.remove_columns(["ID"])  # Removes the ID

In [150]:
training_dataset[0]

{'label': 1,
 'text': 'Diverse regulatory manners of human telomerase reverse transcriptase. Human telomerase reverse transcriptase hTERT is the core subunit of human telomerase and plays important roles in human cancers. Aberrant expression of hTERT is closely associated with tumorigenesis cancer cell stemness maintaining cell proliferation apoptosis inhibition senescence evasion and metastasis. The molecular basis of hTERT regulation is highly complicated and consists of various layers. A deep and full-scale comprehension of the regulatory mechanisms of hTERT is pivotal in understanding the pathogenesis and searching for therapeutic approaches. In this review we summarize the recent advances regarding the diverse regulatory mechanisms of hTERT including the transcriptional promoter mutation promoter region methylation and histone acetylation post-transcriptional mRNA alternative splicing and non-coding RNAs and post-translational levels phosphorylation and ubiquitination which may pr

There are two fields in this dataset:

- `text`: cancer text.
- `label`: a value that is either `0` for non-cancer or `1` for cancer

## Preprocess

The next step is to load a DistilBERT tokenizer to preprocess the `text` field:

In [None]:
import os

model_id = "distilbert-base-uncased"

os.environ["HF_TOKEN"] = "HF_TOKEN"
os.environ["WB_KEY"] = "WB_KEY"

In [152]:
# integrate Weights & Biases (W&B) with training process for tracking, monitoring, and collaboration
import os
import wandb

wandb.login(key=os.environ["WB_KEY"])
run = wandb.init(
    project="cancer_classification_bert",
    job_type="training",
    anonymous="allow",
)



0,1
eval/accuracy,▁▆▁▄▄▅▆████
eval/loss,█▅██▇▇▇▅▃▁▁
eval/runtime,█▁▁▄▄▄▄▄▄▄▄
eval/samples_per_second,▅▅▁▆▇██▅▄▄▆
eval/steps_per_second,▁█▇▂▂▃▃▂▂▂▂
train/epoch,█████▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆███▁▁▂▂▃▃▃▄▄▅▅▅▆▇▇██
train/global_step,▆▆▁▁▁▁▁▂▂▃▃▄▄▄▅▅▅▆▆▆▇███▁▂▂▃▃▄▅▅▅▆▆▇▇▇██
train/grad_norm,▃▁▁▅▁▃▁▁▂▃▃▂▄▃▂▅▂▇▆▃▃▃▆▅▇▅▇▇▅▅▅▇▅▆▇▅█▆█
train/learning_rate,███▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁██▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁
train/loss,████▇▇██▇█▇▇▇▇▇▇▇▆▇▇▆▆▅▅▄▄▄▄▃▂▃▂▃▄▂▂▁▁▃

0,1
eval/accuracy,0.95
eval/loss,0.31324
eval/runtime,83.5359
eval/samples_per_second,1.197
eval/steps_per_second,0.084
total_flos,39254850768912.0
train/epoch,1.0
train/global_step,19.0
train/grad_norm,3.08052
train/learning_rate,0.0


# Load Tokenizer and Model

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_id, use_auth_token=os.environ["HF_TOKEN"]
)



Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than DistilBERT's maximum input length:

In [156]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

To apply the preprocessing function over the entire dataset,   

speed up `map` by setting `batched=True` to process multiple elements of the dataset at once:

In [157]:
tokenized_training_dataset = training_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

create a batch of examples using DataCollatorWithPadding

It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [159]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

evaluate during training

In [161]:
import evaluate

accuracy = evaluate.load("accuracy")

Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`:

In [165]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

Load model with **AutoModelForSequenceClassification** along with the number of expected labels, and the label mappings:

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    use_auth_token=os.environ["HF_TOKEN"],
)



# Model evaluation before fine-tuning

In [None]:
from transformers import pipeline, logging
import torch
from tqdm import tqdm


def predict(test, model, tokenizer):
    y_pred = []
    y_score = []
    y_score_for_cancer = []
    y_score_for_noncancer = []

    # Assuming 'test' is a list or iterable that you're looping through
    total_iterations = len(test)
    update_interval = max(1, total_iterations // 5)  # Update every 20% (minimum 1)

    logging.set_verbosity_error()  # Suppress warnings and informational messages

    classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

    with tqdm(total=total_iterations, desc="Processing") as pbar:
        for i in range(len(test)):
            # for i in (range(5)):

            text = test["text"][0]

            prediction = classifier(text)
            label = prediction[0]["label"]
            score = prediction[0]["score"]

            y_score.append(score)

            predicted_class_id = label2id[label]
            y_pred.append(predicted_class_id)

            ##----seperate cancer,nonCancer score
            if predicted_class_id == 1:
                y_score_for_cancer.append(score)
            else:
                y_score_for_noncancer.append(score)

            ## Threshold adjustment for prob score
            # if predicted_class_id == 0:
            #   ##setting threshold above which predict as NEGATIVE
            #   if score > 0.51:
            #     y_pred.append(0)
            #   else:
            #     y_pred.append(1)
            # else:
            #   y_pred.append(predicted_class_id)

            # Update every 20%
            if (i + 1) % update_interval == 0 or (i + 1) == total_iterations:
                pbar.update(update_interval)  # Increment progress bar

    return y_pred, y_score, y_score_for_cancer, y_score_for_noncancer


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


def evaluate(y_true, y_pred):
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f"Accuracy: {accuracy:.3f}")

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f"Accuracy for label {label}: {accuracy:.3f}")

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print("\nClassification Report:")
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print("\nConfusion Matrix:")
    print(conf_matrix)

In [None]:
y_pred, y_score = predict(test_dataset, model, tokenizer)

Processing: 100%|██████████| 200/200 [01:23<00:00,  2.39it/s]


In [None]:
y_true = test_dataset["label"]

In [173]:
logging.set_verbosity_error()  # Suppress warnings and informational messages
evaluate(y_true, y_pred)

Accuracy: 0.495
Accuracy for label 0: 0.000
Accuracy for label 1: 1.000

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       101
           1       0.49      1.00      0.66        99

    accuracy                           0.49       200
   macro avg       0.25      0.50      0.33       200
weighted avg       0.25      0.49      0.33       200


Confusion Matrix:
[[  0 101   0]
 [  0  99   0]
 [  0   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Model finetuning

function that passes your predictions and labels to compute to calculate the accuracy during finetuning

In [None]:
import numpy as np


# Define and compute the metrics.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(labels, pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, pred, average="binary"
    )

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return accuracy.compute(predictions=predictions, references=labels)

In [None]:
training_args = TrainingArguments(
    output_dir="outputs_model_training",
    learning_rate=2e-5,  # 1e-4
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="steps",
    eval_strategy="steps",
    eval_steps=0.2,
    save_steps=0.2,
    logging_steps=1,  # ADDED
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_val_dataset,
    # tokenizer=tokenizer,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


trainer.train()

  trainer = Trainer(


{'loss': 0.743, 'grad_norm': 2.4259843826293945, 'learning_rate': 2e-05, 'epoch': 0.02631578947368421}
{'loss': 0.6869, 'grad_norm': 1.0518428087234497, 'learning_rate': 1.9473684210526318e-05, 'epoch': 0.05263157894736842}
{'loss': 0.6864, 'grad_norm': 1.9483343362808228, 'learning_rate': 1.894736842105263e-05, 'epoch': 0.07894736842105263}
{'loss': 0.6977, 'grad_norm': 1.1464289426803589, 'learning_rate': 1.8421052631578947e-05, 'epoch': 0.10526315789473684}
{'loss': 0.7029, 'grad_norm': 0.9467798471450806, 'learning_rate': 1.7894736842105264e-05, 'epoch': 0.13157894736842105}
{'loss': 0.6933, 'grad_norm': 1.029748558998108, 'learning_rate': 1.736842105263158e-05, 'epoch': 0.15789473684210525}
{'loss': 0.7025, 'grad_norm': 2.184460401535034, 'learning_rate': 1.6842105263157896e-05, 'epoch': 0.18421052631578946}
{'loss': 0.7026, 'grad_norm': 0.8858237266540527, 'learning_rate': 1.6315789473684213e-05, 'epoch': 0.21052631578947367}
{'eval_loss': 0.689586877822876, 'eval_accuracy': 0.54

TrainOutput(global_step=38, training_loss=0.6761058270931244, metrics={'train_runtime': 2568.4329, 'train_samples_per_second': 0.234, 'train_steps_per_second': 0.015, 'train_loss': 0.6761058270931244, 'epoch': 1.0})

# Evaluate model after finetuning

In [None]:
y_pred, y_score, y_score_for_cancer, y_score_for_noncancer = predict(
    test_dataset, model, tokenizer
)

Processing: 100%|██████████| 200/200 [01:27<00:00,  2.29it/s]


In [None]:
y_true = test_dataset["label"]

In [249]:
logging.set_verbosity_error()  # Suppress warnings and informational messages
evaluate(y_true, y_pred)

Accuracy: 0.565
Accuracy for label 0: 0.693
Accuracy for label 1: 0.434

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.69      0.62       101
           1       0.58      0.43      0.50        99

    accuracy                           0.56       200
   macro avg       0.57      0.56      0.56       200
weighted avg       0.57      0.56      0.56       200


Confusion Matrix:
[[70 31  0]
 [56 43  0]
 [ 0  0  0]]


sample model output

In [None]:
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

text = test_dataset["text"][0]
actual_label = test_dataset["label"][0]
prediction = classifier(text)

label = prediction[0]["label"]
score = prediction[0]["score"]


print(f"predicted label : {label} , Score: {score}")
print(f"actual label: {id2label[actual_label]}")

predicted label : NEGATIVE , Score: 0.5051034688949585
actual label: NEGATIVE




Once training is completed, share your model to the Hub with the push_to_hub() method for future use

In [None]:
trainer.push_to_hub()

In [256]:
wandb.finish()
model.config.use_cache = True

0,1
eval/accuracy,▁▆██
eval/f1,▁▇██
eval/loss,█▆▄▁
eval/precision,▅▁█▃
eval/recall,▁█▆█
eval/runtime,▁█▄▂
eval/samples_per_second,█▁▅▇
eval/steps_per_second,█▁██
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.76
eval/f1,0.79661
eval/loss,0.64711
eval/precision,0.74016
eval/recall,0.86239
eval/runtime,164.7439
eval/samples_per_second,1.214
eval/steps_per_second,0.079
total_flos,78364815320544.0
train/epoch,1.0
