In [62]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification
)
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import os
from torch.utils.data import TensorDataset
from sklearn.metrics import confusion_matrix,classification_report
from torch.utils.data import DataLoader


In [30]:
# Add path to your root dir (Training Folder)

folder_path= 'E:\\Case Comp\\NEST\\Training\\'
epochs = 1

In [31]:
# Step 1: Load Dataset
def load_data(file_path):
    """Loads the dataset and provides an initial overview."""
    data = pd.read_csv(file_path)
    print("Initial Dataset Info:\n", data.info())
    print("\nSample Data:\n", data.head())
    return data


df = load_data(folder_path + "usecase_3_.csv")
test_df = pd.read_excel(folder_path + "TESTWITHSTUDY.xlsx")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257577 entries, 0 to 257576
Data columns (total 32 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0.1                257577 non-null  int64  
 1   Unnamed: 0                  257577 non-null  int64  
 2   NCT Number                  257577 non-null  object 
 3   Study Title                 257577 non-null  object 
 4   Study URL                   257577 non-null  object 
 5   Acronym                     63991 non-null   object 
 6   Study Status                257577 non-null  object 
 7   Brief Summary               257577 non-null  object 
 8   Study Results               257577 non-null  object 
 9   Conditions                  257577 non-null  object 
 10  Interventions               234064 non-null  object 
 11  Primary Outcome Measures    247086 non-null  object 
 12  Secondary Outcome Measures  185779 non-null  object 
 13  Other Outcome 

In [32]:
df = df.head(500)
test_df = test_df.head(500)

In [33]:
list(df.columns)

['Unnamed: 0.1',
 'Unnamed: 0',
 'NCT Number',
 'Study Title',
 'Study URL',
 'Acronym',
 'Study Status',
 'Brief Summary',
 'Study Results',
 'Conditions',
 'Interventions',
 'Primary Outcome Measures',
 'Secondary Outcome Measures',
 'Other Outcome Measures',
 'Sponsor',
 'Collaborators',
 'Sex',
 'Age',
 'Phases',
 'Enrollment',
 'Funder Type',
 'Study Type',
 'Study Design',
 'Other IDs',
 'Start Date',
 'Primary Completion Date',
 'Completion Date',
 'First Posted',
 'Results First Posted',
 'Last Update Posted',
 'Locations',
 'Study Documents']

In [34]:
test_df = test_df.dropna(how='all')
test_df.shape

(500, 31)

In [35]:
# Combine all text attributes into a single column 'Unstructured'
df["Outcomes"] = df[
    ["Primary Outcome Measures", "Conditions", "Secondary Outcome Measures", "Other Outcome Measures"]
].astype(str).agg(" [SEP] ".join, axis=1)

test_df["Outcomes"] = test_df[
    ["Primary Outcome Measures", "Conditions", "Secondary Outcome Measures", "Other Outcome Measures"]
].astype(str).agg(" [SEP] ".join, axis=1)

In [36]:
# Drop the original text columns
df = df[["Brief Summary", "Study Title", "Conditions", "Outcomes", "NCT Number", "Study Status"]]
test_df = test_df[["Brief Summary", "Study Title", "Conditions", "Outcomes", "NCT Number", "Study Status"]]
# Display updated DataFrame
print(df)

                                         Brief Summary  \
0    The hypothesis of this study is use of CytoSor...   
1    The primary purpose of this study is to find o...   
2    To study the impact of 3 day exposure to atorv...   
3    The purpose of this study is to identify the f...   
4    The proposed protocol is a double-blind, place...   
..                                                 ...   
495  To analyze heterogeneity in ADHD experts in la...   
496  Periodontitis patients, 40 cigarette smokers a...   
497  This was a double-blinded randomized controlle...   
498  This clinical trial is Phase II trial for eval...   
499  This is a single-center, open-label phase I cl...   

                                           Study Title  \
0    Efficacy Study of CytoSorb Hemoperfusion Devic...   
1    Safety and Tolerability Study of AZD7762 in Co...   
2    Does Atorvastatin Reduce Ischemia-Reperfusion ...   
3    Comparison of Dynamic Radiographs in Determini...   
4    A Placeb

In [37]:
# 1. Select Specific Columns
selected_columns = ["Brief Summary", "Study Title", "Conditions", "Outcomes", "NCT Number", "Study Status"]

df = df[selected_columns]
test_df = test_df[selected_columns]

# 2. Display the Updated DataFrame
print("Updated Training DataFrame (df):")
print(df.head())  # Display the first few rows for brevity

print("\nUpdated Test DataFrame (test_df):")
print(test_df.head())

# 3. Verify and Handle Missing Data

def verify_and_handle_missing_data(dataframe, dataframe_name="DataFrame"):
    # Check for missing values
    missing_values = dataframe.isnull().sum()
    print(f"\nMissing Values in {dataframe_name}:")
    print(missing_values)
    
    # Columns that should not have missing values
    critical_columns = ["NCT Number", "Study Status"]
    
    # Check if critical columns have missing values
    missing_in_critical = dataframe[critical_columns].isnull().sum()
    if missing_in_critical.any():
        missing_cols = missing_in_critical[missing_in_critical > 0].index.tolist()
        raise ValueError(f"Missing values found in critical columns {missing_cols} of {dataframe_name}.")
    else:
        print(f"No missing values found in critical columns {critical_columns} of {dataframe_name}.")
    
    # Fill missing values in other columns with "Unknown"
    non_critical_columns = [col for col in dataframe.columns if col not in critical_columns]
    dataframe[non_critical_columns] = dataframe[non_critical_columns].fillna("Unknown")
    print(f"Filled missing values in non-critical columns with 'Unknown' in {dataframe_name}.")

# Apply the verification and handling to both df and test_df
try:
    verify_and_handle_missing_data(df, "Training DataFrame (df)")
    verify_and_handle_missing_data(test_df, "Test DataFrame (test_df)")
except ValueError as e:
    print(e)
    # Depending on your use case, you might want to handle the error differently
    # For example, you could remove rows with missing critical data:
    # df.dropna(subset=critical_columns, inplace=True)
    # Or take other appropriate actions

# Optional: Verify that there are no missing values left
print("\nFinal Missing Values in Training DataFrame (df):")
print(df.isnull().sum())

print("\nFinal Missing Values in Test DataFrame (test_df):")
print(test_df.isnull().sum())

Updated Training DataFrame (df):
                                       Brief Summary  \
0  The hypothesis of this study is use of CytoSor...   
1  The primary purpose of this study is to find o...   
2  To study the impact of 3 day exposure to atorv...   
3  The purpose of this study is to identify the f...   
4  The proposed protocol is a double-blind, place...   

                                         Study Title  \
0  Efficacy Study of CytoSorb Hemoperfusion Devic...   
1  Safety and Tolerability Study of AZD7762 in Co...   
2  Does Atorvastatin Reduce Ischemia-Reperfusion ...   
3  Comparison of Dynamic Radiographs in Determini...   
4  A Placebo-Controlled Study of Mixed Amphetamin...   

                                          Conditions  \
0  Acute Respiratory Distress Syndrome|Acute Lung...   
1    Cancer|Solid Tumors|Advanced Solid Malignancies   
2  Ischemia Reperfusion Injury|Cardiovascular Dis...   
3                    Adolescent Idiopathic Scoliosis   
4            

In [38]:
print("DataFrame head:")
print(df.head())

# Optional: Check how many rows we have
print(f"Total samples in dataset: {len(df)}")

DataFrame head:
                                       Brief Summary  \
0  The hypothesis of this study is use of CytoSor...   
1  The primary purpose of this study is to find o...   
2  To study the impact of 3 day exposure to atorv...   
3  The purpose of this study is to identify the f...   
4  The proposed protocol is a double-blind, place...   

                                         Study Title  \
0  Efficacy Study of CytoSorb Hemoperfusion Devic...   
1  Safety and Tolerability Study of AZD7762 in Co...   
2  Does Atorvastatin Reduce Ischemia-Reperfusion ...   
3  Comparison of Dynamic Radiographs in Determini...   
4  A Placebo-Controlled Study of Mixed Amphetamin...   

                                          Conditions  \
0  Acute Respiratory Distress Syndrome|Acute Lung...   
1    Cancer|Solid Tumors|Advanced Solid Malignancies   
2  Ischemia Reperfusion Injury|Cardiovascular Dis...   
3                    Adolescent Idiopathic Scoliosis   
4                             

In [39]:
df["Study Status"] = df["Study Status"].apply(
    lambda x: 1 if x == "COMPLETED" else 0
)

test_df["Study Status"] = test_df["Study Status"].apply(
    lambda x: 1 if x == "COMPLETED" else 0
)
test_df

Unnamed: 0,Brief Summary,Study Title,Conditions,Outcomes,NCT Number,Study Status
0,This study is a post-market clinical follow-up...,Patient Outcomes Using an Expandable Spacer,Degenerative Disc Disease,"Change in Radiographic Analysis, Global and Se...",NCT03162666,0
1,The aim of the study is to evaluate the effica...,the Effect of Isosorbide Mononitrate in Reduci...,IUD Insertion Pain,"pain during IUD insertion, intensity of patien...",NCT04312048,1
2,This phase I trial studies how well durvalumab...,Durvalumab With or Without Tremelimumab in Tre...,Stage II Oropharyngeal Squamous Cell Carcinoma...,"Change of CD8+ tumor infiltrating lymphocytes,...",NCT03144778,1
3,The incorporation of novel targeted therapies ...,Radiation and Cetuximab Plus Intratumoral EGFR...,Squamous Cell Carcinoma|Head and Neck Cancer,"Toxicity Rate, This is a 2-stage clinical tria...",NCT01592721,1
4,"The use of nonsurgical periodontal treatment, ...",Laser Biostimulation in Periodontal Treatment,Periodontal Inflammation|Periodontal Diseases,"IL-1β level in GCF, IL-1β is a cytokine presen...",NCT04253613,1
...,...,...,...,...,...,...
495,OBJECTIVES: The aim of the study is to evaluat...,Radiofrequency Female External Genital Region:...,Dissatisfaction Appearance of the External Gen...,"clinical response, To evaluate the clinical re...",NCT02611791,1
496,The proposed study evaluates the impact and im...,Journey of Life Psychosocial Support Program,Psychological|Child Development|Social Values,"Mental health, Changes in mental health sympto...",NCT04817098,1
497,This study is to find out what role a local in...,Human Intestinal Amino Acid Absorption and the...,Renin-Angiotensin Aldosterone System (RAS),messenger ribonucleic acid (mRNA) quantificati...,NCT04524494,1
498,A phase II single arm study of carboplatin and...,A Phase II Study of Docetaxel and Carboplatin ...,Ovarian Epithelial Cancer Recurrent,"Safety, Safety will be established by grading ...",NCT02026921,1


In [40]:
# Separate features (X) and target variable (y) for the training set
X_train = df.drop('Study Status', axis=1)
y_train = df['Study Status']

# Separate features (X) and target variable (y) for the testing set
X_test = test_df.drop('Study Status', axis=1)
y_test = test_df['Study Status']

# Print the shapes of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (500, 5) (500,)
Testing set shape: (500, 5) (500,)


In [41]:
X_train.head()

Unnamed: 0,Brief Summary,Study Title,Conditions,Outcomes,NCT Number
0,The hypothesis of this study is use of CytoSor...,Efficacy Study of CytoSorb Hemoperfusion Devic...,Acute Respiratory Distress Syndrome|Acute Lung...,Relative IL-6 levels as a percent (%) of basel...,NCT00559130
1,The primary purpose of this study is to find o...,Safety and Tolerability Study of AZD7762 in Co...,Cancer|Solid Tumors|Advanced Solid Malignancies,Assessment of adverse events (based on CTCAE v...,NCT00937664
2,To study the impact of 3 day exposure to atorv...,Does Atorvastatin Reduce Ischemia-Reperfusion ...,Ischemia Reperfusion Injury|Cardiovascular Dis...,Annexin A 5 targeting in the non dominant then...,NCT00441597
3,The purpose of this study is to identify the f...,Comparison of Dynamic Radiographs in Determini...,Adolescent Idiopathic Scoliosis,Investigate the flexibility equivalence of dif...,NCT03296228
4,"The proposed protocol is a double-blind, place...",A Placebo-Controlled Study of Mixed Amphetamin...,Cocaine Dependence,Three Weeks of Continuous Cocaine Abstinence a...,NCT00421603


In [42]:
X_train.shape

(500, 5)

In [43]:
y_train.head()

0    1
1    0
2    1
3    1
4    1
Name: Study Status, dtype: int64

In [44]:
y_train.shape

(500,)

In [45]:
print("X_train Shape:", X_train.shape)  # Expected: (206061, 2)
print("y_train Shape:", y_train.shape)  # Expected: (206061, 1) or (206061,)

print("\nX_train Index Sample:", X_train.index[:5].tolist())
print("\ny_train Index Sample:", y_train.index[:5].tolist())

print("\nX_train Type:", type(X_train))
print("y_train Type:", type(y_train))


X_train Shape: (500, 5)
y_train Shape: (500,)

X_train Index Sample: [0, 1, 2, 3, 4]

y_train Index Sample: [0, 1, 2, 3, 4]

X_train Type: <class 'pandas.core.frame.DataFrame'>
y_train Type: <class 'pandas.core.series.Series'>


In [46]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Convert y_train to DataFrame if it's a Series or NumPy array
if isinstance(y_train, pd.Series):
    y_train = y_train.to_frame()
elif isinstance(y_train, np.ndarray):
    y_train = pd.DataFrame(y_train, columns=["Label"])

# Explicitly rename column
y_train.columns = ["Label"]

# Concatenate X_train and y_train along columns
train_data = pd.concat([X_train, y_train], axis=1)

# Print first rows to confirm merge
print(train_data.head())


                                       Brief Summary  \
0  The hypothesis of this study is use of CytoSor...   
1  The primary purpose of this study is to find o...   
2  To study the impact of 3 day exposure to atorv...   
3  The purpose of this study is to identify the f...   
4  The proposed protocol is a double-blind, place...   

                                         Study Title  \
0  Efficacy Study of CytoSorb Hemoperfusion Devic...   
1  Safety and Tolerability Study of AZD7762 in Co...   
2  Does Atorvastatin Reduce Ischemia-Reperfusion ...   
3  Comparison of Dynamic Radiographs in Determini...   
4  A Placebo-Controlled Study of Mixed Amphetamin...   

                                          Conditions  \
0  Acute Respiratory Distress Syndrome|Acute Lung...   
1    Cancer|Solid Tumors|Advanced Solid Malignancies   
2  Ischemia Reperfusion Injury|Cardiovascular Dis...   
3                    Adolescent Idiopathic Scoliosis   
4                                 Cocaine Depe

In [47]:
# Count occurrences of each label
label_counts = train_data["Label"].value_counts()

# Print results
print(label_counts)


Label
1    431
0     69
Name: count, dtype: int64


In [48]:
majority_class = train_data[train_data["Label"] == 1]  # 176,886 rows
minority_class = train_data[train_data["Label"] == 0]  # 29,175 rows

# Calculate the desired oversampling ratio
oversampling_ratio = 1.2

# Calculate the target size for the minority class
target_minority_size = int(len(majority_class) * oversampling_ratio)

# Calculate how many times to duplicate the minority class
duplication_factor = target_minority_size // len(minority_class)  # Integer factor
remainder = target_minority_size % len(minority_class)  # Extra rows needed

# Duplicate the minority class
balanced_minority_class = pd.concat([minority_class] * duplication_factor, ignore_index=True)

# Add extra samples to match the target size
extra_samples = minority_class.sample(n=remainder, replace=True, random_state=42)  # Resample to match exact count

# Merge the newly balanced data
balanced_train_data = pd.concat([majority_class, balanced_minority_class, extra_samples], ignore_index=True)

# Shuffle the dataset
balanced_train_data = balanced_train_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Print new class counts
print("New Class Distribution:\n", balanced_train_data["Label"].value_counts())

New Class Distribution:
 Label
0    517
1    431
Name: count, dtype: int64


In [49]:
balanced_train_data.head()

Unnamed: 0,Brief Summary,Study Title,Conditions,Outcomes,NCT Number,Label
0,This study is a Post Market Clinical Follow up...,Post Market Clinical Follow-Up of the Zimmer S...,"Osteoarthritis, Hip|Fracture of Hip|Avascular ...","Implant Survival, Represents the implants that...",NCT04079114,0
1,"Verruca vulgaris, otherwise known as the commo...",The Purpose of This Study is to Determine Whet...,Warts,Safety [SEP] Warts [SEP] Resolution of Common ...,NCT00546611,0
2,"This study will be conducted as a randomized, ...",Trial Comparing Effects of Xyrem Taken Orally ...,Narcolepsy,Daytime Sleep Latency as Measured by the Maint...,NCT00066170,1
3,This is a research study on Altitude Illness. ...,Prevention of Altitude Illness With Non-steroi...,Altitude Sickness,"Acute Mountain Sickness, Lake Louise Criteria ...",NCT01171794,1
4,This is a study in people with an eye disease ...,A Study to Test Different Doses of BI 836880 i...,Wet Macular Degeneration,Single Rising Dose (SRD) part: Number of patie...,NCT03861234,1


In [50]:
# Separate features (X_train) and target labels (y_train)
X_train = balanced_train_data.drop(columns=["Label"])  # Drop the label column
y_train = balanced_train_data["Label"]  # Keep only the label column

# Display shapes to confirm correctness
print("X_train Shape:", X_train.shape)
print("y_train Shape:", y_train.shape)


X_train Shape: (948, 5)
y_train Shape: (948,)


In [51]:
X_train.head()

Unnamed: 0,Brief Summary,Study Title,Conditions,Outcomes,NCT Number
0,This study is a Post Market Clinical Follow up...,Post Market Clinical Follow-Up of the Zimmer S...,"Osteoarthritis, Hip|Fracture of Hip|Avascular ...","Implant Survival, Represents the implants that...",NCT04079114
1,"Verruca vulgaris, otherwise known as the commo...",The Purpose of This Study is to Determine Whet...,Warts,Safety [SEP] Warts [SEP] Resolution of Common ...,NCT00546611
2,"This study will be conducted as a randomized, ...",Trial Comparing Effects of Xyrem Taken Orally ...,Narcolepsy,Daytime Sleep Latency as Measured by the Maint...,NCT00066170
3,This is a research study on Altitude Illness. ...,Prevention of Altitude Illness With Non-steroi...,Altitude Sickness,"Acute Mountain Sickness, Lake Louise Criteria ...",NCT01171794
4,This is a study in people with an eye disease ...,A Study to Test Different Doses of BI 836880 i...,Wet Macular Degeneration,Single Rising Dose (SRD) part: Number of patie...,NCT03861234


In [52]:
y_train.head()

0    0
1    0
2    1
3    1
4    1
Name: Label, dtype: int64

In [53]:
# X_train.to_csv(folder_path + "X_train.csv", index=False)
# X_test.to_csv(folder_path + "X_test.csv", index=False)
# y_train.to_csv(folder_path + "y_train.csv", index=False)
# y_test.to_csv(folder_path + "y_test.csv", index=False)
# X_train

In [54]:

print("CUDA Available:", torch.cuda.is_available())
print("Current Device:", torch.cuda.current_device())
print("Device Name:", torch.cuda.get_device_name(0))
print("PyTorch using CUDA:", torch.backends.cudnn.enabled)

CUDA Available: False


AssertionError: Torch not compiled with CUDA enabled

In [55]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Ensure only GPU 0 is used


# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load Data
texts = X_train['Study Title'].tolist()
labels = y_train.tolist()

# Split Data into Training and Validation Sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Load BioBERT Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)

# Move model to GPU
model.to(device)

# Check model device
print("Model is on:", next(model.parameters()).device)

# Create Datasets
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length=56)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_length=56)

# Training Arguments (Ensure GPU usage)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # Reduce if OOM occurs
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=True,  # Enable mixed precision training
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-Tune Model
trainer.train()

# Save Fine-Tuned Model
model.save_pretrained(folder_path +"study_title_fine_tuned_clinicalbiobert_v3")
tokenizer.save_pretrained(folder_path + "study_title_fine_tuned_clinicalbiobert_v3")

print(f"Fine-tuned BioBERT model saved at {folder_path +"study_title_fine_tuned_clinicalbiobert_v3"}")
print("Model is on:", next(model.parameters()).device)

Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cpu


 49%|████▊     | 70/144 [03:35<03:47,  3.07s/it]
 21%|██        | 10/48 [00:16<00:59,  1.56s/it]

{'loss': 0.7001, 'grad_norm': 2.827866792678833, 'learning_rate': 1.5833333333333333e-05, 'epoch': 0.21}


 42%|████▏     | 20/48 [00:31<00:42,  1.52s/it]

{'loss': 0.6763, 'grad_norm': 3.411410093307495, 'learning_rate': 1.1666666666666668e-05, 'epoch': 0.42}


 62%|██████▎   | 30/48 [00:46<00:27,  1.51s/it]

{'loss': 0.6697, 'grad_norm': 2.7578377723693848, 'learning_rate': 7.500000000000001e-06, 'epoch': 0.62}


 83%|████████▎ | 40/48 [01:01<00:12,  1.52s/it]

{'loss': 0.6281, 'grad_norm': 3.3203423023223877, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.83}


                                               
100%|██████████| 48/48 [01:19<00:00,  1.29s/it]

{'eval_loss': 0.6134178638458252, 'eval_runtime': 5.1077, 'eval_samples_per_second': 37.199, 'eval_steps_per_second': 2.349, 'epoch': 1.0}


100%|██████████| 48/48 [01:21<00:00,  1.69s/it]


{'train_runtime': 81.2985, 'train_samples_per_second': 9.324, 'train_steps_per_second': 0.59, 'train_loss': 0.6605633397897085, 'epoch': 1.0}
Fine-tuned BioBERT model saved at './fine_tuned_biobert_v3'
Model is on: cpu


In [None]:
# Load the fine-tuned model and tokenizer
model_path = folder_path + "study_title_fine_tuned_clinicalbiobert_v3"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

# Tokenize texts
def encode_texts(texts, tokenizer, max_length=56):
    encodings = tokenizer(
        texts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    return encodings

# Inference and evaluation function
def evaluate_model(dataframe, tokenizer, model, device, output_csv_name, text_column, max_length=56, batch_size=8):
    texts = dataframe[text_column].tolist()
    nct_numbers = dataframe["NCT Number"].tolist()
    labels = dataframe["Study Status"].tolist()

    # Tokenize data
    encodings = encode_texts(texts, tokenizer, max_length)
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    # # Free unused memory
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.reset_accumulated_memory_stats()

    # Create DataLoader
    dataset = TensorDataset(input_ids, attention_mask)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    predictions = []
    prediction_probs = []

    # Run inference
    with torch.no_grad():
        for batch in data_loader:
            batch_input_ids, batch_attention_mask = batch
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=1)  # Get prediction probabilities

            batch_predictions = torch.argmax(logits, dim=1).cpu().numpy()
            batch_probs = probs.cpu().numpy()

            predictions.extend(batch_predictions)
            prediction_probs.extend(batch_probs)

    # Convert true labels to NumPy
    true_labels = torch.tensor(labels).numpy()

    # Compute Confusion Matrix
    conf_matrix = confusion_matrix(true_labels, predictions)
    print(f"\nConfusion Matrix for {output_csv_name}:")
    print(conf_matrix)

    # Compute Classification Report
    class_report = classification_report(true_labels, predictions, target_names=["Class 0", "Class 1"])
    print(f"\nClassification Report for {output_csv_name}:")
    print(class_report)

    # Save predictions and probabilities to a DataFrame
    pred_df = pd.DataFrame({
        "NCT Number": nct_numbers,
        "True Label": true_labels,
        "Predicted Label": predictions,
        "Probability Class 0": [prob[0] for prob in prediction_probs],
        "Probability Class 1": [prob[1] for prob in prediction_probs]
    })

    # Save to CSV
    pred_df.to_csv(output_csv_name, index=False)
    print(f"Predictions saved to {output_csv_name}")

# Evaluate on training data (df)
evaluate_model(
    dataframe=df,
    tokenizer=tokenizer,
    model=model,
    device=device,
    output_csv_name= folder_path + "study_title_clinicalbiobert_predictions_train.csv",
    text_column="Study Title",
    max_length=128  # Set max_length to 128 for training data
)

# Evaluate on testing data (test_df)
evaluate_model(
    dataframe=test_df,
    tokenizer=tokenizer,
    model=model,
    device=device,
    output_csv_name= folder_path + "study_title_clinicalbiobert_predictions_test.csv",
    text_column="Study Title",
    max_length=128  # Set max_length to 128 for testing data
)


Confusion Matrix for E:\Case Comp\NEST\Training\study_title_clinicalbiobert_predictions_train.csv:
[[ 67   2]
 [244 187]]

Classification Report for E:\Case Comp\NEST\Training\study_title_clinicalbiobert_predictions_train.csv:
              precision    recall  f1-score   support

     Class 0       0.22      0.97      0.35        69
     Class 1       0.99      0.43      0.60       431

    accuracy                           0.51       500
   macro avg       0.60      0.70      0.48       500
weighted avg       0.88      0.51      0.57       500

Predictions saved to E:\Case Comp\NEST\Training\study_title_clinicalbiobert_predictions_train.csv

Confusion Matrix for E:\Case Comp\NEST\Training\study_title_clinicalbiobert_predictions_test.csv:
[[ 39  16]
 [307 138]]

Classification Report for E:\Case Comp\NEST\Training\study_title_clinicalbiobert_predictions_test.csv:
              precision    recall  f1-score   support

     Class 0       0.11      0.71      0.19        55
     Class 1

In [64]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Ensure only GPU 0 is used


# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load Data
texts = X_train['Brief Summary'].tolist()
labels = y_train.tolist()

# Split Data into Training and Validation Sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Load BioBERT Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)

# Move model to GPU
model.to(device)

# Check model device
print("Model is on:", next(model.parameters()).device)

# Create Datasets
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_length=128)

# Training Arguments (Ensure GPU usage)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # Reduce if OOM occurs
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=True,  # Enable mixed precision training
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-Tune Model
trainer.train()

# Save Fine-Tuned Model
model.save_pretrained(folder_path + "brief_summary_fine_tuned_clinicalbiobert_v3")
tokenizer.save_pretrained(folder_path + "brief_summary_fine_tuned_clinicalbiobert_v3")

print("Fine-tuned BioBERT model saved at './fine_tuned_biobert_v3'")
print("Model is on:", next(model.parameters()).device)

Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cpu


 21%|██        | 10/48 [00:34<02:08,  3.37s/it]

{'loss': 0.6914, 'grad_norm': 2.0106403827667236, 'learning_rate': 1.5833333333333333e-05, 'epoch': 0.21}


 42%|████▏     | 20/48 [01:06<01:31,  3.26s/it]

{'loss': 0.7029, 'grad_norm': 2.805807113647461, 'learning_rate': 1.1666666666666668e-05, 'epoch': 0.42}


 62%|██████▎   | 30/48 [01:41<01:00,  3.34s/it]

{'loss': 0.6872, 'grad_norm': 2.1524529457092285, 'learning_rate': 7.500000000000001e-06, 'epoch': 0.62}


 83%|████████▎ | 40/48 [02:13<00:26,  3.30s/it]

{'loss': 0.6619, 'grad_norm': 2.487837553024292, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.83}


100%|██████████| 48/48 [02:38<00:00,  2.70s/it]
100%|██████████| 48/48 [02:51<00:00,  2.70s/it]

{'eval_loss': 0.6464076042175293, 'eval_runtime': 11.8255, 'eval_samples_per_second': 16.067, 'eval_steps_per_second': 1.015, 'epoch': 1.0}


100%|██████████| 48/48 [02:52<00:00,  3.60s/it]


{'train_runtime': 172.7629, 'train_samples_per_second': 4.388, 'train_steps_per_second': 0.278, 'train_loss': 0.6787485182285309, 'epoch': 1.0}
Fine-tuned BioBERT model saved at './fine_tuned_biobert_v3'
Model is on: cpu


In [32]:
# Evaluate on testing data (test_df) using "Brief Summary" column
evaluate_model(
    dataframe=test_df,
    tokenizer=tokenizer,
    model=model,
    device=device,
    output_csv_name=folder_path + "brief_summary_clinicalbiobert_predictions_test.csv",
    text_column="Brief Summary",
    max_length=128  # Set max_length to 128
)

# Evaluate on training data (df) using "Brief Summary" column
evaluate_model(
    dataframe=df,
    tokenizer=tokenizer,
    model=model,
    device=device,
    output_csv_name=folder_path + "brief_summary_clinicalbiobert_predictions_train.csv",
    text_column="Brief Summary",
    max_length=128  # Set max_length to 128
)


Confusion Matrix for brief_summary_clinicalbiobert_predictions_test.csv:
[[ 4047  5161]
 [10419 44768]]

Classification Report for brief_summary_clinicalbiobert_predictions_test.csv:
              precision    recall  f1-score   support

     Class 0       0.28      0.44      0.34      9208
     Class 1       0.90      0.81      0.85     55187

    accuracy                           0.76     64395
   macro avg       0.59      0.63      0.60     64395
weighted avg       0.81      0.76      0.78     64395

Predictions saved to brief_summary_clinicalbiobert_predictions_test.csv

Confusion Matrix for brief_summary_clinicalbiobert_predictions_train.csv:
[[ 36142    192]
 [ 21837 199406]]

Classification Report for brief_summary_clinicalbiobert_predictions_train.csv:
              precision    recall  f1-score   support

     Class 0       0.62      0.99      0.77     36334
     Class 1       1.00      0.90      0.95    221243

    accuracy                           0.91    257577
   macro 

In [33]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Ensure only GPU 0 is used


# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load Data
texts = X_train['Conditions'].tolist()
labels = y_train.tolist()

# Split Data into Training and Validation Sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Load BioBERT Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)

# Move model to GPU
model.to(device)

# Check model device
print("Model is on:", next(model.parameters()).device)

# Create Datasets
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length=4)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_length=4)

# Training Arguments (Ensure GPU usage)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # Reduce if OOM occurs
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=True,  # Enable mixed precision training
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-Tune Model
trainer.train()

# Save Fine-Tuned Model
model.save_pretrained(folder_path + "Conditions_fine_tuned_clinicalbiobert_v3")
tokenizer.save_pretrained(folder_path + "Conditions_fine_tuned_clinicalbiobert_v3")

print("Fine-tuned BioBERT model saved at './fine_tuned_biobert_v3'")
print("Model is on:", next(model.parameters()).device)

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0




Epoch,Training Loss,Validation Loss
1,0.6857,0.648017
2,0.7186,0.643998
3,0.6525,0.636457


Fine-tuned BioBERT model saved at './fine_tuned_biobert_v3'
Model is on: cuda:0


In [35]:
# Evaluate on testing data (test_df) using "Conditions" column
evaluate_model(
    dataframe=test_df,
    tokenizer=tokenizer,
    model=model,
    device=device,
    output_csv_name=folder_path + "conditions_clinicalbiobert_predictions_test.csv",
    text_column="Conditions",
    max_length=4  # Specific max_length for "Conditions"
)

# Evaluate on training data (df) using "Conditions" column
evaluate_model(
    dataframe=df,
    tokenizer=tokenizer,
    model=model,
    device=device,
    output_csv_name=folder_path + "conditions_clinicalbiobert_predictions_train.csv",
    text_column="Conditions",
    max_length=4  # Specific max_length for "Conditions"
)


Confusion Matrix for conditions_clinicalbiobert_predictions_test.csv:
[[ 6627  2581]
 [29984 25203]]

Classification Report for conditions_clinicalbiobert_predictions_test.csv:
              precision    recall  f1-score   support

     Class 0       0.18      0.72      0.29      9208
     Class 1       0.91      0.46      0.61     55187

    accuracy                           0.49     64395
   macro avg       0.54      0.59      0.45     64395
weighted avg       0.80      0.49      0.56     64395

Predictions saved to conditions_clinicalbiobert_predictions_test.csv

Confusion Matrix for conditions_clinicalbiobert_predictions_train.csv:
[[ 28641   7693]
 [118027 103216]]

Classification Report for conditions_clinicalbiobert_predictions_train.csv:
              precision    recall  f1-score   support

     Class 0       0.20      0.79      0.31     36334
     Class 1       0.93      0.47      0.62    221243

    accuracy                           0.51    257577
   macro avg       0.56 

In [36]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Ensure only GPU 0 is used


# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load Data
texts = X_train['Outcomes'].tolist()
labels = y_train.tolist()

# Split Data into Training and Validation Sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Load BioBERT Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)

# Move model to GPU
model.to(device)

# Check model device
print("Model is on:", next(model.parameters()).device)

# Create Datasets
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_length=128)

# Training Arguments (Ensure GPU usage)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # Reduce if OOM occurs
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=True,  # Enable mixed precision training
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-Tune Model
trainer.train()

# Save Fine-Tuned Model
model.save_pretrained(folder_path + "Outcomes_fine_tuned_clinicalbiobert_v3")
tokenizer.save_pretrained(folder_path + "Outcomes_fine_tuned_clinicalbiobert_v3")

print("Fine-tuned BioBERT model saved at './fine_tuned_biobert_v3'")
print("Model is on:", next(model.parameters()).device)

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0




Epoch,Training Loss,Validation Loss
1,0.495,0.399919
2,0.2765,0.273772
3,0.2271,0.35227


Fine-tuned BioBERT model saved at './fine_tuned_biobert_v3'
Model is on: cuda:0


In [38]:
# Evaluate on testing data (test_df) using "Outcomes" column
evaluate_model(
    dataframe=test_df,
    tokenizer=tokenizer,
    model=model,
    device=device,
    output_csv_name=folder_path + "outcomes_clinicalbiobert_predictions_test.csv",
    text_column="Outcomes",
    max_length=128  # Specific max_length for "Outcomes"
)

# Evaluate on training data (df) using "Outcomes" column
evaluate_model(
    dataframe=df,
    tokenizer=tokenizer,
    model=model,
    device=device,
    output_csv_name=folder_path + "outcomes_clinicalbiobert_predictions_train.csv",
    text_column="Outcomes",
    max_length=128  # Specific max_length for "Outcomes"
)


Confusion Matrix for outcomes_clinicalbiobert_predictions_test.csv:
[[ 4002  5206]
 [11148 44039]]

Classification Report for outcomes_clinicalbiobert_predictions_test.csv:
              precision    recall  f1-score   support

     Class 0       0.26      0.43      0.33      9208
     Class 1       0.89      0.80      0.84     55187

    accuracy                           0.75     64395
   macro avg       0.58      0.62      0.59     64395
weighted avg       0.80      0.75      0.77     64395

Predictions saved to outcomes_clinicalbiobert_predictions_test.csv

Confusion Matrix for outcomes_clinicalbiobert_predictions_train.csv:
[[ 36012    322]
 [ 24289 196954]]

Classification Report for outcomes_clinicalbiobert_predictions_train.csv:
              precision    recall  f1-score   support

     Class 0       0.60      0.99      0.75     36334
     Class 1       1.00      0.89      0.94    221243

    accuracy                           0.90    257577
   macro avg       0.80      0.94 