In [1]:
!pip install transformers torch pandas numpy



In [2]:
import pandas as pd

# Load the file you just uploaded
df = pd.read_csv('clinical_trials_clean.csv')

# Let's create a "Text Blob" for the AI to read
# We will combine the Condition and the Title
df['text_feature'] = "Condition: " + df['condition_name'] + ". Title: " + df['sponsor_name']  + " - " + df['phase']

print(f"Loaded {len(df)} rows.")
print("\n--- Example of what the AI will read ---")
print(df['text_feature'].iloc[0])

Loaded 686 rows.

--- Example of what the AI will read ---
Condition: Healthy Volunteers. Title: Genentech, Inc. - PHASE1


In [3]:
import torch
from transformers import AutoTokenizer, AutoModel

# 1. Define the model name (from Hugging Face Hub)
model_name = "michiyasunaga/BioLinkBERT-base"

print(f"Downloading {model_name}... (This takes about 30 seconds)")

# 2. Load the Tokenizer (The translator from Words -> Numbers)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 3. Load the Model (The Brain)
model = AutoModel.from_pretrained(model_name)

# 4. Move the model to the GPU (The Turbo Button)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

print(f"Model loaded on: {device.upper()}")

Downloading michiyasunaga/BioLinkBERT-base... (This takes about 30 seconds)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Model loaded on: CUDA


In [4]:
import numpy as np
from tqdm import tqdm # This makes a cool progress bar

def get_bert_embeddings(text_list, batch_size=32):
    """
    Runs text through BioLinkBERT in batches and returns the vectors.
    """
    all_embeddings = []

    # Loop through the data in small chunks (batches) so we don't run out of RAM
    for i in tqdm(range(0, len(text_list), batch_size)):
        batch_texts = text_list[i : i + batch_size]

        # 1. Tokenize (Turn text into code numbers)
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

        # Move inputs to GPU
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # 2. Feed to Model (No gradients needed, we are just reading)
        with torch.no_grad():
            outputs = model(**inputs)

        # 3. Grab the [CLS] token (The first token, which summarizes the sentence)
        # shape: [batch_size, 768]
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        all_embeddings.append(embeddings)

    # Stack all batches together
    return np.vstack(all_embeddings)

print("Embedding function created.")

Embedding function created.


In [6]:
# 1. Get the list of text strings we made earlier
# Filter out NaN values before converting to list
texts = df['text_feature'].dropna().tolist()

print("Starting NLP extraction... watch the progress bar!")

# 2. Run the function
embeddings = get_bert_embeddings(texts)

print(f"\n Done! Extracted shape: {embeddings.shape}")
# Expected output: (686, 768) -> 686 trials, 768 dimensions each

Starting NLP extraction... watch the progress bar!


100%|██████████| 11/11 [00:01<00:00, 10.03it/s]


 Done! Extracted shape: (345, 768)





In [8]:
# 1. Turn the numpy array (math matrix) into a DataFrame
# We label columns "bert_0" to "bert_767"
embedding_cols = [f'bert_{i}' for i in range(embeddings.shape[1])]
df_embeddings = pd.DataFrame(embeddings, columns=embedding_cols)

# 2. Glue the original IDs and Targets back on
# We reset index to make sure they line up perfectly
df_final = pd.concat([df[['nct_id', 'target']].reset_index(drop=True), df_embeddings], axis=1)

print(f"Final Dataset Shape: {df_final.shape}")
# Should be (345, 770) -> ID + Target + 768 BERT features

# 3. Save to CSV
output_filename = 'clinical_trials_with_bert.csv'
df_final.to_csv(output_filename, index=False)

print(f" File saved inside Colab: {output_filename}")

Final Dataset Shape: (686, 770)
 File saved inside Colab: clinical_trials_with_bert.csv
