In [1]:
!pip install streamlit
!pip install datasets


Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.2-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m117.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m


In [2]:
import torch
import pandas as pd
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import json
import os
from datasets import Dataset
import numpy as np

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load manually uploaded ChemProt dataset
train_path = "/content/train.jsonl"
test_path = "/content/test.jsonl"

def load_jsonl_data(file_path):
    with open(file_path, "r") as f:
        return [json.loads(line) for line in f]

train_data = load_jsonl_data(train_path)
test_data = load_jsonl_data(test_path)
print("Dataset loaded successfully!")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("michiyasunaga/BioLinkBERT-base")

# Map labels to numeric values
unique_labels = list(set(d['label'] for d in train_data))
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for label, i in label_to_id.items()}

def preprocess_data(examples):
    inputs = tokenizer([ex['text'] for ex in examples], padding=True, truncation=True, max_length=512)
    labels = [label_to_id[ex['label']] for ex in examples]
    inputs['labels'] = labels
    return inputs

train_encodings = preprocess_data(train_data)
test_encodings = preprocess_data(test_data)

train_dataset = Dataset.from_dict(train_encodings)
test_dataset = Dataset.from_dict(test_encodings)

model = AutoModelForSequenceClassification.from_pretrained("michiyasunaga/BioLinkBERT-base", num_labels=len(unique_labels)).to(device)

# Training arguments
training_args = TrainingArguments(
    output_dir="/content/results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

# Evaluate model
def compute_metrics(pred):
    labels = np.array(pred.label_ids)  # Ensure labels are a NumPy array
    preds = np.argmax(pred.predictions, axis=1)  # Take the highest probability class
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Get predictions
eval_results = trainer.predict(test_dataset)
eval_metrics = compute_metrics(eval_results)
print("Evaluation Metrics:", eval_metrics)

# Save the trained model
model.save_pretrained("/content/chemprot_relation_model")
tokenizer.save_pretrained("/content/chemprot_relation_model")

# Streamlit Web UI
st.title("Biomedical Relation Extraction")
st.write("Enter biomedical text and extract relations between entities.")

# Input field
text_input = st.text_area("Enter biomedical text:", "")

# Entity Selection
types = ["Gene", "Protein", "Disease", "Drug"]
selected_entity = st.selectbox("Select entity type:", types)

# Extract Relations
if st.button("Extract Relations"):
    if text_input.strip():
        inputs = tokenizer(text_input, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_label = torch.argmax(logits, dim=1).item()
        relation = id_to_label[predicted_label]
        st.success(f"Extracted Relation: {relation}")
    else:
        st.error("Please enter some text.")


Using device: cuda
Dataset loaded successfully!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/447k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,0.8486,0.542008
2,0.371,0.570972
3,0.223,0.586251


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Metrics: {'accuracy': 0.8659556068031133, 'precision': 0.8525670469968832, 'recall': 0.8659556068031133, 'f1': 0.8579941091041737}


2025-03-14 15:59:45.023 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-03-14 15:59:45.031 Session state does not function when running a script without `streamlit run`
