In [1]:
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
def ct_dict2pd(study: dict, missing_val=None) -> pd.DataFrame:
    """Convert a CT study in JSON format to the format exemplified in the
    trial2vec demo data.
    See: https://pypi.org/project/Trial2Vec/

    Every module of the study is optional: absent modules never raise, they
    produce ``missing_val`` (scalar fields) or an empty string (joined lists).

    Parameters
    ----------
    study : dict
        as provided through the clinicaltrials.gov API

    missing_val: (default:None)
        How to encode missing values

    Returns
    -------
    pd.DataFrame
        A single-row frame (index ``0``) with the columns :
        - nct_id
        - description
        - title
        - intervention_name
        - disease
        - keyword
        - outcome_measure
        - criteria
        - reference
        - overall_status

        ``studyType`` is only used to drive the branches below; it is not
        emitted as a column.
    """
    ct_protocol = study.get("protocolSection", {})
    identification = ct_protocol.get("identificationModule", {})

    nct_id = identification.get("nctId", missing_val)

    description = ct_protocol.get("descriptionModule", {}).get(
        "briefSummary", missing_val
    )

    study_type = ct_protocol.get("designModule", {}).get("studyType", missing_val)
    is_observational = study_type == "OBSERVATIONAL"

    # Prefer the official title, fall back on the brief one.
    title = identification.get(
        "officialTitle", identification.get("briefTitle", missing_val)
    )

    # Intervention name
    if is_observational:
        intervention_name = study_type
    else:
        interventions = ct_protocol.get("armsInterventionsModule", {}).get(
            "interventions", []
        )
        # De-duplicate, then sort so the joined string is reproducible
        # between runs (plain set iteration order is not).
        intervention_name = ", ".join(
            sorted({i.get("name", "").split(":")[-1] for i in interventions})
        )

    conditions_module = ct_protocol.get("conditionsModule", {})
    disease = ", ".join(sorted(conditions_module.get("conditions", [])))

    # Keywords are deliberately not reported for observational studies.
    keyword = (
        missing_val
        if is_observational
        else ", ".join(sorted(conditions_module.get("keywords", [])))
    )

    # Outcome measurement
    if is_observational:
        design_info = ct_protocol.get("designModule", {}).get("designInfo")
        if design_info is None:
            outcome_measure = study_type
        else:
            outcome_measure = design_info.get("observationalModel", study_type)
            if "timePerspective" in design_info:
                outcome_measure += "-" + design_info["timePerspective"]
    else:
        primary_outcomes = ct_protocol.get("outcomesModule", {}).get(
            "primaryOutcomes", []
        )
        outcome_measure = ", ".join(
            sorted({o.get("measure", "") for o in primary_outcomes})
        )

    # Selection criteria: use the free-text criteria when available, otherwise
    # summarise whatever the eligibility module contains as "key: value" pairs.
    eligibility = ct_protocol.get("eligibilityModule", {})
    raw_criteria = eligibility.get("eligibilityCriteria")
    if isinstance(raw_criteria, str):
        criteria = (
            raw_criteria.replace("\n* ", "~").replace("\n", "~").replace("~~", "~")
        )
    elif eligibility:
        criteria = ", ".join(f"{k}: {v}" for k, v in eligibility.items())
    else:
        criteria = missing_val

    # References: keep the second period-separated segment of each citation.
    # Citations without a period have no such segment and are skipped.
    references = ct_protocol.get("referencesModule", {}).get("references", [])
    reference_parts = []
    for ref in references:
        segments = (ref.get("citation") or "").split(".")
        if len(segments) > 1:
            reference_parts.append(segments[1].lstrip(" "))
    reference = ", ".join(reference_parts)

    overall_status = (
        ct_protocol.get("statusModule", {}).get("overallStatus", "").lower()
    )

    return (
        pd.Series(
            {
                "nct_id": nct_id,
                "description": description,
                "title": title,
                "intervention_name": intervention_name,
                "disease": disease,
                "keyword": keyword,
                "outcome_measure": outcome_measure,
                "criteria": criteria,
                "reference": reference,
                "overall_status": overall_status,
            }
        )
        .to_frame()
        .transpose()
    )

In [None]:
def ct_dict2pd_old(study: dict) -> pd.Series:
    """Legacy, strict conversion of a CT study dict to a trial2vec-style record.

    Unlike ``ct_dict2pd`` this version indexes the study directly, so a study
    lacking a mandatory entry raises ``KeyError``.

    Parameters
    ----------
    study : dict
        Study document; ``protocolSection`` and, inside it, the
        identification, description, design, conditions and status entries
        read below must be present. ``armsInterventionsModule`` is required
        for every study that is not ``OBSERVATIONAL``.

    Returns
    -------
    pd.Series
        Entries: nct_id, description, title, intervention_name, disease,
        keyword, outcome_measure, criteria, reference, overall_status.
        Optional information that is absent is encoded as ``None``.

    Raises
    ------
    KeyError
        If one of the mandatory entries is missing.
    """
    missing_val = None
    # Errors that mean "this optional piece is absent or has an unexpected
    # shape"; anything else (e.g. KeyboardInterrupt) must propagate.
    recoverable = (KeyError, AttributeError, TypeError)

    ct_protocol = study["protocolSection"]
    nct_id = ct_protocol["identificationModule"]["nctId"]
    description = ct_protocol["descriptionModule"]["briefSummary"]
    study_type = ct_protocol["designModule"]["studyType"]

    # Prefer the official title, fall back on the (mandatory) brief title.
    try:
        title = ct_protocol["identificationModule"]["officialTitle"]
    except KeyError:
        title = ct_protocol["identificationModule"]["briefTitle"]

    if study_type == "OBSERVATIONAL":
        intervention_name = study_type
    else:
        interventions = ct_protocol["armsInterventionsModule"]["interventions"]
        # Sorted so the joined string does not depend on set iteration order.
        intervention_name = ", ".join(
            sorted({i["name"].split(":")[-1] for i in interventions})
        )

    disease = ", ".join(sorted(ct_protocol["conditionsModule"]["conditions"]))

    try:
        keywords = ct_protocol["conditionsModule"]["keywords"]
        keyword = ", ".join(sorted(keywords))
    except KeyError:
        keyword = missing_val

    if study_type == "OBSERVATIONAL":
        try:
            design_info = ct_protocol["designModule"]["designInfo"]
            outcome_measure = design_info["observationalModel"]
            try:
                outcome_measure += "-" + design_info["timePerspective"]
            except KeyError:
                pass  # the time perspective is optional
        except KeyError:
            outcome_measure = study_type
    else:
        try:
            primary_outcomes = ct_protocol["outcomesModule"]["primaryOutcomes"]
            outcome_measure = ", ".join(
                sorted({i["measure"] for i in primary_outcomes})
            )
        except KeyError:
            outcome_measure = missing_val

    # Free-text criteria when present; otherwise a "key: value" summary of the
    # eligibility module; otherwise missing.
    try:
        criteria = ct_protocol["eligibilityModule"]["eligibilityCriteria"]
        criteria = criteria.replace("\n* ", "~").replace("\n", "~").replace("~~", "~")
    except recoverable:
        try:
            eligibility = ct_protocol["eligibilityModule"]
            criteria = ", ".join(
                [": ".join([k, str(v)]) for k, v in eligibility.items()]
            )
        except recoverable:
            criteria = missing_val

    try:
        references = ct_protocol["referencesModule"]["references"]

        reference_parts = []
        for r in references:
            try:
                # Second period-separated segment of the citation.
                reference_parts.append(r["citation"].split(".")[1].lstrip(" "))
            except IndexError:
                pass  # citation without a period: nothing to extract
        reference = ", ".join(reference_parts)
    except KeyError:
        reference = missing_val

    overall_status = ct_protocol["statusModule"]["overallStatus"]
    return pd.Series(
        {
            "nct_id": nct_id,
            "description": description,
            "title": title,
            "intervention_name": intervention_name,
            "disease": disease,
            "keyword": keyword,
            "outcome_measure": outcome_measure,
            "criteria": criteria,
            "reference": reference,
            "overall_status": overall_status.lower(),
        }
    )

In [None]:
from src.utils.utils import connect_to_mongoDB

import os
from dotenv import load_dotenv

# Read the MongoDB credentials from the local .env file instead of hardcoding them.
load_dotenv(".env")

MONGODB_USER = os.getenv("MONGODB_USER")
MONGODB_PWD = os.getenv("MONGODB_PWD")

client = connect_to_mongoDB(MONGODB_USER, MONGODB_PWD)
db = client["ctGov"]
collection = db["heart_failure"]
# Cursor over every study of the collection. The conversion loop in the next
# cell iterates `studies`, so it has to be defined here for the notebook to
# run top to bottom on a fresh kernel.
studies = collection.find({})

In [None]:
# Number of studies converted for this preview (the loop stops once reached).
MAX_STUDIES = 11

# Collect the one-row frames and concatenate once: growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration.
study_frames = []
for study in tqdm(studies):
    study_frames.append(ct_dict2pd(study))
    if len(study_frames) >= MAX_STUDIES:
        break

# Every one-row frame carries index 0; ignore_index gives the combined frame a
# unique 0..n-1 index instead of n duplicated zeros.
study_pd = (
    pd.concat(study_frames, ignore_index=True) if study_frames else pd.DataFrame()
)

In [None]:
# Preview the first converted studies to check the columns before encoding.
study_pd.head()

In [None]:
from trial2vec import Trial2Vec

# Instantiate the Trial2Vec model on the CPU.
model = Trial2Vec(device="cpu")
# Load the pretrained weights (presumably fetched or cached by the library — TODO confirm).
model.from_pretrained()

In [None]:
# The trial documents are passed to the model under the "x" key.

emb = model.encode({"x": study_pd})  # make inference
# emb.to_csv("./data/ct.trial2vec_embedding.csv")

# Alternatively, look up the pre-encoded trial documents by NCT id:
# emb2 = [model[nct_id] for nct_id in study_pd["nct_id"]]

In [None]:
# Tabulate the embeddings for inspection.
# NOTE(review): if `emb` is a dict keyed by NCT id, `columns=` selects those
# keys instead of naming the columns, and this frame comes out empty — confirm
# what `model.encode` returns.
pd.DataFrame(emb, columns=["nctId", "trial2vec"])

In [None]:
for disease in ["heart_failure", "asthma"]:
    # Load the embeddings first: if the file is missing or malformed the cell
    # stops here, before anything already stored in MongoDB has been reset.
    # The file holds one column per study.
    disease_emb = pd.read_csv(
        f"./data/ct.trial2vec_embedding.{disease}.csv", index_col=0
    )

    # Dedicated names keep `collection` and `emb` from the earlier cells intact.
    disease_collection = db[disease]
    # Set trial2vec to default null
    disease_collection.update_many({}, {"$set": {"trial2vec": []}})

    for study_id in disease_emb.columns:
        # tolist() yields plain Python numbers rather than numpy scalars.
        disease_collection.update_one(
            {"_id": study_id},
            {"$set": {"trial2vec": disease_emb[study_id].tolist()}},
        )

In [None]:
from biobert_embedding.embedding import BiobertEmbedding

text = "Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis."
# Default initialisation; pass `model_path=` to use a fine-tuned BERT model instead.
biobert = BiobertEmbedding()

# One vector per token, and a single vector for the whole sentence.
word_embeddings = biobert.word_vector(text)
sentence_embedding = biobert.sentence_vector(text)

print("Text Tokens: ", biobert.tokens)
# Text Tokens:  ['breast', 'cancers', 'with', 'her2', 'amplification', 'have', 'a', 'higher', 'risk', 'of', 'cns', 'metastasis', 'and', 'poorer', 'prognosis', '.']

n_tokens = len(word_embeddings)
embedding_dim = len(word_embeddings[0])
print("Shape of Word Embeddings: %d x %d" % (n_tokens, embedding_dim))
# Shape of Word Embeddings: 16 x 768

sentence_dim = len(sentence_embedding)
print("Shape of Sentence Embedding = ", sentence_dim)
# Shape of Sentence Embedding =  768