In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import random
import numpy as np
import pandas as pd

torch.cuda.empty_cache()

In [22]:
# Fixando sementes para reprodutibilidade
seed = 123
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Se necessário, forçar algoritmos determinísticos:
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

# quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)

model_name = "aaditya/OpenBioLLM-Llama3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
def get_ner_summary(text):
    prompt = f"Extrai entidades NER dos critério de forma estruturada:\n\n{text}\n\nOutput desejado em JSON. Deduplica os resultados se os mesmos estiverem escritos em duas línguas diferentes, Não repitas o prompt, retorna apenas o json final"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=200, temperature=0.2)
    result = tokenizer.decode(output[0], skip_special_tokens=True)
    return result

In [25]:
import json
import torch
import random
import numpy as np

# Fixar sementes para reprodutibilidade
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# such as "age", "ischemic_event", "onset_of_symptoms", "venous_access", "consent", "NIHSS", etc, always
def test_summary():
    prompt = (
        """
        You are an assistant specialized in extracting and summarizing inclusion criteria for clinical trials. From the following list of criteria, extract the main summarized criteria and return a JSON where a representative label associated to each criterion is corresponded to its summarized criterion (like 'representative label: criterion'). Use clear labels based in the corresponding criterion. If a criterion does not perfectly fit a known label, create an appropriate succinct label. Return only a complete and valid JSON, without any additional text or repetition of the prompt. if any criterion is in portuguese, translate it to english before extracting the 'label: criterion' pair.
        List of criteria:
        ["Age 18-80 years", "Have suffered an acute hemispheric ischemic stroke attributable to injury within the territory supplied by the Middle Cerebral Artery (MCA)", "Symptomatic arterial territory is recanalized at the time of randomization", "Onset of an acute ischemic stroke that can have full clinical, imagiological and bone marrow collection within 7 days after the onset of symptoms. Onset is defined as the time that the subject was last seen in a normal state, or bedtime for unwitnessed strokes occurring during sleep", "Have readily accessible peripheral venous access blood sampling", "Have the ability to understand the requirements of the study and be willing to provide written informed consent, as evidenced by signature on an informed consent document (which has been submitted and approved by the local Ethical Committee), and agree to perform the required assessments. In the event of incapacitated subjects, informed consent will be sought from a legally acceptable representative", "NIHSS of at least 6 at the time of study inclusion"]
        \n
        Answer:
        """
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(
        **inputs,
        max_new_tokens=20000,
        temperature=0.5,
        do_sample=True
    )
    result = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extrair o JSON: remover tudo até "Aswer:" e o marcador "END_JSON"
    marker = "Aswer:"
    start_index = result.find(marker)
    if start_index != -1:
        json_str = result[start_index + len(marker):].strip()
    else:
        json_str = result.strip()

    # Remover o marcador final "END_JSON", se presente
    end_marker = "END_JSON"
    if json_str.endswith(end_marker):
        json_str = json_str[:-len(end_marker)].strip()

    # Tentar interpretar o restante como JSON
    try:
        json_obj = json.loads(json_str)
    except json.JSONDecodeError as e:
        # Se ocorrer erro, pode-se logar ou retornar o texto bruto para análise
        print("Erro ao interpretar JSON:", e)
        json_obj = json_str

    return json_obj, output

# Testa a função
resultado_json, _ = test_summary()
print(resultado_json)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

tensor([[ 12840,    374,   1370,  68323,    309,    337,     30, 128001]],
       device='cuda:0')

In [18]:
df = pd.read_parquet('../sources/full_df.parquet')

In [19]:
df.sample(5)

Unnamed: 0,eudract_nr,Sponsor,therapeutic_area,Gender_F,Gender_M,inclusion_crt,exclusion_crt,status,url,trial_design.Comparator_description,...,title,Protocol,start_date,trial_Early_Phase_I,trial_Phase_I,trial_Phase_II,trial_Phase_III,trial_Phase_IV,condition,masking_OPEN
1179,2015-003471-30,Ipsen Innovation,Not possible to specify,True,True,"[""The following inclusion criteria will be ass...","[""The following exclusion criteria will be ass...",Completed,https://www.clinicaltrialsregister.eu/ctr-sear...,,...,"A Phase III, Multicentre, Randomised, Double B...",CONTENT1,2016-03-31,False,False,False,True,False,Urinary Incontinence | Overactive Bladder,False
2642,2024-512912-23-00,Bluepharma Industria Farmaceutica S.A.,"[""Not possible to specify""]",True,True,,,2,https://euclinicaltrials.eu/ctis-public-api/re...,,...,Bioequivalence of Lenvatinib 10 mg Capsules ve...,BLCL-LEN-FDA-02,2024-06-21,False,True,False,False,False,No medical condition,
691,2010-024252-29,Boehringer Ingelheim France,Diseases [C] - Respiratory Tract Diseases [C08],True,True,"[""Age >= 40 years;"", ""IPF diagnosed, according...","[""Laboratory parameters (AST, ALT > 1.5 x ULN;...",Completed,https://www.clinicaltrialsregister.eu/ctr-sear...,,...,"A 52 Weeks, Double Blind, Randomized, Placebo-...",1199.34,2011-03-15,False,False,False,True,False,Pulmonary Fibrosis,False
4529,,EMD Serono,,True,True,,,COMPLETED,,,...,"A Phase III, Randomized, Double-blind, Placebo...",ORACLE MS,2008-12-31,False,False,False,True,False,Multiple Sclerosis,False
2295,2023-505061-82-00,AbbVie,,True,True,,,ACTIVE_NOT_RECRUITING,,,...,"A Phase 3 Randomized, Placebo-controlled, Doub...",Up-AA,2023-10-11,False,False,False,True,False,Alopecia Areata,False


In [20]:
# Aplica função ao dataframe original, criando novas colunas automaticamente
get_ner_summary('["Age 18-80 years", "Have suffered an acute hemispheric ischemic stroke attributable to injury within the territory supplied by the Middle Cerebral Artery (MCA)", "Symptomatic arterial territory is recanalyzed at the time of randomization", "Onset of an acute ischemic stroke that can have full clinical, imagiological and bone marrow collection within 7 days after the onset of symptoms. Onset is defined as the time that the subject was last seen in a normal state, or bedtime for unwitnessed strokes occurring during sleep", "Have readily accessible peripheral venous access blood sampling", "Have the ability to understand the requirements of the study and be willing to provide written informed consent, as evidenced by signature on an informed consent document (which has been submitted and approved by the local Ethical Committee),and agree to perform the required assessments. In the event of incapacitated subjects, informed consent will be sought from a legally acceptable representative", "NIHSS of at least 6 at the time of study inclusion"]')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'Extrai entidades NER dos critério de forma estruturada:\n\n["Age 18-80 years", "Have suffered an acute hemispheric ischemic stroke attributable to injury within the territory supplied by the Middle Cerebral Artery (MCA)", "Symptomatic arterial territory is recanalyzed at the time of randomization", "Onset of an acute ischemic stroke that can have full clinical, imagiological and bone marrow collection within 7 days after the onset of symptoms. Onset is defined as the time that the subject was last seen in a normal state, or bedtime for unwitnessed strokes occurring during sleep", "Have readily accessible peripheral venous access blood sampling", "Have the ability to understand the requirements of the study and be willing to provide written informed consent, as evidenced by signature on an informed consent document (which has been submitted and approved by the local Ethical Committee),and agree to perform the required assessments. In the event of incapacitated subjects, informed consen

In [15]:
df['inclusion_crt'].head(5)

2851                                                 None
2788                                                 None
2967                                                 None
2742    ["Age 18-80 years", "Have suffered an acute he...
2749                                                 None
Name: inclusion_crt, dtype: object

In [None]:
# Expandir resultados em novas colunas
df_expanded = pd.json_normalize(df['ner_result'])
df_final = pd.concat([df, df_expanded], axis=1)

df_final.to_csv('resultado_final.csv', index=False)