# Estimating Probabilities of ICD Code Associations Using an LLM

This notebook aims to build a table that reflects the likelihood of one ICD code being followed by another, based on answers from a large language model (LLM).  
The idea is to take pairs of ICD codes, create carefully phrased prompts, and let the model estimate the probability of their sequential occurrence in a patient's medical history.  

The resulting probability table can be used to explore potential relationships between diseases, which may help in identifying patterns not immediately visible through traditional statistical analysis.

In [None]:
!pip install -q transformers accelerate bitsandbytes
!pip install -q huggingface_hub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import re
from itertools import product

import pandas as pd
import torch
from huggingface_hub import login
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that it is never
# stored in the notebook; when the variable is unset, login() asks for the token interactively.
# Tokens are created at https://huggingface.co/settings/tokens
# (model page: https://huggingface.co/mistralai/Mistral-7B-v0.1).
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Checkpoint used both for the tokenizer and for the causal language model.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Weights are loaded in float16 with device_map="auto".
load_options = {
    "device_map": "auto",
    "torch_dtype": torch.float16,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

2025-07-23 22:02:20.206261: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753308140.559214      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753308140.655768      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Smoke test: send a single hand-written prompt through the model before the full run.
prompt = "[INST] Гипотетически какова вероятность (в процентах) того, что диагноз гипертония будет поставлен у пациента ПОСЛЕ диагноза ожирение. Ответь ТОЛЬКО одним числом от 0 до 100. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
input_len = inputs["input_ids"].shape[1]

outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode only the tokens that come after the prompt.
completion_ids = outputs[0][input_len:]
print(tokenizer.decode(completion_ids, skip_special_tokens=True))

50


In [None]:
DATA_DIR = "/kaggle/input/smiles-diseases-interconnections"

# Cancer codes become the rows (ICD_i) of the probability table, all codes its columns (ICD_j).
df_i = pd.read_excel(f"{DATA_DIR}/cancer_icd3_codes.xlsx")
df_j = pd.read_excel(f"{DATA_DIR}/all_icd3_codes.xlsx")

# "ICD code -> disease name" correspondence, used to phrase the query in human terms.
# The code column is read as text so the lookup keys are strings, like codes_i / codes_j below.
df_names = pd.read_csv(f"{DATA_DIR}/d_icd_diagnoses.csv", dtype={"icd_code": str})

code_to_name = df_names.set_index("icd_code")["long_title"].astype(str).to_dict()

codes_i = df_i["icd3"].astype(str).tolist()
codes_j = df_j["icd3"].astype(str).tolist()

# Report codes that have no name in the lookup instead of letting them pass silently.
missing_codes = sorted((set(codes_i) | set(codes_j)) - set(code_to_name))
if missing_codes:
    print(f"{len(missing_codes)} ICD codes have no name in d_icd_diagnoses.csv; "
          "the code itself will be used in prompts")

# Generate all possible pairs of ICD codes for querying the LLM (a code is never paired with itself).
pairs = pd.DataFrame(product(codes_i, codes_j), columns=["ICD_i", "ICD_j"])
pairs = pairs[pairs["ICD_i"] != pairs["ICD_j"]].reset_index(drop=True)

# A code missing from the lookup maps to NaN, which would be formatted into the prompt
# as the literal text "nan"; fall back to the ICD code in that case.
pairs["Name_i"] = pairs["ICD_i"].map(code_to_name).fillna(pairs["ICD_i"])
pairs["Name_j"] = pairs["ICD_j"].map(code_to_name).fillna(pairs["ICD_j"])

In [13]:
# Preview the generated table of ICD code pairs and their disease names.
pairs

Unnamed: 0,ICD_i,ICD_j,Name_i,Name_j
0,C21,A01,Malignant neoplasm of anus and anal canal,Typhoid and paratyphoid fevers
1,C21,A02,Malignant neoplasm of anus and anal canal,Other salmonella infections
2,C21,A03,Malignant neoplasm of anus and anal canal,Shigellosis
3,C21,A04,Malignant neoplasm of anus and anal canal,Other bacterial intestinal infections
4,C21,A05,Malignant neoplasm of anus and anal canal,"Other bacterial foodborne intoxications, not e..."
...,...,...,...,...
153265,D48,Z95,Neoplasm of uncertain behavior of other and un...,Presence of cardiac and vascular implants and ...
153266,D48,Z96,Neoplasm of uncertain behavior of other and un...,Presence of other functional implants
153267,D48,Z97,Neoplasm of uncertain behavior of other and un...,Presence of other devices
153268,D48,Z98,Neoplasm of uncertain behavior of other and un...,Other postprocedural states


In [None]:
def extract_probability(text: str) -> "float | None":
    """
    Extract the first number that is directly followed by a percent sign.

    The number may use either a period or a comma as the decimal separator, and
    whitespace is allowed between the number and the sign, e.g. '73%' -> 73.0,
    '73,5 %' -> 73.5.

    Args:
        text: Raw model answer to search.

    Returns:
        The number as a float, or None if no '<number>%' pattern is found.
    """
    match = re.search(r'(\d+(?:[\.,]\d+)?)\s*%', text)
    if match is None:
        return None
    # Normalise a decimal comma ('73,5') to a period before converting. The pattern only
    # captures digits with an optional single separator, so float() cannot fail here.
    return float(match.group(1).replace(",", "."))


MAX_NEW_TOKENS = 100       # upper bound on generated tokens per answer
CHECKPOINT_EVERY = 500     # write intermediate results every N pairs
OUTPUT_PATH = "/kaggle/working/matrix.csv"
RESULT_COLUMNS = ["ICD_i", "ICD_j", "prob"]
RESUME = True              # continue from OUTPUT_PATH if it holds a matching partial run;
                           # set to False after changing the prompt or the model


def save_results(records, path=OUTPUT_PATH):
    """Write the (ICD_i, ICD_j, prob) records to `path` as CSV and return them as a DataFrame."""
    frame = pd.DataFrame(records, columns=RESULT_COLUMNS)
    frame.to_csv(path)
    return frame


def load_checkpoint(path, expected_pairs):
    """
    Load the records saved by a previous, interrupted run.

    Returns a list of (ICD_i, ICD_j, prob) tuples when the file at `path` can be read and
    its code pairs are exactly the first rows of `expected_pairs`, in the same order.
    In every other case (no file, unreadable file, different pairs) an empty list is
    returned so that the run starts from scratch.
    """
    try:
        saved = pd.read_csv(path, index_col=0, dtype={"ICD_i": str, "ICD_j": str})
    except (FileNotFoundError, ValueError):
        # Missing, empty or malformed checkpoint: nothing to resume from.
        return []
    if list(saved.columns) != RESULT_COLUMNS or len(saved) > len(expected_pairs):
        return []
    head = expected_pairs.iloc[:len(saved)]
    saved_keys = list(zip(saved["ICD_i"], saved["ICD_j"]))
    expected_keys = list(zip(head["ICD_i"], head["ICD_j"]))
    if saved_keys != expected_keys:
        return []
    return list(saved.itertuples(index=False, name=None))


print_limit = 5   # how many question/answer examples to echo for a visual sanity check
print_count = 0

# Pairs are processed in the order of `pairs`, so a checkpoint with k rows covers the first k pairs.
results = load_checkpoint(OUTPUT_PATH, pairs) if RESUME else []
start = len(results)
if start:
    print(f"Resuming from checkpoint: {start} of {len(pairs)} pairs already processed")

for idx, row in tqdm(pairs.iloc[start:].iterrows(), total=len(pairs), initial=start):
    i, j = row["ICD_i"], row["ICD_j"]
    name_i, name_j = row["Name_i"], row["Name_j"]

    # NOTE(review): this wording asks whether name_i occurs "before or after" name_j and gives
    # no scale, while the test request above asks for a percentage of j AFTER i in the range
    # 0-100. Confirm which formulation is intended before relying on the table.
    prompt = f"[INST] Я исследую неявные взаимосвязи между заболеваниями. Очень примерно оцени, может ли заболевание {name_i} возникать до или после заболевания {name_j} и с какой вероятностью. Ответь ТОЛЬКО ОДНИМ ЧИСЛОМ, БЕЗ ПОЯСНЕНИЙ. [/INST]"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_len = inputs["input_ids"].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, pad_token_id=tokenizer.eos_token_id)
    # Decode only the tokens generated after the prompt.
    raw_answer = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)

    # A bare number ("10") is parsed directly; otherwise look for a "<number>%" pattern.
    try:
        answer = float(raw_answer)
    except (ValueError, TypeError):
        answer = extract_probability(raw_answer)

    results.append((i, j, answer))

    if print_count < print_limit:
        print(f"\nДиагноз i: {i}, {name_i}")
        print(f"Диагноз j: {j}, {name_j}")
        print(raw_answer)
        print(f"Extracted answer: {answer}")
        print_count += 1

    # intermediate saving of results
    if idx % CHECKPOINT_EVERY == 0:
        result_df = save_results(results)

result_df = save_results(results)

  0%|          | 1/153270 [00:00<12:18:22,  3.46it/s]


Диагноз i: C21, Malignant neoplasm of anus and anal canal
Диагноз j: A01, Typhoid and paratyphoid fevers
0
Extracted answer: 0.0


  0%|          | 2/153270 [00:00<13:14:30,  3.22it/s]


Диагноз i: C21, Malignant neoplasm of anus and anal canal
Диагноз j: A02, Other salmonella infections
10
Extracted answer: 10.0


  0%|          | 3/153270 [00:00<14:30:54,  2.93it/s]


Диагноз i: C21, Malignant neoplasm of anus and anal canal
Диагноз j: A03, Shigellosis
10%
Extracted answer: 10.0


  0%|          | 4/153270 [00:01<13:49:08,  3.08it/s]


Диагноз i: C21, Malignant neoplasm of anus and anal canal
Диагноз j: A04, Other bacterial intestinal infections
10
Extracted answer: 10.0


  0%|          | 5/153270 [00:01<13:21:18,  3.19it/s]


Диагноз i: C21, Malignant neoplasm of anus and anal canal
Диагноз j: A05, Other bacterial foodborne intoxications, not elsewhere classified
10
Extracted answer: 10.0


 10%|█         | 15350/153270 [2:19:49<65:33:27,  1.71s/it] 