In [2]:
import pandas as pd
import json
import random
from sklearn.model_selection import train_test_split
from datetime import datetime

## Import RadLex codes and descriptions from BioPortal
Link here: https://bioportal.bioontology.org/ontologies/RADLEX

In [3]:
# Load ICD diagnoses dictionary
radlex_file_path = 'RADLEX.csv'  # Replace with your actual file path
radlex_data = pd.read_csv(radlex_file_path)

  radlex_data = pd.read_csv(radlex_file_path)


In [4]:
radlex_data.head(2)

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,CUI,Semantic Types,Parents,http://data.bioontology.org/metadata/prefixIRI,http://radlex.org/RID/AAL,...,http://www.radlex.org/RID/Surrounded_by,http://www.radlex.org/RID/Surrounds,http://www.radlex.org/RID/Synonym,http://www.radlex.org/RID/Synonym_German,http://www.radlex.org/RID/Talairach,http://www.radlex.org/RID/Term_type,http://www.radlex.org/RID/Tributary_Of,http://www.radlex.org/RID/UMLS_ID,http://www.radlex.org/RID/UMLS_Term,http://www.radlex.org/RID/Unsanctioned_Term
0,http://www.radlex.org/RID/RID32370,left fourth anterior cervical intertransversarius,,,False,,,http://www.radlex.org/RID/RID32368,RID1:RID32370,,...,,,,,,,,,,
1,http://www.radlex.org/RID/RID21755,trunk of pharyngeal branch of right glossophar...,,,False,,,http://www.radlex.org/RID/RID21754,RID1:RID21755,,...,,,,,,,,,,


In [5]:
radlex_data.shape

(46840, 170)

In [6]:
radlex_data.rename(columns=lambda col: col.replace("http://radlex.org/RID/", "") if col.startswith("http://radlex.org/RID/") else col, inplace=True)
radlex_data.rename(columns=lambda col: col.replace("http://www.radlex.org/RID/", "") if col.startswith("http://www.radlex.org/RID/") else col, inplace=True)

for col in radlex_data.columns:
    print(col)

Class ID
Preferred Label
Synonyms
Definitions
Obsolete
CUI
Semantic Types
Parents
http://data.bioontology.org/metadata/prefixIRI
AAL
Acronym
Anatomical_Site
Anterior_to
Attaches_to
Blood_Supply_of
Bounded_by
Bounds
Branch_Of
CMA_Label
Comment
Constitutional_Part_Of
Contained_In
Contains
Continuous_With
Created
Definition
Distal_to
Domain_string
Drains_Into
External_to
FMAID
Freesurfer
Has_Blood_Supply
Has_Branch
Has_Constitutional_Part
Has_Entrapment_Site
Has_Innervation_Source
Has_insertion
Has_Member
Has_origin
Has_Part
Has_Regional_Part
Inferior_to
Innervates
Insertion_of
JHU_DTI-81
JHU_White-Matter_Tractography_Atlas
Lymphatic_Drainage
Lymphatic_Drainage_Of
May_Be_Caused_By
May_Cause
Member_Of
Misspelling_of_term
name
Origin_of
Part_Of
Posterior_to
Preferred_name
Preferred_Name_for_Obsolete
Preferred_name_German
Projects_From
Projects_To
Proximal_to
Radlex_version_of_class_change
Receives_attachment_from
Receives_Drainage_From
Receives_Input_From
Receives_Projection_From
Regional_P

In [7]:
radlex_data["Class ID"] = radlex_data["Class ID"].str.replace("http://www.radlex.org/RID/", "", regex=False)
radlex_data["Parents"] = radlex_data["Parents"].str.replace("http://www.radlex.org/RID/", "", regex=False)
radlex_data = radlex_data[radlex_data["Obsolete"] == False]

radlex_data.head()

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,CUI,Semantic Types,Parents,http://data.bioontology.org/metadata/prefixIRI,AAL,...,Surrounded_by,Surrounds,Synonym,Synonym_German,Talairach,Term_type,Tributary_Of,UMLS_ID,UMLS_Term,Unsanctioned_Term
0,RID32370,left fourth anterior cervical intertransversarius,,,False,,,RID32368,RID1:RID32370,,...,,,,,,,,,,
1,RID21755,trunk of pharyngeal branch of right glossophar...,,,False,,,RID21754,RID1:RID21755,,...,,,,,,,,,,
2,RID35046,blade-of-grass sign,V sign of bone|flame sign,A wedge- or V-shaped radiolucent area typicall...,False,,,RID29023,RID1:RID35046,,...,,,,,,,,,,
3,RID29220,layer of cornea,,,False,,,RID29216,RID1:RID29220,,...,,,,,,,,,,
4,RID14565,nerve to depressor anguli oris,,,False,,,RID14549,RID1:RID14565,,...,,,,,,,,,,


In [8]:
radlex_data.shape

(46840, 170)

In [9]:
duplicates = radlex_data["Class ID"][radlex_data["Class ID"].duplicated()]
print(duplicates)

Series([], Name: Class ID, dtype: object)


In [10]:
import re

def count_rid_links(text):
    if pd.isna(text):
        return 0
    return len(re.findall(r"RID\w+", text))

# Apply the function to count RID links in each row
rid_link_counts = radlex_data["Parents"].apply(count_rid_links)

# Check the distribution of counts
print(rid_link_counts.value_counts())

Parents
1    46838
2        1
0        1
Name: count, dtype: int64


In [11]:
radlex_data["Parent_Count"] = radlex_data["Parents"].apply(count_rid_links)

# Filter the row(s) with two parent links
rows_with_two_parents = radlex_data[radlex_data["Parent_Count"] == 2]

print(rows_with_two_parents["Parents"].values[0])

RID5|RID3


In [12]:
nan_counts = radlex_data.isna().sum()
print(nan_counts)

Class ID                 0
Preferred Label          0
Synonyms             33207
Definitions          43036
Obsolete                 0
                     ...  
Tributary_Of         46840
UMLS_ID              45517
UMLS_Term            46840
Unsanctioned_Term    46840
Parent_Count             0
Length: 171, dtype: int64


In [13]:
all_nan_columns = radlex_data.columns[radlex_data.isna().sum() == len(radlex_data)]
radlex_data = radlex_data.dropna(axis=1, how='all')

print("Dropped columns:", list(all_nan_columns))

Dropped columns: ['CUI', 'Semantic Types', 'Domain_string', 'name', 'SNOMED_Term', 'Term_type', 'Acronym', 'Anatomical_Site', 'Anterior_to', 'Attaches_to', 'Blood_Supply_of', 'Bounded_by', 'Bounds', 'Branch_Of', 'Branch_Part_of', 'CMA_Label', 'Comment', 'Constitutional_Part_Of', 'Contained_In', 'Contains', 'Continuous_With', 'Created', 'Definition', 'Distal_to', 'Domain_string', 'Drains_Into', 'External_to', 'FMAID', 'Has_Blood_Supply', 'Has_Branch', 'Has_Branch_Part', 'Has_Constitutional_Part', 'Has_Entrapment_Site', 'Has_Innervation_Source', 'Has_insertion', 'Has_Member', 'Has_origin', 'Has_Part', 'Has_Regional_Part', 'Inferior_to', 'Innervates', 'Insertion_of', 'JHU_DTI-81', 'JHU_White-Matter_Tractography_Atlas', 'Lymphatic_Drainage', 'Lymphatic_Drainage_Of', 'May_Be_Caused_By', 'May_Cause', 'Member_Of', 'Misspelling_of_term', 'name', 'Origin_of', 'Part_Of', 'Posterior_to', 'Preferred_name', 'Preferred_Name_for_Obsolete', 'Preferred_name_German', 'Projects_From', 'Projects_To', 'Pro

In [14]:
radlex_data.shape
for col in radlex_data.columns:
    print(col)

Class ID
Preferred Label
Synonyms
Definitions
Obsolete
Parents
http://data.bioontology.org/metadata/prefixIRI
AAL
Acronym
Anatomical_Site
Anterior_to
Attaches_to
Blood_Supply_of
Bounded_by
Bounds
Branch_Of
CMA_Label
Comment
Constitutional_Part_Of
Contained_In
Contains
Continuous_With
Created
Definition
Distal_to
Drains_Into
External_to
FMAID
Freesurfer
Has_Blood_Supply
Has_Branch
Has_Constitutional_Part
Has_Entrapment_Site
Has_Innervation_Source
Has_insertion
Has_Member
Has_origin
Has_Part
Has_Regional_Part
Inferior_to
Innervates
Insertion_of
JHU_DTI-81
JHU_White-Matter_Tractography_Atlas
Lymphatic_Drainage
Lymphatic_Drainage_Of
May_Be_Caused_By
May_Cause
Member_Of
Misspelling_of_term
Origin_of
Part_Of
Posterior_to
Preferred_name
Preferred_Name_for_Obsolete
Preferred_name_German
Projects_From
Projects_To
Proximal_to
Radlex_version_of_class_change
Receives_attachment_from
Receives_Drainage_From
Receives_Input_From
Receives_Projection_From
Regional_Part_Of
Related_modality
Replaced_by
Se

## Creating RadLex code-description pairs in English and German

In [15]:
relevant_cols = [
    "Class ID",
    "Preferred Label",
    "Synonyms",
    "Definitions",
    "Acronym",
    "Comment",
    "Definition",
    #"FMAID",
    "JHU_DTI-81",
    "JHU_White-Matter_Tractography_Atlas",
    "Misspelling_of_term",
    "Preferred_name",
    "Preferred_Name_for_Obsolete",
    "Preferred_name_German",
    #"Related_modality",
    #"Replaced_by",
    #"SNOMED_ID",
    "Synonym",
    "Synonym_German",
    #"UMLS_ID",
    #"UMLS_Term",
    "Unsanctioned_Term",
    #"http://www.geneontology.org/formats/oboInOwl#created_by",
    #"http://www.geneontology.org/formats/oboInOwl#creation_date",
    #"AAL",
    #"Freesurfer",
    #"Radlex_version_of_class_change",
    #"Talairach"
]

# Remove duplicates from the list
relevant_cols = list(dict.fromkeys(relevant_cols))  # preserves order and removes duplicates

# Filter only those columns that exist in the current DataFrame
columns_in_df = [col for col in relevant_cols if col in radlex_data.columns]

# Create the new DataFrame
radlex_llm = radlex_data[columns_in_df]

# Check which columns are duplicated
duplicate_cols = radlex_llm.columns.duplicated()
print("Are there duplicate columns?:", duplicate_cols.any())

# List duplicated column names
duplicates = radlex_llm.columns[duplicate_cols].to_list()
print("Duplicated columns:", duplicates)

# Remove duplicate columns, keep first occurrence
radlex_llm = radlex_llm.loc[:, ~radlex_llm.columns.duplicated()]

# Display summary
print(f"New dataframe created with {len(columns_in_df)} columns out of {len(relevant_cols)} requested.")
print(f"Missing columns (not found in DataFrame): {[col for col in relevant_cols if col not in radlex_data.columns]}")


Are there duplicate columns?: True
Duplicated columns: ['Synonym_German']
New dataframe created with 16 columns out of 16 requested.
Missing columns (not found in DataFrame): []


In [16]:
radlex_llm.head(5)

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Acronym,Comment,Definition,JHU_DTI-81,JHU_White-Matter_Tractography_Atlas,Misspelling_of_term,Preferred_name,Preferred_Name_for_Obsolete,Preferred_name_German,Synonym,Synonym_German,Unsanctioned_Term
0,RID32370,left fourth anterior cervical intertransversarius,,,,,,,,,left fourth anterior cervical intertransversarius,,linker Musculus intertransversarius anterior c...,,,
1,RID21755,trunk of pharyngeal branch of right glossophar...,,,,,,,,,trunk of pharyngeal branch of right glossophar...,,Truncus des Ramus pharyngeus des rechten Nervu...,,,
2,RID35046,blade-of-grass sign,V sign of bone|flame sign,A wedge- or V-shaped radiolucent area typicall...,,http://radiology.rsna.org/cgi/content/full/221...,A wedge- or V-shaped radiolucent area typicall...,,,,blade-of-grass sign,,blade-of-grass sign (EN),V sign of bone|flame sign,,
3,RID29220,layer of cornea,,,,,,,,,layer of cornea,,Schicht der Cornea,,,
4,RID14565,nerve to depressor anguli oris,,,,,,,,,nerve to depressor anguli oris,,Nerv zum Musculus depressor anguli oris,,,


In [17]:
radlex_llm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46840 entries, 0 to 46839
Data columns (total 16 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   Class ID                             46840 non-null  object
 1   Preferred Label                      46840 non-null  object
 2   Synonyms                             13633 non-null  object
 3   Definitions                          3804 non-null   object
 4   Acronym                              133 non-null    object
 5   Comment                              1866 non-null   object
 6   Definition                           3804 non-null   object
 7   JHU_DTI-81                           49 non-null     object
 8   JHU_White-Matter_Tractography_Atlas  16 non-null     object
 9   Misspelling_of_term                  33 non-null     object
 10  Preferred_name                       45928 non-null  object
 11  Preferred_Name_for_Obsolete          939 

In [18]:
#radlex_llm["Freesurfer"].describe(include='all')
#unique_values = radlex_llm["Freesurfer"].dropna()
#print(unique_values)

In [19]:
# Select rows where neither column is NaN
both_non_nan = radlex_data["Definitions"].notna() & radlex_data["Definition"].notna()

# Compare those rows
differs = radlex_data.loc[both_non_nan, "Definitions"] != radlex_data.loc[both_non_nan, "Definition"]

if differs.any():
    print(f"Columns differ in {differs.sum()} rows where both are non-NaN.")
    # Show some differing rows for inspection
    print(radlex_data.loc[both_non_nan & differs, ["Definitions", "Definition"]])
else:
    print("No differences found in non-NaN entries. Dropping 'Definition' column.")
    radlex_data = radlex_data.drop(columns=["Definition"])

No differences found in non-NaN entries. Dropping 'Definition' column.


In [20]:
radlex_llm.describe(include='all')

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Acronym,Comment,Definition,JHU_DTI-81,JHU_White-Matter_Tractography_Atlas,Misspelling_of_term,Preferred_name,Preferred_Name_for_Obsolete,Preferred_name_German,Synonym,Synonym_German,Unsanctioned_Term
count,46840,46840,13633,3804,133,1866,3804,49,16,33,45928,939,45171,13633,2020,54
unique,46840,46830,13588,3748,132,995,3748,49,16,33,45918,937,44408,13588,1995,54
top,RID32370,plexus branch of anterior branch of left later...,mass,Reactive vertebral body modifications associat...,MI,duplicate,Reactive vertebral body modifications associat...,Cingulum (cingulate gyrus) R,Corticospinal tract L,Segund fracture,molecular imaging,middle para-aortic lymph node group,linker Lemniscus medialis,mass,Mamille,casting calcification
freq,1,2,4,4,2,297,4,1,1,1,2,2,3,4,8,1


## Export RadLex code-description pairs in English

In [21]:
radlex_llm

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Acronym,Comment,Definition,JHU_DTI-81,JHU_White-Matter_Tractography_Atlas,Misspelling_of_term,Preferred_name,Preferred_Name_for_Obsolete,Preferred_name_German,Synonym,Synonym_German,Unsanctioned_Term
0,RID32370,left fourth anterior cervical intertransversarius,,,,,,,,,left fourth anterior cervical intertransversarius,,linker Musculus intertransversarius anterior c...,,,
1,RID21755,trunk of pharyngeal branch of right glossophar...,,,,,,,,,trunk of pharyngeal branch of right glossophar...,,Truncus des Ramus pharyngeus des rechten Nervu...,,,
2,RID35046,blade-of-grass sign,V sign of bone|flame sign,A wedge- or V-shaped radiolucent area typicall...,,http://radiology.rsna.org/cgi/content/full/221...,A wedge- or V-shaped radiolucent area typicall...,,,,blade-of-grass sign,,blade-of-grass sign (EN),V sign of bone|flame sign,,
3,RID29220,layer of cornea,,,,,,,,,layer of cornea,,Schicht der Cornea,,,
4,RID14565,nerve to depressor anguli oris,,,,,,,,,nerve to depressor anguli oris,,Nerv zum Musculus depressor anguli oris,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46835,RID11044,monorail deployment,,,,,,,,,monorail deployment,,Monorail-Einsatz,,,
46836,RID17476,c5 segment of right gracile fasciculus of spin...,,,,,,,,,c5 segment of right gracile fasciculus of spin...,,Segment C5 des rechten Fasciculus gracilis des...,,,
46837,RID36158,left inferior orbital vein,,,,,,,,,left inferior orbital vein,,linke Vena orbitalis inferior,,,
46838,RID44006,subscapular vein,vena subscapularis,,,,,,,,subscapular vein,,vena subscapularis,vena subscapularis,,


In [22]:
# Filter rows where Class ID is NOT a string or doesn't start with 'RID'
invalid_ids = radlex_llm[~radlex_llm["Class ID"].astype(str).str.startswith("RID")]

# Show them
invalid_ids

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Acronym,Comment,Definition,JHU_DTI-81,JHU_White-Matter_Tractography_Atlas,Misspelling_of_term,Preferred_name,Preferred_Name_for_Obsolete,Preferred_name_German,Synonym,Synonym_German,Unsanctioned_Term
20724,RadLex_term,RadLex term,,,,,,,,,,,,,,
31325,Non-RadLex_term,Non-RadLex term,,,,,,,,,,,,,,


In [27]:
# Filter rows where Class ID is NOT a string or doesn't start with 'RID' (THESE ROWS ARE OBSOLOTE)
invalid_descriptions = radlex_llm[radlex_llm["Preferred Label"].astype(str).str.startswith("RID")]

# Show them
invalid_descriptions

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Acronym,Comment,Definition,JHU_DTI-81,JHU_White-Matter_Tractography_Atlas,Misspelling_of_term,Preferred_name,Preferred_Name_for_Obsolete,Preferred_name_German,Synonym,Synonym_German,Unsanctioned_Term
31,RID10853,RID10853,,,,"Children moved to ""ultrasound transducer""; not...",,,,,,ultrasound transducer shape,,,,
84,RID469,RID469,,,,duplicate,,,,,,left lower quadrant of abdomen,,,,
138,RID28845,RID28845,,,,no longer needed,,,,,,xray imaging procedure,,,,
160,RID470,RID470,,,,duplicate,,,,,,left upper quadrant of abdomen,,,,
325,RID678,RID678,,,,seems to be an erroneous term for a branch of ...,,,,,,angular artery,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46564,RID15043,RID15043,,,,duplicate,,,,,,medial proper palmar digital branch of median ...,,,,
46601,RID12602,RID12602,,,,"Deleted all ""procedure step"" terms. These wer...",,,,,,intrinsic uniformity quality control step,,,,
46692,RID32047,RID32047,,,,duplicate,,,,,,left lateral aortic lymph node,,,,
46776,RID15039,RID15039,,,,duplicate,,,,,,medial proper palmar digital branch of median ...,,,,


In [28]:
radlex_llm = radlex_llm[radlex_llm["Class ID"].astype(str).str.startswith("RID")]
radlex_llm = radlex_llm[~radlex_llm["Preferred Label"].astype(str).str.startswith("RID")]
radlex_llm.shape

(45928, 16)

In [29]:
# Select and rename the columns
radlex_df = radlex_llm[["Class ID", "Preferred Label"]].rename(
    columns={"Class ID": "radlex_code", "Preferred Label": "preferred_description"}
)

# Save to CSV
radlex_df.to_csv("d_radlex_entities.csv", index=False)
