# ICD-10-CM code extraction
https://www.cdc.gov/nchs/icd/icd-10-cm/files.html
* filename ``icd10cm-codes-April-2025.txt``
* taken from ``ftp.cdc.gov - /pub/Health_Statistics/NCHS/Publications/ICD10CM/2025-Update/`` **2307694 Code-desciptions-April-2025.zip**

In [1]:
import pandas as pd

# NOTE: machine-specific absolute path; adjust it when running elsewhere.
file_path = '/Users/3141562lar/Desktop/TUM/2425_SS/DI-LAB/2. ICD coding/icd10cm-codes-April-2025.txt'

# Each non-empty line of the file is parsed as "<code><whitespace><description>".
codes = []
descriptions = []

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Drop the newline and any surrounding whitespace.
        line = line.strip()
        # Split on the first run of whitespace only, so the description
        # keeps its internal spaces.
        parts = line.split(maxsplit=1)
        if not parts:
            # Blank line: there is no code to record, and indexing parts[0]
            # would raise IndexError, so skip it.
            continue
        codes.append(parts[0])
        # A line that holds only a code gets an empty description.
        descriptions.append(parts[1] if len(parts) == 2 else '')

df_icd = pd.DataFrame({'Code': codes, 'Description': descriptions})

df_icd.head()

Unnamed: 0,Code,Description
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,A009,"Cholera, unspecified"
3,A0100,"Typhoid fever, unspecified"
4,A0101,Typhoid meningitis


## ICD-10-CM codes dataset exploration

In [2]:
# Dimensions of the parsed table as (rows, columns).
df_icd.shape

(74260, 2)

In [3]:
# Store the number of characters of every code in a new column of df_icd.
df_icd['Code_Length'] = df_icd['Code'].str.len()

# Tally how many codes exist per length, ordered from shortest to longest.
length_distribution = (
    df_icd['Code_Length']
    .value_counts()
    .sort_index()
)
print("Code length distribution:", length_distribution, sep="\n")

Code length distribution:
Code_Length
3      216
4     5395
5     7084
6    10387
7    51178
Name: count, dtype: int64


In [4]:
# Smallest number of characters found in any code.
min_length = df_icd['Code_Length'].min()
print(f"\nShortest code length: {min_length}")

# Keep only the rows whose code is exactly that short.
is_shortest = df_icd['Code_Length'].eq(min_length)
shortest_codes_df = df_icd.loc[is_shortest]
print("\nSubset of codes with shortest length:")
print(shortest_codes_df)


Shortest code length: 3

Subset of codes with shortest length:
      Code                                        Description  Code_Length
78     A09  Infectious gastroenteritis and colitis, unspec...            3
198    A33                                 Tetanus neonatorum            3
199    A34                                Obstetrical tetanus            3
200    A35                                      Other tetanus            3
276    A46                                         Erysipelas            3
...    ...                                                ...          ...
72959  Z08  Encounter for follow-up examination after comp...            3
72960  Z09  Encounter for follow-up examination after comp...            3
73099  Z21  Asymptomatic human immunodeficiency virus [HIV...            3
73120  Z23                         Encounter for immunization            3
73583  Z66                                 Do not resuscitate            3

[216 rows x 3 columns]


### Adding chapter information to the ICD-10-CM codes
**NOTE:** The ``chapter_ranges`` list was generated by ChatGPT. A quick sanity check suggests that it is correct, but it would be more reliable to take the chapter information from an official source (I could not find one available as downloadable material).

In [5]:
import pandas as pd

# Chapter lookup table: one (start, end, label) tuple per chapter.
# `start` and `end` are three-character category codes and both bounds are
# inclusive; assign_chapter compares them as plain strings and returns the
# label of the first tuple that matches, so the order of entries matters only
# if ranges overlap.
# NOTE(review): the labels for chapters V and XX look like the WHO ICD-10
# wording rather than the ICD-10-CM chapter titles -- confirm against an
# official source.
chapter_ranges = [
    ("A00", "B99", "I Certain infectious and parasitic diseases"),
    ("C00", "D49", "II Neoplasms"),
    ("D50", "D89", "III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism"),
    ("E00", "E89", "IV Endocrine, nutritional and metabolic diseases"),
    ("F01", "F99", "V Mental and behavioural disorders"),
    ("G00", "G99", "VI Diseases of the nervous system"),
    ("H00", "H59", "VII Diseases of the eye and adnexa"),
    ("H60", "H95", "VIII Diseases of the ear and mastoid process"),
    ("I00", "I99", "IX Diseases of the circulatory system"),
    ("J00", "J99", "X Diseases of the respiratory system"),
    ("K00", "K95", "XI Diseases of the digestive system"),
    ("L00", "L99", "XII Diseases of the skin and subcutaneous tissue"),
    ("M00", "M99", "XIII Diseases of the musculoskeletal system and connective tissue"),
    ("N00", "N99", "XIV Diseases of the genitourinary system"),
    ("O00", "O9A", "XV Pregnancy, childbirth and the puerperium"),
    ("P00", "P96", "XVI Certain conditions originating in the perinatal period"),
    ("Q00", "Q99", "XVII Congenital malformations, deformations and chromosomal abnormalities"),
    ("R00", "R99", "XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified"),
    ("S00", "T88", "XIX Injury, poisoning and certain other consequences of external causes"),
    ("V00", "Y99", "XX External causes of morbidity and mortality"),
    ("Z00", "Z99", "XXI Factors influencing health status and contact with health services"),
    ("U00", "U85", "XXII Codes for special purposes"),
]

# Convert ICD codes to comparable format (e.g. A010 -> A01)
def normalize_code(code):
    """Reduce an ICD code string to its three-character category.

    Surrounding whitespace and dots are removed and letters are upper-cased,
    so that inputs such as "a01.0" or " A010 " compare correctly against
    upper-case range bounds. Inputs shorter than three characters are
    right-padded with "0" (e.g. "A" -> "A00").

    Parameters
    ----------
    code : str
        ICD code, with or without the decimal dot.

    Returns
    -------
    str
        The first three characters of the cleaned code.
    """
    cleaned = code.strip().upper().replace(".", "")
    return cleaned.ljust(3, "0")[:3]

def assign_chapter(code):
    """Return the chapter label for `code`, or "Unclassified" if none matches.

    The code is reduced with normalize_code and compared as a string against
    each (start, end, label) entry of chapter_ranges, in order; the label of
    the first range that contains it (bounds inclusive) is returned.
    """
    category = normalize_code(code)
    matching_labels = (
        label
        for first, last, label in chapter_ranges
        if first <= category <= last
    )
    return next(matching_labels, "Unclassified")

# Label every row of df_icd with the chapter its code falls into.
df_icd["ICD_Chapter"] = df_icd["Code"].map(assign_chapter)


In [6]:
# Preview the first rows, now including the ICD_Chapter column.
df_icd.head()

Unnamed: 0,Code,Description,Code_Length,ICD_Chapter
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...",4,I Certain infectious and parasitic diseases
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor",4,I Certain infectious and parasitic diseases
2,A009,"Cholera, unspecified",4,I Certain infectious and parasitic diseases
3,A0100,"Typhoid fever, unspecified",5,I Certain infectious and parasitic diseases
4,A0101,Typhoid meningitis,5,I Certain infectious and parasitic diseases


In [7]:
import plotly.express as px
import pandas as pd

# Number of codes in each chapter, as a two-column frame.
chapter_counts = (
    df_icd["ICD_Chapter"]
    .value_counts()
    .reset_index()
    .set_axis(["ICD_Chapter", "Count"], axis=1)
)

# Share of all codes that each chapter represents, in percent.
total = chapter_counts["Count"].sum()
chapter_counts["Percentage"] = 100 * chapter_counts["Count"] / total

# Legend text: chapter name followed by its share, rounded to one decimal.
chapter_counts["Legend_Label"] = [
    f"{chapter} ({share:.1f}%)"
    for chapter, share in zip(
        chapter_counts["ICD_Chapter"], chapter_counts["Percentage"]
    )
]

# Donut chart whose slices are named by the legend text built above.
fig = px.pie(
    data_frame=chapter_counts,
    names="Legend_Label",
    values="Count",
    hole=0.3,
    title="ICD-10 Code Distribution by Chapter",
)

# No text on the slices themselves; the percentages live in the legend and hover.
fig.update_traces(textinfo='none', hoverinfo='label+percent+name')

# Render the figure.
fig.show()