# Dimensionality Reduction and Clustering Tests

**Data Science with AIML**<br>
MITES Summer 2025<br>
2025-07-16

My attempts at **dimensionality reduction** and **clustering** of the OpenPayments medical data with HDBSCAN and Kepler Mapper.

These are the Python imports we're using:

In [8]:
from pathlib import Path
import time
from statistics import median_high as median
import warnings
import random
warnings.simplefilter(action="ignore", category=(UserWarning, FutureWarning))
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer



In [12]:
def remove_nan(PATH_TO_FILE, threshold=80.0):
    """Load a CSV file and drop every column that is mostly missing.

    Args:
        PATH_TO_FILE: Location of the CSV file; handed straight to
            ``pd.read_csv``.
        threshold (float): Percentage (0-100) of missing values above which
            a column is dropped. Defaults to 80.0.

    Returns:
        pandas.DataFrame: A new data frame holding only the columns whose
        share of missing values is not strictly greater than ``threshold``.
    """
    df = pd.read_csv(PATH_TO_FILE, low_memory=False)

    # Display settings kept from the original cell: restore the default row
    # limit and lift the column limit for rendered data frames.
    pd.reset_option('display.max_rows')
    pd.options.display.max_columns = None

    # isna() marks missing entries in every dtype; the column-wise mean of
    # that boolean mask is the fraction of missing values per column.
    # A file with zero rows yields NaN here, which never exceeds the
    # threshold, so nothing is dropped instead of dividing by zero.
    missing_percentages = df.isna().mean() * 100

    # Compare the exact percentages (no rounding) against the threshold
    sparse_columns = missing_percentages[missing_percentages > threshold].index

    # drop() returns a new frame, so the loaded data itself is left untouched
    return df.drop(columns=sparse_columns)


# Location of the input CSV (presumably the program-year-2024 Open Payments
# research file, judging by its name).
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
PATH_TO_FILE = "C:/Users/mjkuo/Documents/MITES_Summer_ML/MLFP/2024HealthcareFiles/OP_DTL_RSRCH_PGYR2024_P06302025_06162025.csv"
# Load the file and drop the columns that are mostly NaN
cleaned_df = remove_nan(PATH_TO_FILE)
# Last expression of the cell: the notebook renders the cleaned data frame
cleaned_df

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Recipient_Primary_Business_Street_Address_Line1,Recipient_City,Recipient_State,Recipient_Zip_Code,Recipient_Country,Principal_Investigator_1_Covered_Recipient_Type,Principal_Investigator_1_Profile_ID,Principal_Investigator_1_NPI,Principal_Investigator_1_First_Name,Principal_Investigator_1_Middle_Name,Principal_Investigator_1_Last_Name,Principal_Investigator_1_Business_Street_Address_Line1,Principal_Investigator_1_Business_Street_Address_Line2,Principal_Investigator_1_City,Principal_Investigator_1_State,Principal_Investigator_1_Zip_Code,Principal_Investigator_1_Country,Principal_Investigator_1_Primary_Type_1,Principal_Investigator_1_Specialty_1,Principal_Investigator_1_License_State_code1,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country,Related_Product_Indicator,Covered_or_Noncovered_Indicator_1,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_1,Product_Category_or_Therapeutic_Area_1,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1,Associated_Drug_or_Biological_NDC_1,Total_Amount_of_Payment_USDollars,Date_of_Payment,Form_of_Payment_or_Transfer_of_Value,Preclinical_Research_Indicator,Delay_in_Publication_Indicator,Name_of_Study,Dispute_Status_for_Publication,Record_ID,Program_Year,Payment_Publication_Date,ClinicalTrials_Gov_Identifier
0,ADD,Covered Recipient Teaching Hospital,,1935 Medical District Dr,DALLAS,TX,75235,United States,Covered Recipient Physician,1377130.0,1.528432e+09,MOHAMMAD,TARIQUE,HUSSAIN,5323 HARRY HINES BLVD,,DALLAS,TX,75390-7201,United States,Medical Doctor,Allopathic & Osteopathic Physicians|Pediatrics...,TX,Philips North America LLC,100000000198,Philips North America LLC,MA,United States,Yes,Covered,Device,Medical Device,(7884) MRLinac,,18720.00,02/15/2024,In-kind items and services,No,No,WF-010421 Improved MR-guided catheter tracking...,Yes,1123505503,2024,06/30/2025,
1,ADD,Covered Recipient Teaching Hospital,,1 Medical Center Drive,Morgantown,WV,26506,United States,Covered Recipient Physician,20800.0,1.457563e+09,ASHKAN,,EMADI,22 S GREENE ST,S9D,BALTIMORE,MD,21201-1544,United States,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,MD,Servier BioInnovation,100001146889,Servier BioInnovation,MA,United States,Yes,Covered,Drug,Oncology,Asparlas,72694-515-01,54146.64,10/03/2024,In-kind items and services,No,No,CALASPARGASE PEGOL IN ADULTS WITH ALL,No,1153017203,2024,06/30/2025,
2,NEW,Non-covered Recipient Entity,St. Francis Medical Institute - Clinedge - PPDS,802 North Belcher Road,Clearwater,FL,33765,United States,Covered Recipient Physician,299971.0,1.629005e+09,Francis,J,Averill,804 N Belcher Rd,,Clearwater,FL,33765,United States,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,FL,"Insmed, Inc.",100000461811,"Insmed, Inc.",NJ,United States,Yes,Covered,Drug,Respiratory,Arikayce,71558-590-28,1380.60,04/26/2024,Cash or cash equivalent,No,No,"A Randomized, Double-Blind, Placebo-Controlled...",No,1067598064,2024,06/30/2025,NCT04677569
3,NEW,Non-covered Recipient Entity,St. Francis Medical Institute - Clinedge - PPDS,802 North Belcher Road,Clearwater,FL,33765,United States,Covered Recipient Physician,299971.0,1.629005e+09,Francis,J,Averill,804 N Belcher Rd,,Clearwater,FL,33765,United States,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,FL,"Insmed, Inc.",100000461811,"Insmed, Inc.",NJ,United States,No,,,,,,0.46,09/04/2024,Cash or cash equivalent,No,No,"A Randomized, Double-Blind, Placebo-Controlled...",No,1067598066,2024,06/30/2025,NCT04677569
4,NEW,Non-covered Recipient Entity,St. Francis Medical Institute - Clinedge - PPDS,802 North Belcher Road,Clearwater,FL,33765,United States,Covered Recipient Physician,299971.0,1.629005e+09,Francis,J,Averill,804 N Belcher Rd,,Clearwater,FL,33765,United States,Medical Doctor,Allopathic & Osteopathic Physicians|Internal M...,FL,"Insmed, Inc.",100000461811,"Insmed, Inc.",NJ,United States,No,,,,,,22962.78,08/21/2024,Cash or cash equivalent,No,No,"A Randomized, Double-Blind, Placebo-Controlled...",No,1067598068,2024,06/30/2025,NCT04677569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756188,ADD,Non-covered Recipient Entity,Fisher Scientific,300 Industry Drive,Pittsburgh,PA,15275,United States,Covered Recipient Physician,636083.0,1.588690e+09,MICHAEL,,SUMMERFIELD,5711 SARVIS AVE,SUITE402,RIVERDALE,MD,20737,United States,Medical Doctor,Allopathic & Osteopathic Physicians|Ophthalmology,DC,Glaukos Corporation,100000000102,Glaukos Corporation,CA,United States,Yes,Non-Covered,,,,,104.55,11/27/2024,Cash or cash equivalent,No,No,"A Randomized, Multicenter, Double-Masked, Vehi...",No,1151787151,2024,06/30/2025,
756189,ADD,Non-covered Recipient Entity,OCPC,PO Box 932721,Cleveland,OH,44193,United States,Covered Recipient Physician,636083.0,1.588690e+09,MICHAEL,,SUMMERFIELD,5711 SARVIS AVE,SUITE402,RIVERDALE,MD,20737,United States,Medical Doctor,Allopathic & Osteopathic Physicians|Ophthalmology,DC,Glaukos Corporation,100000000102,Glaukos Corporation,CA,United States,Yes,Non-Covered,,,,,348.24,11/13/2024,Cash or cash equivalent,No,No,"A Randomized, Multicenter, Double-Masked, Vehi...",No,1151787153,2024,06/30/2025,
756190,ADD,Non-covered Recipient Entity,"Amazon.com, Inc.",440 Terry Avenue North,Seattle,WA,98109,United States,Covered Recipient Physician,636083.0,1.588690e+09,MICHAEL,,SUMMERFIELD,5711 SARVIS AVE,SUITE402,RIVERDALE,MD,20737,United States,Medical Doctor,Allopathic & Osteopathic Physicians|Ophthalmology,DC,Glaukos Corporation,100000000102,Glaukos Corporation,CA,United States,Yes,Non-Covered,,,,,33.11,10/23/2024,Cash or cash equivalent,No,No,"A Randomized, Multicenter, Double-Masked, Vehi...",No,1151787155,2024,06/30/2025,
756191,ADD,Non-covered Recipient Entity,"Sherpa Clinical Packaging, LLC",6166 Nancy Ridge Road,San Diego,CA,92121,United States,Covered Recipient Physician,636083.0,1.588690e+09,MICHAEL,,SUMMERFIELD,5711 SARVIS AVE,SUITE402,RIVERDALE,MD,20737,United States,Medical Doctor,Allopathic & Osteopathic Physicians|Ophthalmology,DC,Glaukos Corporation,100000000102,Glaukos Corporation,CA,United States,Yes,Non-Covered,,,,,3372.63,06/26/2024,Cash or cash equivalent,No,No,"A Randomized, Multicenter, Double-Masked, Vehi...",No,1151787157,2024,06/30/2025,


## Dimensionality reduction

OK, but 300+ dimensions is too many... Let's reduce this to 2 dimensions and then plot it.

In [13]:
# Create the UMAP reducer with a fixed random_state.
# NOTE(review): the recorded output of this cell is
# "AttributeError: module 'umap' has no attribute 'UMAP'". Presumably the
# unrelated `umap` package is installed instead of `umap-learn`, or a local
# file named umap.py shadows the library - verify the environment.
reducer = umap.UMAP(random_state=23)

AttributeError: module 'umap' has no attribute 'UMAP'

In [None]:
# Fit the reducer on the cleaned data and keep the projected coordinates.
# NOTE(review): cleaned_df as displayed earlier still contains text columns
# and missing values; UMAP presumably needs a purely numeric matrix, so the
# frame likely has to be encoded/imputed first - confirm before running.
reduced_embeddings = reducer.fit_transform(cleaned_df)

# Show the shape of the projection, then let the notebook render the array
print(reduced_embeddings.shape)
reduced_embeddings

In [None]:
# Title for the plot: every page title, comma separated, followed by "..."
# when there are more than three of them.
# NOTE(review): `page_titles` is not defined in the visible cells, and the
# full list is joined even when "..." is appended - confirm both are intended.
topics = ", ".join(page_titles) + ("..." if len(page_titles) > 3 else "")
plot_title = f"Topics: {topics}"

# First and second column of the projection are the scatter coordinates
x, y = reduced_embeddings[:, 0], reduced_embeddings[:, 1]

plt.scatter(x, y)
plt.title(plot_title)
plt.show()

Does this shape look interesting?

## Clustering

Chances are, there's *some* shape to your data, but what are those data points? We can use clustering methods to cluster the data into meaningful groups, and then we'll plot again with some color.

In [None]:
# HDBSCAN clusterer created with the library's default parameters
clusterer = hdbscan.HDBSCAN()

In [None]:
# Cluster the projected points
clusterer.fit(reduced_embeddings)

# Distinct cluster labels as plain Python ints, in ascending order
labels = sorted({int(label) for label in clusterer.labels_})

print(labels)

Let's try plotting again, this time coloring the dots by the cluster.

In [None]:
# Same scatter plot as before, with each point coloured by its cluster label
plt.scatter(x, y, c=clusterer.labels_)
plt.title(plot_title)
plt.show()

Cool! But now what even are these clusters about? Remember, each dot represents **one sentence** from your text corpus. Let's randomly sample a few from each cluster to see what they're about.

In [None]:
# Group the sentences by cluster: one entry per label, in the order of `labels`.
# NOTE(review): `sentences` is not defined in the visible cells - presumably
# an array that supports boolean-mask indexing; confirm.
clusters = [sentences[clusterer.labels_ == label] for label in labels]

In [None]:
# randomly sample k sentences from each cluster
# (note: random.choices draws with replacement, so the same sentence
# might appear more than once, especially for small clusters)
k = 5

# Print each group under its real HDBSCAN label instead of its position in
# `clusters`: `labels` starts at -1 (the noise label) whenever noise points
# exist, so position and label would otherwise be off by one.
for label, cluster in zip(labels, clusters):
    print(f"Cluster {label}:")
    random_sentences = random.choices(cluster, k=k)
    for sentence in random_sentences:
        print(f"- {sentence}")
    print()

# Full content of the first group (lowest label)
print(clusters[0])

## Naming the clusters



In [None]:
# Characters that clean() turns into a space
_CLEAN_PUNCTUATION = str.maketrans({char: " " for char in '\n!?,"()'})


def clean(text):
    """Standardize the text

    Make lowercase, replace punctuation with spaces, fix spacing.

    Newlines and the characters ``! ? , " ( )`` become spaces. A period is
    removed only when it ends a word (it is followed by a space or by the end
    of the text), so periods inside a token such as "3.5" are kept. Runs of
    spaces are collapsed to one space; a single leading or trailing space may
    remain.

    Args:
        text (str): The text to standardize

    Returns:
        str: The cleaned up text
    """
    # Replace all other punctuation first, so that a period directly in front
    # of one of those characters is followed by a space when periods are handled
    text = text.lower().translate(_CLEAN_PUNCTUATION)

    # A period at the very end of the text has no following space to match on
    if text.endswith("."):
        text = text[:-1] + " "

    # Repeat until stable so that runs of periods ("...") go away completely
    while ". " in text:
        text = text.replace(". ", "  ")

    # Collapse every run of spaces into a single space
    while "  " in text:
        text = text.replace("  ", " ")

    return text

def tokenize(text):
    """Clean & tokenize the text

    Args:
        text (str): The text to tokenize

    Returns:
        list[str]: The tokenized text, as a list of str. Empty when the
        cleaned text contains no words (e.g. for an empty string).
    """
    # clean() collapses runs of spaces, so empty pieces can only come from a
    # leading/trailing space or from empty text; dropping them all also
    # covers the empty-text case, which used to raise IndexError.
    return [token for token in clean(text).split(" ") if token]

# Common words that are ignored when counting word frequencies
stop_words = [
    "the", "and", "is", "are", "of", "in", "a", "to", "as",
    "or", "such", "for", "at", "was", "that", "their", "can", "with",
]


def remove_stop_words(word_list):
    """Remove every stop word from a list of words, in place.

    Args:
        word_list (list[str]): The words to filter; this list is modified.

    Returns:
        list[str]: The same list object, without any entry of ``stop_words``.
    """
    # Slice assignment keeps the caller's list object while replacing its content
    word_list[:] = [word for word in word_list if word not in stop_words]
    return word_list

In [None]:
# find most common words in each cluster
from collections import Counter

# Pair each group with its real HDBSCAN label instead of its position in
# `clusters`: `labels` starts at -1 (the noise label) whenever noise points
# exist, so position and label would otherwise be off by one.
for label, cluster in zip(labels, clusters):
    word_counter = Counter()

    for sentence in cluster:
        tokens = tokenize(clean(sentence))
        word_counter.update(remove_stop_words(tokens))

    # The three most frequent words serve as the cluster's name; equally
    # frequent words stay in the order in which they were first seen
    top_words = [word for word, _ in word_counter.most_common(3)]
    print(f"Cluster {label} Name:")
    print(*top_words)
    print()
    