### Loading data 

In [None]:
import numpy as np
import pandas as pd
# Write the current hf_copd_df3 frame to a CSV file in the working directory.
# NOTE(review): this section is titled "Loading data" but the cell only saves;
# hf_copd_df3 must already exist in the kernel — confirm where it is loaded.
hf_copd_df3.to_csv("hf_copd_df_topic_model.csv")

### Getting Average Visits, Number of Individuals, and Documents in each disease

In [212]:
# Mean of the "visits" column over every row of hf_copd_df3.
average_visits = hf_copd_df3.loc[:, "visits"].mean()
print("Average visits:", average_visits)

Average visits: 2.0216034171184494


In [213]:
# Number of rows per disease category.
disease_category_counts = hf_copd_df3['disease_category'].value_counts()
print(disease_category_counts)

fluid overload/heart failure    6173
copd/emphysema                  6001
Name: disease_category, dtype: int64


In [214]:
# Number of distinct patient_id values in the dataset (shown as the cell output).
n_unique_patients = hf_copd_df3['patient_id'].nunique()
n_unique_patients

7532

## Making Sentence embeddings

In [221]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode every 'reason_clean' text.
model_sentence = SentenceTransformer('all-mpnet-base-v2')

# Give the frame a clean 0..n-1 index so that row i of the embeddings lines up
# with row i of hf_copd_df3 in the cells that follow.
hf_copd_df3.reset_index(drop=True, inplace=True)

# encode() is documented to take a string or a list of strings, so hand it a
# plain list instead of a pandas Series. The texts are cast to str exactly as
# they are when passed to the topic model, so both steps see the same documents.
reason_texts = hf_copd_df3['reason_clean'].astype(str).tolist()

sentence_embeddings = model_sentence.encode(reason_texts, convert_to_tensor=True)


### Saving embeddings

In [222]:

# Convert the sentence embeddings to a NumPy array and save them to disk.
# The embeddings were requested with convert_to_tensor=True, so they may be a
# tensor living on a GPU; such a tensor cannot be converted to NumPy directly,
# so detach it and move it to the CPU first. Plain arrays pass through untouched.
embeddings_to_save = sentence_embeddings
if hasattr(embeddings_to_save, "detach"):
    embeddings_to_save = embeddings_to_save.detach().cpu()

sentence_embeddings_np = np.asarray(embeddings_to_save)

# Persist the array so later sessions can load it instead of re-encoding.
np.save('sentence_embeddings.npy', sentence_embeddings_np)


### Loading them for use

In [220]:
# Load the previously saved sentence embeddings from 'sentence_embeddings.npy'.
# NOTE(review): rows are assumed to be in the same order as hf_copd_df3 — confirm
# the frame has not been filtered or re-sorted since the file was written.
sentence_embeddings_np = np.load('sentence_embeddings.npy')

## Topic Model

In [415]:
import numpy as np
from bertopic import BERTopic
from scipy.sparse import csr_matrix
from umap import UMAP
# Seed NumPy's global RNG; UMAP below gets the same seed via random_state.
np.random.seed(1997)

# Only random_state is set here, so every other UMAP setting is UMAP's own default.
# NOTE(review): BERTopic's built-in UMAP configuration differs from UMAP's
# defaults — confirm that using the plain defaults is intended.
umap_model = UMAP(random_state=1997)

# calculate_probabilities=True is needed because the outlier-reduction cell
# later passes `probs` with strategy="probabilities".
topic_model = BERTopic(nr_topics=20, 
                       min_topic_size=25,
                       calculate_probabilities=True,
                       umap_model=umap_model)

# Fit on the cleaned reason texts, reusing the precomputed sentence embeddings
# (presumably one embedding row per document, in the same order — verify).
topics, probs = topic_model.fit_transform(hf_copd_df3['reason_clean'].astype(str), sentence_embeddings_np)


#### Reducing Outliers & Updating model with reduced outliers

In [416]:
# Reassign outlier documents using the per-topic probabilities from fit_transform.
new_topics = topic_model.reduce_outliers(
    hf_copd_df3['reason_clean'].astype(str),
    topics,
    probabilities=probs,
    strategy="probabilities",
)

In [417]:
# Refresh the model's topic representations with the outlier-reduced assignments.
topic_model.update_topics(
    hf_copd_df3['reason_clean'].astype(str),
    topics=new_topics,
)


### Mapping level 3 names to a new dataframe

#### First we need to see the words associated with each topic

In [None]:
# Per-topic summary table from the fitted model; its 'Topic' and 'Name' columns
# are used by the following cells.
topic_model_names = topic_model.get_topic_info()

#### Then we can map it

In [449]:
# Hand-written labels for each topic, keyed by the model's generated topic name
# (the 'Name' column of topic_model_names). Keys must match those names exactly;
# any name without an entry here maps to NaN.
name_mapping = {
    '0_pneumonia_of_cough_breath': 'Pneumonia & Respiratory Symptoms',
    '1_interval_change_pneumothorax_sp': 'Interval Change',
    '2_pna_edema_with_pulmonary': 'Pulmonary Edema',
    '3_pain_chest_with_abdominal': 'Chest & Abdominal Pain',
    '4_placement_tube_et_ett': 'Tube Placement',
    '5_pleural_effusion_effusions_for': 'Pleural Effusion',
    '6_dyspnea_exertion_acute_on': 'Acute Dyspnea',
    '7_stroke_hemorrhage_seizure_subarachnoid': 'Stroke, Hemorrhage, Seizure',
    '8_mental_altered_status_delirium': 'Altered Mental Status',
    '9_hypoxia_oxygen_process_and': 'Hypoxia and Oxygen Requirements',
    '10_pancreatitis_pancreatic_cancer_volume': 'Pancreatitis & Pancreatic Cancer',
    '11_picc_line_placement_ij': 'PICC Line Placement',
    '12_atrial_fibrillation_palpitations_syncope': 'Atrial Fibrillation',
    '13_lead_pacemaker_leads_pp': 'Pacemaker Lead Placement',
    # NOTE(review): key begins with 'ever' while the label is 'Fever' — confirm
    # the key is the exact generated topic name and not a typo for 'fever'.
    '14_ever_pneumonia_for_and': 'Fever',
    '15_esophageal_esophagectomy_interval_sp': 'Esophageal Cancer',
    '16_weakness_confusion_generalized_with': 'Weakness & Confusion',
    '17_infiltrate_eval_wrising_as': 'Infiltrate Evaluation',
    '18_dizziness_lightheadedness_weakness_headache': 'Dizziness Assessment'
}

# Add the level-3 labels as a new 'level3_names' column, looked up from 'Name'.
topic_model_names['level3_names'] = topic_model_names['Name'].map(name_mapping)


## Making a hierarchical topic model & accompanying dataframe

In [450]:
# 'Topic' becomes 'level3' so it can serve as the join key for the hierarchy table.
topic_model_names2 = topic_model_names.rename(columns={'Topic': 'level3'})

# Keep just the topic id and its label: the 'level3' and 'level3_names' columns.
topic_model_names3 = topic_model_names2.loc[:, ['level3', 'level3_names']]

### Using single linkage function

In [421]:
from scipy.cluster import hierarchy as sch

def linkage_function(x):
    """Cluster `x` with SciPy single linkage and optimal leaf ordering."""
    return sch.linkage(x, 'single', optimal_ordering=True)


# Build the topic hierarchy from the cleaned reason texts with that linkage.
hierarchical_topics = topic_model.hierarchical_topics(
    hf_copd_df3['reason_clean'].astype(str),
    linkage_function=linkage_function,
)

100%|██████████| 17/17 [00:00<00:00, 100.45it/s]


### Visualizing the hierarchical structure

In [422]:
# Render the hierarchy computed in the previous cell; the figure is the cell output.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

### Generating new topic structure

In [None]:
import pandas as pd

# Hand-built hierarchy. Each entry of data['level2'] lists the level-3 topic ids
# that make up one level-2 group; the entry at the same position in
# data['level1'] is the level-1 group that level-2 group belongs to.
data = {
    'level1': [
        '1', '1', '1', '2', '2'
    ],
    'level2': [
        [17, 10, 5, 15],  # level-2 group 1
        [9, 6, 12, 3],  # level-2 group 2
        [0, 2, 1, 7],  # level-2 group 3
        [11, 4, 13],  # level-2 group 4
        [14, 8, 18, 16],  # level-2 group 5
    ]
}

# One record per level-3 topic. The level-2 id is the 1-based position of the
# group inside data['level2'].
records = [
    {'level1': level1, 'level2': level2, 'level3': level3}
    for level2, (level1, level3_topics) in enumerate(zip(data['level1'], data['level2']), start=1)
    for level3 in level3_topics
]

# Build the frame in a single call: DataFrame.append was removed in pandas 2.0,
# and appending row by row copies the whole frame on every iteration.
# astype(int) also converts the level-1 ids, which are stored as strings above.
df = pd.DataFrame(records, columns=['level1', 'level2', 'level3']).astype(int)


#### Merging it with the level 3 names

In [452]:
# Attach the level-3 labels to the hierarchy by joining on the shared 'level3' id.
merged_df = df.merge(topic_model_names3, on='level3')

    level1  level2  level3                      level3_names
0        1       1      17             Infiltrate Evaluation
1        1       1      10  Pancreatitis & Pancreatic Cancer
2        1       1       5                  Pleural Effusion
3        1       1      15                 Esophageal Cancer
4        1       2       9   Hypoxia and Oxygen Requirements
5        1       2       6                     Acute Dyspnea
6        1       2      12               Atrial Fibrillation
7        1       2       3            Chest & Abdominal Pain
8        1       3       0  Pneumonia & Respiratory Symptoms
9        1       3       2                   Pulmonary Edema
10       1       3       1                   Interval Change
11       1       3       7       Stroke, Hemorrhage, Seizure
12       2       4      11               PICC Line Placement
13       2       4       4                    Tube Placement
14       2       4      13          Pacemaker Lead Placement
15       2       5      

#### Naming level 1 and level 2 hierarchy

In [453]:
# Human-readable labels for the level-1 and level-2 group ids.
level1_names = {
    1: "Respiratory and Thoracic Conditions",
    2: "Critical Care"
}

# NOTE(review): 'Thoracic Disorders ' has a trailing space and 'Neurolical'
# looks like a typo for 'Neurological'. Both are left unchanged because these
# strings are written to the saved CSV — confirm before correcting.
level2_names = {
    1: 'Thoracic Disorders ',
    2: 'Respiratory & Cardiovascular Health',
    3: 'Pulmonary & Neurovascular Health',
    4: 'Catheter & Lead Placement',
    5: 'Neurolical Symptoms'
}

# Look the labels up from merged_df's own id columns. Mapping from `df` instead
# only works while both frames share identical row order and index, which stops
# being true as soon as the merge drops or reorders a row.
merged_df['level2_names'] = merged_df['level2'].map(level2_names)
merged_df['level1_names'] = merged_df['level1'].map(level1_names)


### Saving the hierarchical structure

In [454]:
# Save the labelled hierarchy table as CSV under the relative ../../Data directory.
merged_df.to_csv("../../Data/hierarchical_structure_topic_model.csv")

## Adding topics to original dataframe

In [455]:
# Per-document topic information for the cleaned reason texts; the next cell
# reads its 'Topic', 'Probability', 'Representative_document' and 'Name' columns.
df_topic_model = topic_model.get_document_info(hf_copd_df3['reason_clean'].astype(str))

In [456]:
# Copy the per-document topic results onto the original frame.
# df_topic_model is expected to hold one row per document, in the order the
# documents were passed in, so values are assigned by position (.to_numpy()).
# Index-aligned assignment would silently produce NaN or mismatched rows
# whenever hf_copd_df3 does not carry a plain 0..n-1 index.
if len(df_topic_model) != len(hf_copd_df3):
    raise ValueError(
        "df_topic_model has %d rows but hf_copd_df3 has %d; "
        "cannot attach topics row by row" % (len(df_topic_model), len(hf_copd_df3))
    )

# Topic id assigned to each document.
hf_copd_df3['topic'] = df_topic_model['Topic'].to_numpy()

# Probability reported for that assignment.
hf_copd_df3['topic_probability'] = df_topic_model['Probability'].to_numpy()

hf_copd_df3['Representative_document'] = df_topic_model['Representative_document'].to_numpy()

# Hand-written label for the topic; names missing from name_mapping become NaN.
hf_copd_df3['topic_name'] = df_topic_model['Name'].map(name_mapping).to_numpy()

### Saving new dataframe with topic names

In [457]:
# Save the frame, now carrying the topic columns, as CSV in the working directory.
hf_copd_df3.to_csv("topic_model_dataset.csv")