In [8]:
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

Load Dataset from Hugging Face

In [3]:
# Pull the training split of the MedText corpus from the Hugging Face Hub
dataset = load_dataset(
    "BI55/MedText",
    split="train",
)
dataset

Dataset({
    features: ['Prompt', 'Completion'],
    num_rows: 1412
})

In [4]:
def format_text(text, line_length=15):
    """Wrap ``text`` so that every output line holds at most ``line_length`` words.

    Words are delimited by any run of whitespace, so repeated spaces do not
    produce empty "words" that would count toward the line length. Lines are
    joined with a single newline and carry no trailing whitespace.

    Args:
        text: The string to wrap.
        line_length: Maximum number of words per line; must be >= 1.

    Returns:
        The wrapped string. An empty or whitespace-only ``text`` yields ``''``.

    Raises:
        ValueError: If ``line_length`` is smaller than 1.
    """
    if line_length < 1:
        raise ValueError("line_length must be a positive integer")

    words = text.split()
    # Slice the word list into consecutive chunks of `line_length` words;
    # joining each chunk avoids the dangling space a running concatenation leaves.
    lines = [
        ' '.join(words[start:start + line_length])
        for start in range(0, len(words), line_length)
    ]
    return '\n'.join(lines)

In [5]:
# Preview the first record, wrapped for readability.
# Index the row first: dataset[0] reads a single example, whereas
# dataset['Prompt'][0] may go through the entire column to fetch one value.
first_example = dataset[0]

print('Prompt\n')
print(format_text(first_example['Prompt']))
print('\nCompletion\n')
print(format_text(first_example['Completion']))

Prompt

A 50-year-old male presents with a history of recurrent kidney stones and osteopenia. He has 
been taking high-dose vitamin D supplements due to a previous diagnosis of vitamin D deficiency. 
Laboratory results reveal hypercalcemia and hypercalciuria. What is the likely diagnosis, and what is the 
treatment? 

Completition

This patient's history of recurrent kidney stones, osteopenia, and high-dose vitamin D supplementation, along with 
laboratory findings of hypercalcemia and hypercalciuria, suggest the possibility of vitamin D toxicity. Excessive intake 
of vitamin D can cause increased absorption of calcium from the gut, leading to hypercalcemia 
and hypercalciuria, which can result in kidney stones and bone loss. Treatment would involve stopping 
the vitamin D supplementation and potentially providing intravenous fluids and loop diuretics to promote the 
excretion of calcium. 


Topic Modeling using Latent Semantic Analysis (LSA)

In [9]:
# Merge each prompt with its completion into one document per record
texts = [
    " ".join((prompt, completion))
    for prompt, completion in zip(dataset['Prompt'], dataset['Completion'])
]
len(texts)

1412

In [10]:
# LSA = TF-IDF weighting followed by a truncated SVD of the term-document matrix.
# make_pipeline names each step after its lower-cased class name.
tfidf_step = TfidfVectorizer(stop_words='english')
svd_step = TruncatedSVD(n_components=5, random_state=42)
lsa_pipeline = make_pipeline(tfidf_step, svd_step)

# Fit both steps on the corpus and project every document onto the 5 components
lsa_output = lsa_pipeline.fit_transform(texts)

In [11]:
def get_top_terms(model, feature_names, n_top_words):
    """Return the highest-weighted terms for every component of ``model``.

    Args:
        model: Object exposing ``components_``, an iterable of per-component
            weight vectors that support ``argsort()`` (e.g. NumPy arrays).
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to keep per component; must be >= 0.
            Values larger than the vocabulary return every term.

    Returns:
        A list with one entry per component; each entry is a list of terms
        ordered from largest to smallest weight. Ranking uses the signed
        weight, so strongly negative loadings are not treated as "top".

    Raises:
        ValueError: If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        raise ValueError("n_top_words must be non-negative")

    topics = []
    for weights in model.components_:
        # argsort is ascending; reverse it so the largest weights come first,
        # then keep only the requested number of indices.
        top_indices = weights.argsort()[::-1][:n_top_words]
        topics.append([feature_names[i] for i in top_indices])
    return topics

# Retrieve the fitted steps from the pipeline by their auto-generated names
fitted_steps = lsa_pipeline.named_steps
vectorizer, svd_model = fitted_steps['tfidfvectorizer'], fitted_steps['truncatedsvd']

# Vocabulary learned by the vectorizer, in column order
feature_names = vectorizer.get_feature_names_out()
# Keep the 10 highest-weighted terms for every SVD component
topics = get_top_terms(svd_model, feature_names, 10)

In [12]:
# One line per topic, numbered from 1, showing only the five leading terms
for topic_number, terms in enumerate(topics, start=1):
    leading_terms = ', '.join(terms[:5])
    print(f"Topic {topic_number}: {leading_terms}")

Topic 1: pain, symptoms, steps, patient, diagnosis
Topic 2: iron, anemia, thyroid, fatigue, deficiency
Topic 3: chest, blood, heart, ct, liver
Topic 4: pain, fracture, bowel, weight, knee
Topic 5: iron, anemia, deficiency, infection, count
