In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

In [2]:
print("Current Working Directory:", os.getcwd())

Current Working Directory: C:\Users\Gebruiker\Documents\Pubmed_Data_Analysis


In [3]:
file_path = "C:/Users/Gebruiker/Documents/Pubmed_Data_Analysis/Data/API/vitamin_d_results_api_20250214_200006.csv"
df = pd.read_csv(file_path)
df['Abstract'] = df['Abstract'].fillna("No abstract available")
df['Keywords'] = df['Keywords'].fillna("No MESH terms available")
df.head()

Unnamed: 0,PMID,Title,Abstract,Authors,Journal,Keywords,URL,Affiliations
0,39946202,Comparison of effect and mechanism between nal...,Pruritus in hemodialysis patients (HDP) is one...,"Choi Sooyeon, Shin Dong Hui, Kim Jae-Seok, Lee...",Annals of medicine,"Humans, Pruritus, Renal Dialysis, Ultraviolet ...",https://www.ncbi.nlm.nih.gov/pubmed/39946202,"Department of Dermatology, Yonsei University, ..."
1,39940457,Vitamin D Deficiency Meets Hill's Criteria for...,Clinical trials consistently demonstrate an in...,Wimalawansa Sunil J,Nutrients,"Humans, COVID-19, Vitamin D Deficiency, Vitami...",https://www.ncbi.nlm.nih.gov/pubmed/39940457,"Endocrinology and Human Nutrition, CardioMetab..."
2,39940396,The Immunomodulatory Activity of High Doses of...,Vitamin D receptor [VDR] expression promotes L...,"Gonçalves Ana Moura, Velho Sónia, Rodrigues Bá...",Nutrients,"Humans, Male, Female, Middle Aged, COVID-19, V...",https://www.ncbi.nlm.nih.gov/pubmed/39940396,"Intensive Care Medicine Department, Hospital B..."
3,39940277,Effects of a Novel Dispersible Supplement Cont...,Vitamins D and B12 play a crucial role in main...,"Angelopoulos Nikolaos, Paparodis Rodis D, Andr...",Nutrients,"Humans, Vitamin D, Middle Aged, Dietary Supple...",https://www.ncbi.nlm.nih.gov/pubmed/39940277,"Hellenic Endocrine Network, Ermou 6 Str., 1056..."
4,39940233,Food Group Consumption and Nutrient Intake by ...,Optimal nutrition is essential for the health ...,"Jin Ying, Coad Jane, Brough Louise",Nutrients,"Humans, Female, Breast Feeding, Adult, New Zea...",https://www.ncbi.nlm.nih.gov/pubmed/39940233,"School of Health Sciences, College of Health, ..."


In [4]:
vectorizer_abstracts = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
vectorizer_titles = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
vectorizer_keywords = TfidfVectorizer(stop_words='english', ngram_range=(1,2))

vectorized_abstracts = vectorizer_abstracts.fit_transform(df['Abstract'])
vectorized_titles = vectorizer_titles.fit_transform(df['Title'])
vectorized_keywords = vectorizer_keywords.fit_transform(df['Keywords'])

In [5]:
# Elbow method

In [6]:
# Elbow method on abstracts
k_values = range(2, 22)  
inertia = []

for k in tqdm(k_values, desc="Running KMeans", unit="cluster"):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(vectorized_abstracts)
    inertia.append(kmeans.inertia_)  # Store inertia (sum of squared distances)

# Step 3: Plot the Elbow Curve
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, marker='o', linestyle='-', color='b')
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia (Sum of Squared Distances)")
plt.title("Elbow Method for Optimal K")
plt.xticks(k_values)
plt.grid(True)
plt.show()

Running KMeans:  70%|███████████████████████████████████████▏                | 14/20 [3:19:59<1:25:42, 857.09s/cluster]

KeyboardInterrupt



In [49]:
# Get top 5 keywords for abstract clusters
# Apply K-Means
kmeans_abstracts = KMeans(n_clusters=20, random_state=42, n_init=10)
kmeans_abstracts.fit(vectorized_abstracts)

feature_names_abstracts = vectorizer_abstracts.get_feature_names_out()

# Get top 5 words per cluster
num_words = 5
for i, centroid in enumerate(kmeans_abstracts.cluster_centers_):
    top_indices = centroid.argsort()[-num_words:][::-1]
    top_words = [feature_names_abstracts[idx] for idx in top_indices]
    print(f"Cluster {i}: {', '.join(top_words)}")

Cluster 0: ohd, 25 ohd, 25, ohd levels, 25 ohd levels
Cluster 1: vitamin, children, patients, studies, study
Cluster 2: cinacalcet, fgf23, pth, patients, phosphate
Cluster 3: abstract available, abstract, available, 00 0001 methodological, 00 0003
Cluster 4: bmd, bone, bone mineral, spine, mineral
Cluster 5: ckd, patients, paricalcitol, kidney, fgf
Cluster 6: sup, sup sup, vitamin, studies, 95
Cluster 7: insulin, glucose, vitamin, insulin resistance, ir
Cluster 8: oh, d3, 25 oh, oh d3, 25
Cluster 9: muscle, strength, muscle strength, exercise, physical
Cluster 10: sub, sub sub, vitamin sub sub, vitamin sub, oh sub
Cluster 11: cancer, intake, dietary, breast, risk
Cluster 12: oh, 25 oh, 25, vitamin, serum 25
Cluster 13: covid, covid 19, 19, patients, vitamin
Cluster 14: vitamin, group, placebo, iu, supplementation
Cluster 15: vitamin, studies, deficiency, vitamin deficiency, levels
Cluster 16: vdr, polymorphisms, polymorphism, bsmi, foki
Cluster 17: patients, calcium, fracture, vitamin,

In [45]:
# Assign clusters to all research paper
# Get the cluster labels for each paper
cluster_labels = kmeans_abstracts.predict(vectorized_abstracts)

# Now you have the cluster labels for each paper
# For example, printing the cluster assignment for each paper
for idx, label in enumerate(cluster_labels):
    print(f"Paper {idx} is assigned to Cluster {label}")


Paper 0 is assigned to Cluster 0
Paper 1 is assigned to Cluster 4
Paper 2 is assigned to Cluster 0
Paper 3 is assigned to Cluster 1
Paper 4 is assigned to Cluster 19
Paper 5 is assigned to Cluster 0
Paper 6 is assigned to Cluster 6
Paper 7 is assigned to Cluster 0
Paper 8 is assigned to Cluster 0
Paper 9 is assigned to Cluster 0
Paper 10 is assigned to Cluster 9
Paper 11 is assigned to Cluster 19
Paper 12 is assigned to Cluster 12
Paper 13 is assigned to Cluster 0
Paper 14 is assigned to Cluster 0
Paper 15 is assigned to Cluster 0
Paper 16 is assigned to Cluster 1
Paper 17 is assigned to Cluster 0
Paper 18 is assigned to Cluster 0
Paper 19 is assigned to Cluster 1
Paper 20 is assigned to Cluster 0
Paper 21 is assigned to Cluster 18
Paper 22 is assigned to Cluster 6
Paper 23 is assigned to Cluster 1
Paper 24 is assigned to Cluster 3
Paper 25 is assigned to Cluster 6
Paper 26 is assigned to Cluster 0
Paper 27 is assigned to Cluster 0
Paper 28 is assigned to Cluster 0
Paper 29 is assigned

In [None]:
# Elbow method on titles
k_values2 = range(2, 22)  
inertia2 = []

for k in tqdm(k_values2, desc="Running KMeans", unit="cluster"):
    kmeans_titles = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans_titles.fit(vectorized_titles)
    inertia2.append(kmeans_titles.inertia_)  # Store inertia (sum of squared distances)

# Step 3: Plot the Elbow Curve
plt.figure(figsize=(8, 5))
plt.plot(k_values2, inertia2, marker='o', linestyle='-', color='b')
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia (Sum of Squared Distances)")
plt.title("Elbow Method for Optimal K (titles)")
plt.xticks(k_values2)
plt.grid(True)
plt.show()

In [None]:
# Get top 5 keywords for title clusters
# Apply K-Means
kmeans_titles = KMeans(n_clusters=20, random_state=42, n_init=10)
kmeans_titles.fit(vectorized_titles)

feature_names_titles = vectorizer_titles.get_feature_names_out()

# Get top 5 words per cluster
num_words = 5
for i, centroid in enumerate(kmeans_titles.cluster_centers_):
    top_indices = centroid.argsort()[-num_words:][::-1]
    top_words = [feature_names_titles[idx] for idx in top_indices]
    print(f"Cluster {i}: {', '.join(top_words)}")


In [None]:
#Elbow method on MESH terms
k_values3 = range(2, 22)  
inertia3 = []

for k in tqdm(k_values3, desc="Running KMeans", unit="cluster"):
    kmeans_keywords = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans_keywords.fit(vectorized_keywords)
    inertia3.append(kmeans.inertia_)  # Store inertia (sum of squared distances)

# Step 3: Plot the Elbow Curve
plt.figure(figsize=(8, 5))
plt.plot(k_values3, inertia3, marker='o', linestyle='-', color='b')
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia (Sum of Squared Distances)")
plt.title("Elbow Method for Optimal K (MESH terms)")
plt.xticks(k_values3)
plt.grid(True)
plt.show()

In [None]:
# Get top 5 keywords for MESH term clusters
# Apply K-Means
kmeans_keywords = KMeans(n_clusters=20, random_state=42, n_init=10)
kmeans_keywords.fit(vectorized_keywords)

feature_names_keywords = vectorizer_keywords.get_feature_names_out()

# Get top 5 words per cluster
num_words = 5
for i, centroid in enumerate(kmeans_keywords.cluster_centers_):
    top_indices = centroid.argsort()[-num_words:][::-1]
    top_words = [feature_names_keywords[idx] for idx in top_indices]
    print(f"Cluster {i}: {', '.join(top_words)}")

