# Clustering of patients
This notebook accompanies the paper "Hospital-wide Natural Language Processing summarising the health data of 1 million patients". Due to the sensitivity of the data used in the paper, we are unable to share it. The NLP dataset was produced using the [MedCAT library](https://github.com/CogStack/MedCAT).

More details of the data are available in the paper and are described in the [HDRUK Gateway](https://web.www.healthdatagateway.org/dataset/4e8d4fed-69d6-402c-bd0a-163c23d6b0ee).

In [None]:
import pickle
import pandas as pd
from collections import defaultdict
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

In [None]:
%matplotlib inline

In [None]:
# Tunable settings for the analysis.
min_count = 2 # minimum number of detections for a concept to be considered true at patient level
# NOTE(review): min_pts is not referenced in the cells shown in this notebook; the concept
# filter that builds cui_keep uses a literal 1000 instead - confirm which threshold is intended.
min_pts = 100 # minimum number of patients with a concept for it to be included in the embedding
subsample = True # whether to run analysis on a sample (True) or all (False) patients
sample_size = 100000 # size of sample in number of patients if subsample is true

In [None]:
# load your patient data
# a dict with { patient_id : { concept_id: number_of_mentions }}
# this cannot be distributed due to patient privacy
# NOTE: unpickling can execute arbitrary code, so only load a file you produced or trust.
with open('./path/to/data.pickle', 'rb') as f:
    counts = pickle.load(f)

In [None]:
# a file containing all disorder codes in SNOMED
# this cannot be distributed due to licensing
# NOTE(review): the placeholder path ends in .csv, yet the file is opened in binary mode and
# read with pickle.load, which expects a pickle stream rather than CSV text - confirm the
# real file format before running.
# The result is only used for membership tests (`x in disorder_cui`) in this notebook;
# presumably it is a collection of concept ids - verify.
with open('./path/to/disorders.csv', 'rb') as f:
    disorder_cui = pickle.load(f)

In [None]:
# Choose which patients to analyse: a random subset or the whole cohort.
all_pts = list(counts.keys())
if subsample:
    # Fixed seed so that a fresh "Restart & Run All" draws the same patients.
    rng = np.random.default_rng(0)
    # Drawing without replacement fails when more patients are requested than
    # exist, so cap the request at the cohort size.
    n_draw = min(sample_size, len(all_pts))
    sample_pts = rng.choice(all_pts, size=n_draw, replace=False)
else:
    sample_pts = all_pts

In [None]:
# Per-concept patient tally; defaultdict(int) makes a concept start at 0 the first time it is counted.
all_concepts = defaultdict(int)

In [None]:
# number of patients with each concept detected
# The tally is rebuilt from scratch inside this cell so that running the cell
# a second time does not add to totals left over from the previous run.
all_concepts = defaultdict(int)
for cnt in counts.values():
    for c, v in cnt.items():
        # a concept counts for a patient only if it was detected at least min_count times
        if v >= min_count:
            all_concepts[c] += 1

In [None]:
# Number of distinct concepts that reached min_count detections in at least one patient.
len(all_concepts) # 26842

# Prepare input data
## disease concepts only, 1k threshold

In [None]:
def embed_count(counts, order):
    """Lay out one patient's concept counts as a list following `order`.

    counts: mapping of concept id -> count for a single patient.
    order:  iterable of concept ids that fixes the output positions.

    Returns a list with one entry per id in `order`; ids missing from
    `counts` contribute 0.
    """
    lookup = counts.get
    return [lookup(concept, 0) for concept in order]

In [None]:
# Keep the concepts that were counted in at least 1000 patients and that also
# appear in disorder_cui. The 1000 is a literal here; min_pts is not consulted.
cui_keep = [
    cui
    for cui, n_patients in all_concepts.items()
    if n_patients >= 1000 and cui in disorder_cui
]
print(len(cui_keep)) #872

In [None]:
# One row per selected patient: that patient's counts laid out in cui_keep order.
embedded = [embed_count(counts[pt], cui_keep) for pt in sample_pts]

In [None]:
# Stack the per-patient rows into a (patients x concepts) matrix, then drop
# every patient whose row sums to zero (no detections for the kept concepts).
data = np.asarray(embedded)
data = data[np.sum(data, axis=1) != 0]

In [None]:
# Grand total of all counts left in the matrix after empty rows were removed.
data.sum() #3261699

In [None]:
# Row sums: total count per remaining patient across the kept concepts.
pt_totals = data.sum(axis=1)

In [None]:
# Number of patients left in the matrix (one total per row).
pt_totals.shape[0] # 87050

In [None]:
# Divide each patient's row by that patient's total so every row sums to 1.
data_norm = data / pt_totals[:, np.newaxis]

# PCA -> TSNE

In [None]:
# Reduce the normalised patient matrix to 50 principal components, then embed
# those components in 2-D with t-SNE.
per = 200  # t-SNE perplexity
it = 1000  # number of t-SNE optimisation iterations
metric = 'cosine'  # distance used by t-SNE on the PCA output
# Both estimators use randomness internally; a fixed seed keeps the embedding
# (and the clusters and figures built on it) the same across re-runs.
embed_seed = 0
pca = PCA(n_components=50, random_state=embed_seed)
data_pca_sample = pca.fit_transform(data_norm)
# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` argument to
# `max_iter`; confirm against the installed version before upgrading.
tsne_sample = TSNE(
    n_components=2,
    n_jobs=20,
    perplexity=per,
    n_iter=it,
    metric=metric,
    init='pca',
    learning_rate='auto',
    random_state=embed_seed,
).fit_transform(data_pca_sample)

# Clustering

In [None]:
# Number of clusters requested from the agglomerative clustering; also shown in the figure titles.
n_clusters = 50

In [None]:
# Connectivity graph linking every embedded point to its 100 nearest
# neighbours (the point itself excluded); it is handed to the clustering as
# its connectivity constraint.
knn_graph = kneighbors_graph(tsne_sample, n_neighbors=100, include_self=False)

# Ward-linkage agglomerative clustering of the 2-D embedding into n_clusters groups.
ward = AgglomerativeClustering(
    linkage="ward",
    connectivity=knn_graph,
    n_clusters=n_clusters,
)

ward.fit(tsne_sample)

# Figures

In [None]:
# basic scatter plot of the 2-D embedding (points are not coloured by cluster)

# 10 x 10 inch canvas, set when the figure is created
fig, axs = plt.subplots(figsize=(10, 10))
axs.scatter(x=tsne_sample[:, 0], y=tsne_sample[:, 1])
axs.set_title(f"Clusters = {n_clusters}")
# save through the figure object so the target does not depend on pyplot's "current figure"
fig.savefig('./path/to/figure.png')

In [None]:
# adding cluster labels: colour each point by its cluster and print the
# cluster number at the centroid of its points

fig, axs = plt.subplots(figsize=(10, 10))
axs.scatter(x=tsne_sample[:, 0], y=tsne_sample[:, 1], c=ward.labels_, cmap=plt.cm.nipy_spectral)
# white rounded box drawn behind each cluster number so it stays readable over the points
bbox_props = dict(boxstyle="round", fc="w", ec="0.5", alpha=0.9)

for x in range(n_clusters):
    # Vectorised boolean mask: one NumPy comparison per cluster instead of a
    # Python-level loop over every label.
    cl = tsne_sample[ward.labels_ == x, :]
    cen = cl.mean(axis=0)  # centroid of this cluster in the 2-D embedding
    axs.text(cen[0], cen[1], str(x), ha="center", va="center", size=20, bbox=bbox_props)
axs.set_title(f"Clusters = {n_clusters}")
# save through the figure object so the target does not depend on pyplot's "current figure"
fig.savefig('./path/to/figure_labelled.png')