In [92]:
import pandas as pd
from sklearn.manifold import TSNE
import xlwings as xw
from sklearn.cluster import DBSCAN
from bioinfokit.visuz import cluster
import plotly.express as px
import math

In [93]:
app = xw.App()
book = xw.Book('DR Metformin Data 8192021 from santi for asutin group.xlsb')
sheet = book.sheets('Data for Analysis (2)')
df = sheet.range('A1').options(pd.DataFrame, expand='table').value
book.close()
app.kill()

In [94]:
df["HTN Class"].value_counts()

PreHTN     749
Stage 1    312
Normal     162
Stage 2     18
Name: HTN Class, dtype: int64

In [95]:
cleanup = {"HTN Class": {"PreHTN": 0, "Stage 1": 1, "Stage 2" : 2, "Normal": 3}}

In [96]:
df = df.dropna()

In [97]:
df['cohort'] = df['pdr'] + df['dme']

In [98]:
df2 = df[['a1c', 'Revised Type', 'Metformin Use (Y/N)', 'HTN Class', 'cohort',
       'Duration of DM']].copy()
df2 = df2.replace(cleanup)

In [122]:
df2['a1c'] = (df2['a1c'] - df2['a1c'].mean()) / df2['a1c'].std()
df2['Revised Type'] = (df2['Revised Type'] - df2['Revised Type'].mean()) / df2['Revised Type'].std()
df2['Metformin Use (Y/N)'] = (df2['Metformin Use (Y/N)'] - df2['Metformin Use (Y/N)'].mean()) / df2['Metformin Use (Y/N)'].std()
df2['HTN Class'] = (df2['HTN Class'] - df2['HTN Class'].mean()) / df2['HTN Class'].std()
df2['cohort'] = (df2['cohort'] - df2['cohort'].mean()) / df2['cohort'].std()
df2['Duration of DM'] = (df2['Duration of DM'] - df2['Duration of DM'].mean()) / df2['Duration of DM'].std()

In [123]:
Target = df.index.values

In [124]:
tsne_em = TSNE(n_components=2, perplexity=38.0, n_iter=5000, verbose=1).fit_transform(df2)

[t-SNE] Computing 115 nearest neighbors...
[t-SNE] Indexed 1221 samples in 0.005s...
[t-SNE] Computed neighbors for 1221 samples in 0.109s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1221
[t-SNE] Computed conditional probabilities for sample 1221 / 1221
[t-SNE] Mean sigma: 0.622025
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.799187
[t-SNE] KL divergence after 5000 iterations: 0.415905


In [125]:
#cluster.tsneplot(score=tsne_em)

In [126]:
#color_class = df['Duration of DM'].to_numpy()
#cluster.tsneplot(score=tsne_em, colorlist=color_class, legendpos='upper right', legendanchor=(1.15, 1) )

In [127]:
get_clusters = DBSCAN(eps=4, min_samples=10).fit_predict(tsne_em)

In [128]:
set(get_clusters)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

In [129]:
cluster.tsneplot(score=tsne_em, colorlist=get_clusters, 
    legendpos='upper right', legendanchor=(1.15, 1))

In [130]:
df_results = pd.DataFrame({"x":tsne_em[:,0], "y":tsne_em[:,1], "cluster":[int(k) for k in get_clusters], "ID" : Target, "HTN" : df['HTN Class'].values,
                           "Duration of DM" : df['Duration of DM'].values, "a1c":df['a1c'].values, "cohort" : df['cohort'].values, "Type" : df['Revised Type'].values})

In [135]:
fig = px.scatter(df_results, x="x", y="y", color="HTN", hover_data=["ID"])
fig.show()