In [5]:
import pandas as pd
import numpy as np
import requests
import json
import time
import os 
import gzip
import timeit
import matplotlib.pyplot as plt
from loguru import logger
from sklearn.manifold import TSNE
import umap
#import umap.plot
from scripts.vectology_functions import embed_text, create_aaa_distances
import seaborn as sns

# Use seaborn's default theme for all figures
sns.set_theme()

# Directory that later cells write their artefacts into
output = 'output/disease-cat'

# Two-component t-SNE estimator reused by the plotting cell
tSNE = TSNE(n_components=2)

# Load the CCI table (skipping the first two lines of the file) and drop the
# literal single quotes that wrap the two column names used in this notebook.
cci_df = pd.read_csv(
    'data/CCI-ICD10CM-v2021-1/CCI_ICD10CM_v2021-1.csv',
    skiprows=2,
).rename(
    columns={
        "'ICD-10-CM CODE DESCRIPTION'": "ICD-10-CM CODE DESCRIPTION",
        "'CHRONIC INDICATOR'": "CHRONIC INDICATOR",
    }
)

# The indicator values carry the same single quotes; remove them
cci_df['CHRONIC INDICATOR'] = cci_df['CHRONIC INDICATOR'].str.replace("'", "")

    You can install these via pip using

    pip install umap-learn[plot]

    or via conda using

     conda install pandas matplotlib datashader bokeh holoviews colorcet scikit-image
    
  """


ImportError: umap.plot requires pandas matplotlib datashader bokeh holoviews scikit-image and colorcet to be installed

In [3]:
%%time

def make_vectors():
    """Embed every ICD-10-CM code description, caching the result on disk.

    Reads the notebook-level ``cci_df`` and ``output``. If
    ``{output}/vectors.pickle.gz`` already exists it is loaded and returned
    as-is (delete the file to force re-embedding). Otherwise the descriptions
    are sent to ``embed_text`` in batches of 20 using the ``BioSentVec`` model,
    and the resulting frame is pickled to that path before being returned.

    Returns:
        pandas.DataFrame: one row per embedded description with the columns
        ``name`` (description text), ``indicator`` (chronic indicator of the
        same row of ``cci_df``) and ``vector`` (the value returned by
        ``embed_text`` for that description).
    """
    f = f'{output}/vectors.pickle.gz'
    if os.path.exists(f):
        logger.info(f'{f} exists')
        # NOTE(review): unpickling can execute arbitrary code; only load a
        # cache file that this notebook wrote itself.
        res_df = pd.read_pickle(f)
    else:
        logger.info(f'\n{cci_df.head()}')
        logger.info(cci_df.shape)

        results = []
        query_list = list(cci_df['ICD-10-CM CODE DESCRIPTION'])
        indicator_list = list(cci_df['CHRONIC INDICATOR'])
        chunk = 20
        for i in range(0, len(query_list), chunk):
            if i % 1000 == 0:
                print(i)
            batch = query_list[i:i + chunk]
            batch_indicators = indicator_list[i:i + chunk]
            res = embed_text(textList=batch, model='BioSentVec')
            # Label each vector with its OWN description and indicator.
            # Indexing with the batch start `i` would give every vector in the
            # batch the name/indicator of the batch's first item.
            # Assumes embed_text returns one vector per input, in input
            # order -- TODO confirm against scripts.vectology_functions.
            for name, indicator, r in zip(batch, batch_indicators, res):
                results.append({'name': name, 'indicator': indicator, 'vector': r})
        logger.info(len(results))
        res_df = pd.DataFrame(results)
        # Make sure the cache directory exists so the embedding work is not
        # lost to a missing-directory error at write time.
        os.makedirs(os.path.dirname(f), exist_ok=True)
        res_df.to_pickle(f)
    return res_df
# Build the embeddings frame, or load it from the on-disk cache when one exists
res_df = make_vectors()

2021-04-12 22:18:35.278 | INFO     | __main__:make_vectors:4 - output/disease-cat/vectors.pickle.gz exists


CPU times: user 5.22 s, sys: 887 ms, total: 6.11 s
Wall time: 6.11 s


In [4]:
%%time

# Report how many embedded rows are available
logger.info(len(res_df))

# Only the first 100 vectors are passed to the distance helper
# (presumably an all-against-all distance matrix, going by the helper's
# name -- confirm in scripts.vectology_functions); the result is saved
# alongside the other outputs.
first_vectors = list(res_df['vector'].iloc[:100])
aaa = create_aaa_distances(first_vectors)
np.save(f'{output}/vectors-aaa.npy', arr=aaa)

2021-04-12 22:18:44.696 | INFO     | __main__:<module>:1 - 73205


Creating distances...
100
100
CPU times: user 20.9 ms, sys: 2.88 ms, total: 23.8 ms
Wall time: 21.5 ms


In [None]:
%%time

# tSNE
# Project the matrix `aaa` to two dimensions with the shared `tSNE` estimator
# and plot the points coloured by chronic indicator.

d = pd.DataFrame(aaa)
logger.info(d.head())
logger.info(d.shape)
# Missing entries are replaced with 1 before fitting
d = d.fillna(1)

# Keep exactly as many CCI rows as the matrix has rows, so the x/y columns
# line up however many vectors were used to build `aaa`.
# .copy() makes this an independent frame, so adding columns below does not
# write into a slice of the full table (SettingWithCopyWarning).
# NOTE(review): this rebinds `cci_df` to the truncated frame, as the cell
# always did; re-run the loading cell to get the full table back.
cci_df = cci_df.head(n=len(d)).copy()

# NOTE(review): `d` is passed as a feature matrix; if `aaa` holds pairwise
# distances, consider metric='precomputed' on the estimator -- confirm intent.
tSNE_result = tSNE.fit_transform(d)
cci_df['x'] = tSNE_result[:, 0]
cci_df['y'] = tSNE_result[:, 1]
logger.info(cci_df.head())
logger.info(cci_df.shape)

fig, ax = plt.subplots(figsize=(16, 16))
sns.scatterplot(x='x', y='y', data=cci_df, legend="full", hue="CHRONIC INDICATOR", ax=ax)
fig.savefig(f'{output}/tsne.pdf')

2021-04-12 22:21:24.301 | INFO     | __main__:<module>:6 -          0         1         2         3         4         5         6   \
0  0.000000  0.027015  0.567728  0.739140  0.739958  0.777459  0.716014   
1  0.027015  0.000000  0.571727  0.743126  0.733656  0.782226  0.716638   
2  0.567728  0.571727  0.000000  0.329781  0.633168  0.676889  0.647810   
3  0.739140  0.743126  0.329781  0.000000  0.398672  0.340710  0.383392   
4  0.739958  0.733656  0.633168  0.398672  0.000000  0.448165  0.228155   

         7         8         9   ...        90        91        92        93  \
0  0.731274  0.714448  0.708838  ...  0.908337  0.908717  0.881235  0.905757   
1  0.732058  0.714512  0.727792  ...  0.892908  0.890377  0.886932  0.907834   
2  0.652647  0.680688  0.648430  ...  0.835625  0.835591  0.781407  0.491576   
3  0.422022  0.447839  0.314620  ...  0.716952  0.724693  0.734874  0.464014   
4  0.279590  0.274474  0.408206  ...  0.551350  0.681834  0.718513  0.728980   

         

In [None]:
# umap
# Fit a UMAP model with default settings on `aaa`; `mapper` is bound to
# whatever fit() returns.
mapper = (
    umap.UMAP()
    .fit(X=aaa)
)
