In [29]:
import pandas as pd
from sklearn.manifold import TSNE
import xlwings as xw
from sklearn.cluster import DBSCAN
from bioinfokit.visuz import cluster
import plotly.express as px
import math
import numbers
import datetime
from pandas.api.types import is_numeric_dtype

class TsnePlot:
    path = ""
    shapeCol = ""
    colorCol = ""
    df = []
    
    def __init__(self, newPath):
        self.path = newPath
        app = xw.App()
        book = xw.Book(self.path)
        sheet = book.sheets('Master corrected variables')
        self.df = sheet.range('A1').options(pd.DataFrame, expand='table').value
        book.close()
        app.kill()
    
    def setOutputColumns(self, sCol, cCol):
        self.shapeCol = sCol
        self.colorCol = cCol
    
    def getColumns(self):
        return list(self.df)
    
    def standardize(self, dicts):
        df2 = self.df

        i = 0
        for item in dicts:
            if not is_numeric_dtype(self.df[item]):
                replacement = {item: dicts[item]}
                df2 = df2.replace(replacement)
            i = i + 1


        for column in df2:
            df2[column] = df2[column].fillna(0)
            if not isinstance(df2[column][0],datetime.datetime):
                if df2[column].std() != 0:
                    df2[column] = (df2[column] - df2[column].mean()) / df2[column].std()
        return df2
                    
    def getDicts(self, columns):
        columnNumber = 0
        dicts = {}
        for column in self.df:
            i = 1
            thisDict = {}
            uniques = self.df[column].unique()
            for item in uniques:
                if not item in thisDict:
                    if isinstance(item, numbers.Number):
                        thisDict[item] = item
                    else:
                        if i in uniques:
                            j = i
                            while j not in uniques:
                                j = j + 1
                            thisDict[item] = j
                        else:
                            thisDict[item] = i
                            i = i + 1
            dicts[columns[columnNumber]] =  thisDict
            columnNumber = columnNumber + 1
        return dicts

    def getPlot(self):
        ID = self.df.index.values
        self.df = self.df.fillna('')
        self.df = self.df.loc[:,~self.df.columns.duplicated()]
        columns = list(self.df)

        dicts = self.getDicts(columns)

        df2 = self.standardize(dicts)


        tsne_em = TSNE(n_components=2, perplexity=38.0, n_iter=5000, verbose=1).fit_transform(df2)

        df_results = self.df
        df_results['x'] = tsne_em[:,0]
        df_results['y'] = tsne_em[:,1]
        df_results['ID'] = ID
        for row in df_results:
            df_results[row] = df_results[row].astype(float, errors = 'ignore')
        df_results['dob'] = df_results['dob'].astype(object)

        df_results.drop(df_results.loc[df_results['DM']=='null'].index, inplace=True)

        df_results = df_results.apply(pd.to_numeric, errors='ignore')

        fig = px.scatter(df_results, x="x", y="y", color=self.colorCol, symbol=self.shapeCol, hover_data=["ID"])
        fig.update_traces(marker={'size': 7, 'line' : {'color' : 'rgba(0, 0, 0, 0.5)',
                                           'width' : 1}})

        fig.layout.legend.y = 1.05
        fig.layout.legend.x = 1.035
        fig.layout.coloraxis.colorbar.y = 0.35

        fig.show()
        return fig

In [30]:
algoClass = TsnePlot('DR Metformin Data 8192021 from santi for asutin group.xlsb')
algoClass.setOutputColumns("DM" , "DM Duration")
plot = algoClass.getPlot()


elementwise comparison failed; this will raise an error in the future.



[t-SNE] Computing 115 nearest neighbors...
[t-SNE] Indexed 675 samples in 0.000s...
[t-SNE] Computed neighbors for 675 samples in 0.047s...
[t-SNE] Computed conditional probabilities for sample 675 / 675
[t-SNE] Mean sigma: 1.501530
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.774261
[t-SNE] KL divergence after 2550 iterations: 0.567809


In [31]:
print(algoClass.getColumns())

['age', 'gender', 'race', 'ethnicity', 'Final Cohort', 'DR', 'PDR', 'DME', 'VTDR', 'DM', 'Renal ', 'Neural', 'HLD', 'Obesity', 'HTN', 'Tobacco', 'Original Cohort', 'Oldest Age', 'DM Duration', 'dob', 'deceased', 'year decease', 'Eye Exam ', 'Duration', 'x', 'y', 'ID']
