In [1]:
import numpy as np
import umap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def visualize(embeddings: list,
              labels: list,
              sizes: list,
              color: list,
              title: str,
              n_components: int = 3,
              random_state: int = 42,
              metric: str = "cosine",
              n_neighbors: int = 5):
    """Reduce embeddings with UMAP and return an interactive Plotly scatter plot.

    Args:
        embeddings: Input vectors, one per point, passed to ``UMAP.fit_transform``.
        labels: Per-point labels; shown in the hover tooltip.
        sizes: Per-point values driving the marker size.
        color: Per-point values driving the marker color.
        title: Figure title.
        n_components: Number of UMAP output dimensions. Must be 2 (rendered with
            ``px.scatter``) or 3 (rendered with ``px.scatter_3d``).
        random_state: Seed forwarded to UMAP.
        metric: Distance metric forwarded to UMAP.
        n_neighbors: Neighborhood size forwarded to UMAP.

    Returns:
        The Plotly figure. It is not displayed here; the caller decides when to
        call ``fig.show()``.

    Raises:
        ValueError: If ``n_components`` is neither 2 nor 3.
    """
    # Validate before the heavy imports and the UMAP fit so that a bad call
    # fails immediately instead of after the (slow) dimensionality reduction.
    if n_components not in (2, 3):
        raise ValueError(
            f"n_components must be 2 or 3 to be plotted, got {n_components!r}"
        )

    # Imports stay function-local, as in the original cell, so the definition
    # does not depend on which other cells have been run.
    import plotly.express as px
    import umap
    import pandas as pd
    import numpy as np

    mapper = umap.UMAP(n_components=n_components,
                       random_state=random_state,
                       metric=metric,
                       n_neighbors=n_neighbors)
    reduced_embeddings = mapper.fit_transform(embeddings)

    # One column per reduced dimension: X, Y and (for 3-D only) Z.
    axis_names = ['X', 'Y', 'Z'][:n_components]
    df = pd.DataFrame(reduced_embeddings, columns=axis_names)
    df['Labels'] = labels
    df["Size"] = np.array(sizes)
    df["Color"] = color

    if n_components == 3:
        fig = px.scatter_3d(df, x='X', y='Y', z='Z', hover_data=['Labels'],
                            size='Size', color='Color')
    else:
        fig = px.scatter(df, x='X', y='Y', hover_data=['Labels'],
                         size='Size', color='Color')

    fig.update_layout(title=title, autosize=True)
    return fig


def main_viz(file_path, mode):
    """Load a cached ``.npz`` archive and display its embeddings as a UMAP plot.

    Args:
        file_path: Path to an ``.npz`` archive containing a ``"labels"`` array
            and an array stored under the key given by ``mode``.
        mode: Key of the embedding array inside the archive (e.g. ``"train"``).

    Returns:
        None. The figure is shown via ``fig.show()``.

    Points whose label contains the substring ``"anchor"`` are drawn with
    size 2 and share the single color group ``"anchor"``; every other point
    is drawn with size 1 and colored by its own label.
    """
    import numpy as np

    # Read each array exactly once and close the archive afterwards: an
    # NpzFile loads the array from disk on every ``mats[key]`` access and
    # keeps the underlying file open until it is closed.
    with np.load(file_path) as mats:
        labels = mats["labels"]
        embeddings = mats[mode]

    is_anchor = ["anchor" in lab for lab in labels]
    sizes = [2 if anchor else 1 for anchor in is_anchor]
    color = ["anchor" if anchor else lab for anchor, lab in zip(is_anchor, labels)]

    fig = visualize(embeddings, labels, sizes, color=color, title=file_path + mode)
    fig.show()

In [3]:
# Plot the "train" array of the "cluster_train + subsequenced" cache archive.
# NOTE(review): absolute, machine-specific path; will not resolve on other hosts.
main_viz(
    file_path="/home/pholur/dna2vec/tests/test_cache/clustering/cluster_train + subsequenced_5000_5.npz",
    mode="train",
)

In [4]:
# Plot the "train" array of the "cluster_select genes" cache archive.
# NOTE(review): absolute, machine-specific path; will not resolve on other hosts.
main_viz(
    file_path="/home/pholur/dna2vec/tests/test_cache/clustering/cluster_select genes_5000_-1.npz",
    mode="train",
)

# Notes:
# - Coding and non-coding regions could have different densities.
# - Clustering all the anchors: density of the region.
# - Dense vs. sparse regions.

In [5]:
# Plot the "train" array of the "cluster_select chimp 2a 2b" cache archive.
# NOTE(review): absolute, machine-specific path; will not resolve on other hosts.
main_viz(
    file_path="/home/pholur/dna2vec/tests/test_cache/clustering/cluster_select chimp 2a 2b_6000_-1.npz",
    mode="train",
)

In [7]:
# Plot the "train" array of the "cluster_from chromosome 3" cache archive.
# NOTE(review): absolute, machine-specific path; will not resolve on other hosts.
main_viz(
    file_path="/home/pholur/dna2vec/tests/test_cache/clustering/cluster_from chromosome 3_6000_-1.npz",
    mode="train",
)

In [None]:
'''
Showing figures
Train on the entire genome
Overlap during testing 
Accuracy and speed benchmarks / memory - speed
Fine alignment benchmarking
Better searching with edits, pangenome
Symphony - similarity between genomes
Train on human, test on gorilla genome
A replacement to BLAST


Main Point:
- Computational efficiency 
- Cool interpretations in semantic space
- Performance
- How many operations are done in a standard suffix tree?
- Encoding of the model orders of magnitude smaller than the classical
- Gorilla comparison?
'''

# Existing dataset >
# Edit distance (1)
# Targeted exploration of known repeats >
# Scale to other chromosomes >
# journal or conference >
# probes 