In [1]:
from umap import UMAP
import plotly.express as px
import pandas as pd
import h5py
import kaleido

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to your h5 file
h5_file_path = '/Users/julius/Gobi/Final_results/UMAP/Final_embeddings_withhumans.h5'

# Initialize an empty list to store the embeddings and their names
embeddings_list = []
names_list = []

# Open the h5 file and iterate over each dataset
with h5py.File(h5_file_path, 'r') as h5_file:
    for name in h5_file:
        # Read the current embedding
        current_embedding = h5_file[name][:]#.reshape(-1) 
        print(current_embedding)
        # Append the embedding and its name to the lists
        embeddings_list.append(current_embedding)
        names_list.append(name)

# Convert the list of embeddings into a DataFrame
# If embeddings are 1D, this will create a DataFrame where each row is an embedding
embeddings_df = pd.DataFrame(embeddings_list)

# Add the names as a new column or set them as the index
embeddings_df['name'] = names_list  # As a new column
# embeddings_df.index = names_list  # Or set as the index

embeddings_df.head()  # Display the first few rows of the DataFrame

[ 0.01946249 -0.01578297  0.01494813 ...  0.00400508  0.03246194
  0.00652283]
[-0.00860984 -0.02216995  0.02207009 ...  0.00152256  0.05491656
  0.02764264]
[-0.00607203 -0.03189671  0.01638917 ...  0.00564662  0.04674073
  0.02070883]
[ 0.01988445 -0.04099694 -0.01255231 ...  0.00575879  0.03103622
  0.01572775]
[-3.8975612e-03 -2.6136126e-02  2.3674501e-02 ...  2.7329081e-05
  1.8414490e-02  5.0153430e-03]
[-0.00243    -0.02553847 -0.00477982 ... -0.00429879  0.03813627
  0.01442827]
[ 0.00360411 -0.02171046  0.01908977 ...  0.00058963  0.04308593
  0.01182402]
[ 0.02011294 -0.03229322 -0.01789315 ...  0.01741026  0.02141275
  0.00929396]
[-0.00261895 -0.02158622  0.01830859 ... -0.00395294  0.03061792
  0.01450235]
[0.02532774 0.00115199 0.00500673 ... 0.00700242 0.01060579 0.01259328]
[-0.01484244 -0.04088761  0.01689627 ... -0.00581502  0.04533565
  0.01808536]
[ 0.01277238  0.03743351  0.05188111 ...  0.00811023 -0.00597477
  0.02707446]
[ 0.00550779 -0.04224633 -0.02104675 ... 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,name
0,0.019462,-0.015783,0.014948,-0.003455,0.037543,0.043632,-0.018803,-0.05457,0.012157,-0.012823,...,-0.015313,0.009878,-0.005597,0.047736,0.012398,-0.031391,0.004005,0.032462,0.006523,Anopheles gambiae|Timeless|XP_061517387_1
1,-0.00861,-0.02217,0.02207,0.023097,0.033049,0.016864,-0.029285,-0.098811,0.018898,-0.001906,...,-0.021229,0.026254,0.02518,-0.00283,0.009956,-0.023883,0.001523,0.054917,0.027643,Anopheles gambiae|Timeout|XP_061513618_1
2,-0.006072,-0.031897,0.016389,0.016768,0.02146,0.030698,-0.037818,-0.098392,0.016611,-0.00025,...,-0.014677,0.029919,0.022509,0.032172,0.017781,-0.018412,0.005647,0.046741,0.020709,Apis mellifera|Timeout|XP_006565495_2
3,0.019884,-0.040997,-0.012552,0.015286,0.025412,0.049742,-0.029702,-0.065657,0.035368,-0.007768,...,-0.002238,0.019777,0.002655,0.046834,0.026249,-0.017366,0.005759,0.031036,0.015728,Bombyx mori|Timeless|XP_037877669_1
4,-0.003898,-0.026136,0.023675,0.015945,0.024668,0.041806,-0.034914,-0.089566,0.016355,-0.002214,...,0.001667,0.035486,0.040717,0.037069,0.014405,-0.023252,2.7e-05,0.018414,0.005015,Bombyx mori|Timeout|XP_012545048_1


In [3]:
from umap import UMAP

# Assuming 'embeddings_df' is your DataFrame from the previous step

# Separate the embeddings from the names
embeddings = embeddings_df.drop('name', axis=1).values  # Use only the embeddings for UMAP
names = embeddings_df['name']  # The names or identifiers of the embeddings

# Perform UMAP dimensionality reduction
umap_2d = UMAP(n_components=2, n_neighbors=25, metric='euclidean', min_dist=0.5, init='random', random_state=42)  # 2D projection
umap_projections_2d = umap_2d.fit_transform(embeddings)

# Create a DataFrame for the UMAP projections
umap_df_2d = pd.DataFrame(umap_projections_2d, columns=['UMAP_1', 'UMAP_2'])
umap_df_2d['name'] = names  # Add the names back to associate with the projections

# If you also want a 3D projection
umap_3d = UMAP(n_components=3, n_neighbors=25, metric='euclidean', min_dist=0.5, init='random', random_state=42)  # 3D projection
umap_projections_3d = umap_3d.fit_transform(embeddings)

# Create a DataFrame for the 3D UMAP projections
umap_df_3d = pd.DataFrame(umap_projections_3d, columns=['UMAP_1', 'UMAP_2', 'UMAP_3'])
umap_df_3d['name'] = names  # Add the names back

# Extract labels from the 'name' column
umap_df_2d['label'] = umap_df_2d['name'].apply(lambda x: x.split('|')[0] )
umap_df_3d['label'] = umap_df_3d['name'].apply(lambda x: x.split('|')[0] )

umap_df_2d['Gene'] = umap_df_2d['name'].apply(lambda x: x.split('|')[1] )
umap_df_3d['Gene'] = umap_df_3d['name'].apply(lambda x: x.split('|')[1] )
# Now you have two DataFrames: `umap_df_2d` and `umap_df_3d` with the 2D and 3D UMAP projections, respectively,
# and each row in these DataFrames corresponds to an embedding with its UMAP projections and the original name.

# You can display the first few rows of the DataFrame to check
umap_df_2d.head()  # For 2D projections
umap_df_3d.head()  # For 3D projections

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(


Unnamed: 0,UMAP_1,UMAP_2,UMAP_3,name,label,Gene
0,-7.237299,6.5824,8.434443,Anopheles gambiae|Timeless|XP_061517387_1,Anopheles gambiae,Timeless
1,-9.97,9.424144,8.490544,Anopheles gambiae|Timeout|XP_061513618_1,Anopheles gambiae,Timeout
2,-11.055582,9.989149,8.095532,Apis mellifera|Timeout|XP_006565495_2,Apis mellifera,Timeout
3,-9.010216,6.447857,7.773969,Bombyx mori|Timeless|XP_037877669_1,Bombyx mori,Timeless
4,-11.356917,9.937396,6.838923,Bombyx mori|Timeout|XP_012545048_1,Bombyx mori,Timeout


In [4]:
import plotly.express as px

# Plotting 2D UMAP projections with color coding by labels
fig_2d = px.scatter(
    umap_df_2d,
    x='UMAP_1',
    y='UMAP_2',
    color='label',  # Color code by the extracted label
    symbol='Gene',
    hover_data=['name'],  # Show original name on hover
    title='2D UMAP Projections of Embeddings with Labels'
)

fig_2d.show()


In [6]:
import plotly.graph_objs as go

label_color_dict = {
    'Apis mellifera': '#1f77b4',  # Blau
    'Bombyx mori': '#ff7f0e',     # Orange
    'Centruroides sculpturatus': '#2ca02c',  # Grün
    'Danaus plexippus': '#d62728',  # Rot
    'Drosophila melanogaster': '#9467bd',  # Lila
    'Ixodes scapularis': '#8c564b',  # Braun
    'Nicrophorus vespilloides': '#e377c2',  # Pink
    'Parasteatoda tepidariorum': '#7f7f7f',  # Grau
    'Solenopsis invicta': '#bcbd22',  # Olive
    'Trichoplusia ni': '#17becf',   # Teal
    'Nicrophorus vespilloides Both': '#d62727',  # Rot
    'Centruroides sculpturatus Both': '#d62726',  # Rot
    'Anopheles gambiae': '#f77189',  # Hellrot
    'Cotesia glomerata': '#f7754f',  # Helles Orange
    'Microplitis demolitor': '#dc8932',  # Goldorange
    'Nasonia vitripennis': '#c39532',  # Sandgelb
    'Tribolium castaneum': '#ae9d31',  # Mattgold
    'Homo sapiens': '#87CEEB'  # Himmelblau
}

gene_marker_dict = {
    'Timeout': 'circle',
    'Timeless': 'diamond'
}

# Extract the unique labels and genes
unique_labels = umap_df_3d['label'].unique()
unique_genes = umap_df_3d['Gene'].unique()

# Initialize an empty list to store all the plotly Scatter3D objects
traces = []

# Iterate over the labels and genes to create a Scatter3D object for each combination
for label in unique_labels:
    for gene in unique_genes:
        subset_df = umap_df_3d[(umap_df_3d['label'] == label) & (umap_df_3d['Gene'] == gene)]
        if not subset_df.empty:  # Check if the subset is not empty
            trace = go.Scatter3d(
                x=subset_df['UMAP_1'],
                y=subset_df['UMAP_2'],
                z=subset_df['UMAP_3'],
                mode='markers',
                marker=dict(
                    color=label_color_dict[label],  # Set the color for the current label
                    symbol=gene_marker_dict[gene],  # Set the symbol for the current gene
                    size=15,  # Marker size, adjust as needed
                    line=dict(
                        color='black',  # Border color
                        width=2        # Border width
                    )
                ),
                name=f"{label} | {gene}"  # Combine label and gene for legend
            )
            traces.append(trace)

# Define the layout for your plot
layout = go.Layout(
    title='3D UMAP Projections of Embeddings with Custom Labels and Genes',
    legend=dict(
        title='Legend',
        itemsizing='constant'
    ),
    scene=dict(  # Adjusting 3D plot scene
        xaxis=dict(
            showticklabels=False,  # Remove tick labels
            title_text='',  # Remove axis title
        ),
        yaxis=dict(
            showticklabels=False,
            title_text='',
        ),
        zaxis=dict(
            showticklabels=False,
            title_text='',
        )
    ),
    margin=dict(l=0, r=0, b=0, t=0)
)

# Create the figure with all the traces and layout
fig = go.Figure(data=traces, layout=layout)

fig.update_layout(width=1000, height=1000,     scene=dict(
        camera = dict(
    eye=dict(x=1., y=2.5, z=0.5)
)

    ),)
# Show the figure
#fig.show()

fig.write_image("fig1.png")
