**Libraries and dependencies**

In [None]:
!sudo-g5k pip install -U scikit-learn umap-learn plotly xlsx2csv

In [1]:
import time
from datetime import timedelta
import polars as pl
import polars.selectors as cs

import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib.pyplot import figure
from sklearn.preprocessing import StandardScaler

import umap.umap_ as umap
from umap import UMAP
import hdbscan

  from .autonotebook import tqdm as notebook_tqdm


**Read data**

In [3]:
m = pl.read_excel(source="~/Ocean_IA/group_storage/TARA/OM_RGC_V2/Salazar_et_al_2019_Suppl_Info.xlsx",
                  sheet_name="Table_W4",
                  read_csv_options={"null_values": ['NA'], "n_rows": 180})
m = m.rename({"PANGAEA sample id":"PANGAEA.sample.id"})
print(f"Environmental dataframe dimensions: {m.shape}")
 
metaG = pl.read_csv('~/Ocean_IA/group_storage/TARA/OM_RGC_V2/OM-RGC_v2_gene_profile_metaG.tsv.gz',
                    separator='\t',
                    dtypes=([pl.Utf8]+[pl.Float64]*180))
print(f"MetaG dataframe dimensions: {metaG.shape}")

Environmental dataframe dimensions: (180, 38)
MetaG dataframe dimensions: (46775154, 181)


**Tranpose**

In [4]:
t0 = time.time()
s_t = metaG.transpose(include_header=True, column_names=metaG["OMRGC_ID"])
t2= time.time()
print(f"Check tranpose. Time = {str(timedelta(seconds=round(t2-t0)))}")
s_t = s_t.slice(1)
t3=time.time()
print(f"Chek slice. Time = {str(timedelta(seconds=round(t3-t0)))}")
s_t = s_t.cast({cs.starts_with("OM"):pl.Float64})
t4=time.time()
print(f"Check dtype changes. Time = {str(timedelta(seconds=round(t4-t0)))}")
s_t = s_t.rename({"column":"PANGAEA.sample.id"})
t5=time.time()
print(f"Check rename. Time = {str(timedelta(seconds=round(t5-t0)))}")
print("Success. New dataframe dimensions:", s_t.shape)


Check tranpose. Time = 0:19:27
Chek slice. Time = 0:19:50


**Subset without MIX layer**

In [11]:
MIX = m.filter(pl.col("Layer") == "MIX")
print("MIX df dimensions:", MIX.shape)

L = MIX["PANGAEA.sample.id"].to_list()

MIX df dimensions: (7, 38)


In [None]:
metaG_test = s_t.filter(pl.col("PANGAEA.sample.id") != L)
print("Filtered metaG df dimensions:", metaG_test.shape)

ColumnNotFoundError: unable to find column "PANGAEA.sample.id"; valid columns: ["OMRGC_ID", "TARA_X000000368", "TARA_Y200000002", "TARA_A200000159", "TARA_A200000113", "TARA_X000001036", "TARA_X000000950", "TARA_S200000501", "TARA_A100000164", "TARA_E500000081", "TARA_E500000075", "TARA_E500000331", "TARA_E500000178", "TARA_A100001011", "TARA_A100001015", "TARA_A100001388", "TARA_A100001037", "TARA_A100001035", "TARA_A100001234", "TARA_B100000029", "TARA_B100000003", "TARA_B100000035", "TARA_Y100000022", "TARA_B100000315", "TARA_B100000073", "TARA_Y100000294", "TARA_Y100000287", "TARA_B100000085", "TARA_Y100000031", "TARA_B100000287", "TARA_B100000282", "TARA_B100000131", "TARA_B100000123", "TARA_B100000161", "TARA_B100000242", "TARA_B100000214", "TARA_B100000212", "TARA_B100000378", "TARA_B000000609", "TARA_B000000565", "TARA_B000000557", "TARA_B000000532", "TARA_B100000405", "TARA_B100000408", "TARA_B100000401", "TARA_B000000441", "TARA_B000000460", "TARA_B000000437", "TARA_B000000477", "TARA_B000000475", "TARA_B100000497", "TARA_B100000482", "TARA_B100000470", "TARA_B100000475", "TARA_B100000446", "TARA_B100000459", "TARA_B100000427", "TARA_B100000508", "TARA_B100000424", "TARA_B100000519", "TARA_B100000749", "TARA_B100000513", "TARA_B100000530", "TARA_B100000745", "TARA_B100000524", "TARA_B100000767", "TARA_B100000768", "TARA_B100000780", "TARA_B100000795", "TARA_B100000809", "TARA_B100000787", "TARA_B100001059", "TARA_B100001063", "TARA_B100001057", "TARA_B100000989", "TARA_B100001029", "TARA_B100001013", "TARA_B100001027", "TARA_B100000886", "TARA_B100000965", "TARA_B100000959", "TARA_B100000963", "TARA_B100000902", "TARA_B100000953", "TARA_B100000900", "TARA_B100000927", "TARA_B100000929", "TARA_B100000925", "TARA_B100001113", "TARA_B100001079", "TARA_B100001109", "TARA_B100000579", "TARA_B100000586", "TARA_B100000575", "TARA_B100000945", "TARA_B100000949", "TARA_B100000941", "TARA_B100000700", "TARA_B100000678", "TARA_B100001115", "TARA_B100000686", "TARA_B100000683", "TARA_B100000676", "TARA_B100000674", "TARA_B100001123", "TARA_B100001121", "TARA_B100000614", "TARA_B100000609", "TARA_B100001250", "TARA_B100001245", "TARA_B100001248", "TARA_B100001094", "TARA_B100001105", "TARA_B100001093", "TARA_B100001964", "TARA_B100001971", "TARA_B100001287", "TARA_B100001996", "TARA_B100002003", "TARA_B100001989", "TARA_B100002019", "TARA_B100001939", "TARA_B100002052", "TARA_B100002049", "TARA_B100002051", "TARA_B100001146", "TARA_B100001142", "TARA_B100001167", "TARA_B100001540", "TARA_B100001741", "TARA_B100001750", "TARA_B100001765", "TARA_B100001758", "TARA_B100001778", "TARA_B100001769", "TARA_B100001559", "TARA_B100001564", "TARA_B100001179", "TARA_B100001175", "TARA_B100001173", "TARA_B110000008", "TARA_B110000014", "TARA_B110000003", "TARA_B110001452", "TARA_B110001454", "TARA_B110001450", "TARA_B110000196", "TARA_B110001469", "TARA_B110000046", "TARA_B110000037", "TARA_B110000027", "TARA_B110000093", "TARA_B110000091", "TARA_B110000090", "TARA_B110000114", "TARA_B110000116", "TARA_B110000208", "TARA_B110000503", "TARA_B110000211", "TARA_B110000240", "TARA_B110000238", "TARA_B110000261", "TARA_B110000263", "TARA_B110000259", "TARA_B110000285", "TARA_B110000977", "TARA_B110000967", "TARA_B110000971", "TARA_B110000305", "TARA_B110000908", "TARA_B110000914", "TARA_B110000902", "TARA_B110000881", "TARA_B110000879", "TARA_B110000495", "TARA_B110000483", "TARA_B110000858", "TARA_B110000467", "TARA_B110000459", "TARA_B110000438", "TARA_B110000444"]

Error originated just after this operation:
DF ["OMRGC_ID", "TARA_X000000368", "TARA_Y200000002", "TARA_A200000159"]; PROJECT */181 COLUMNS; SELECTION: "None"

**Normalization**

In [None]:
t0=time.time()
n = s_t.drop("PANGAEA.sample.id")
N = StandardScaler().fit_transform(n)
t1=time.time()
print(f"Normalization time = {str(timedelta(seconds=round(t1-t0)))}")

In [None]:
t0=time.time()
umap_model_2D = umap.UMAP(n_neighbors=40, n_components=2, min_dist=0.1, spread=1.0, random_state=42)
umap_2D = umap_model_2D.fit_transform(N)
t1=time.time()
print(f"Process time = {str(timedelta(seconds=round(t1-t0)))}")

In [None]:
figure_2D = px.scatter(
    umap_2D, x=0, y=1, color=m['Layer'],
    labels={"0": "Dimension 1", "1": "Dimension 2"},
    template="plotly_white", width=1000, height=800, opacity=.6)

figure_2D.update_traces(marker_size=8)
figure_2D.update_layout(legend_itemsizing="constant", legend_font_size=15, font=dict(size=12))
figure_2D.show()

In [None]:
t0=time.time()
umap_model_3D = umap.UMAP(n_neighbors=40,n_components=3, min_dist=0.1, spread=1.0,random_state=42)
umap_3D = umap_model_3D.fit_transform(N)
t1=time.time()
print(f"Process time = {str(timedelta(seconds=round(t1-t0)))}")

In [None]:
figure_3D = px.scatter_3d(
    umap_3D, x=0, y=1, z=2, color=m['Layer'],
    labels={"0": "Dimension 1", "1": "Dimension 2", "2": "Dimension 3"},
    template="plotly_white", width=1000, height=800, opacity=.5)

figure_3D.update_traces(marker_size=11)
figure_3D.update_layout(legend_itemsizing="constant", legend_font_size=10, font=dict(size=12))
figure_3D.show()