### Important Imports

In [1]:
from data_loader import load_pathway_data
from data_loader import load_gene_families
from data_loader import intersect
from data_loader import union
from data_loader import align_tensor
from scipy.io import mmread
import pandas as pd
import numpy as np

### Convert Sparse Matrix to CSV

* **Purpose:** Loads the main gene abundance data from a sparse `.mtx` file, attaches the corresponding row (`rowsnames...`) and column (`colsnames...`) names, and saves the result as a single, large, complete CSV file (`gene_abundance_absolute.csv`) in the `processed_data/` directory.

In [4]:
# Convert the sparse gene-abundance matrix (.mtx) into a labelled DataFrame
# and save it as a single CSV file.
# NOTE: .toarray() materialises the whole matrix densely in memory.
matrix = mmread("/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/data/gene_abundance_absolute.mtx").toarray()
cols_names_file = "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/data/colsnames_metadata_03_08.txt"
rows_names_file = "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/data/rowsnames_metadata_03_08.txt"
output_path = "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_absolute.csv"


def _read_names(path):
    """Return one whitespace-stripped label per line of the text file at `path`."""
    with open(path, "r") as f:
        return [line.strip() for line in f]


# Read row and column names (one label per line in each file)
row_names = _read_names(rows_names_file)
col_names = _read_names(cols_names_file)

# Check dimensions for safety: fail early instead of letting pandas raise a
# less specific shape error when the labels are attached.
if matrix.shape[0] != len(row_names):
    raise ValueError(f"Number of rows in matrix ({matrix.shape[0]}) does not match number of row names ({len(row_names)})")

if matrix.shape[1] != len(col_names):
    raise ValueError(f"Number of columns in matrix ({matrix.shape[1]}) does not match number of column names ({len(col_names)})")

# Convert to DataFrame
df = pd.DataFrame(matrix, index=row_names, columns=col_names)

# Save as CSV (the row labels are written as the first column)
df.to_csv(output_path)

### Defines a helper function that filters a large abundance DataFrame (like the one from Cell 2) to include only samples (columns) that are marked as 'stool' in the metadata file.
### Note: it is unclear whether this helper is used in the end (the cells below are commented out)

In [None]:
# NOTE: disabled helper, kept for reference only. Every line of this cell is
# commented out, so running it defines nothing.
# NOTE(review): if re-enabled, `df[stool_samples]` raises KeyError when a stool
# sample ID from the metadata is not a column of the abundance table.
# def filter_stool_samples(metadata_path, df_path, output_path):
#     """
#     Filters abundance table to include only stool samples.
#     """
#     # Load metadata and get stool sample IDs
#     metadata = pd.read_csv(metadata_path)
#     metadata = metadata[metadata['body_site'] == 'stool']
#     stool_samples = metadata.iloc[:, 0].tolist()

#     # Load full gene abundance data
#     df = pd.read_csv(df_path, index_col=0)

#     # Filter columns (samples) to include only stool samples
#     df = df[stool_samples]

#     # Save to csv
#     df.to_csv(output_path)

In [None]:
# NOTE: disabled calls, kept for reference only.
# NOTE(review): these paths start with /home/pintokf/..., while the active cells
# of this notebook use /home/dsi/pintokf/... — update before re-enabling.
# filter_stool_samples(
#     metadata_path="/home/pintokf/Projects/Microbium/Bacteria-Metric/data/metadata_03_08.csv",
#     df_path="/home/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_absolute.csv",
#     output_path="/home/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_stool.csv"
# )

# filter_stool_samples(
#     metadata_path="/home/pintokf/Projects/Microbium/Bacteria-Metric/data/metadata_03_08.csv",
#     df_path="/home/pintokf/Projects/Microbium/Bacteria-Metric/data/pathway_abundance_03_08.csv",
#     output_path="/home/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathway_abundance_stool.csv"
# )

In [5]:
import pandas as pd

def filter_uniref_to_bacteria(rel_abundance_path, uniref_path, output_path, format="csv"):
    """
    Keep only the rows of an abundance table whose genus/species is listed
    among the bacterial taxa of a relative-abundance table.

    Bacterial taxa are the relative-abundance columns that start with
    "k__Bacteria" and contain both a "g__" and an "s__" level; each one is
    reduced to the key "g__<genus>.s__<species>". A row of the abundance table
    is kept when the part of its index label that follows a "|" and starts
    with "g__" equals one of those keys. Rows whose label has no such part
    (for example labels without "|") are dropped.

    Parameters:
    - rel_abundance_path: str – path to the relative abundance CSV (first column is the index, taxa in columns)
    - uniref_path: str – path to the abundance table to filter (first column is the index)
    - output_path: str – path of the filtered table; it is always written comma-separated
    - format: str – format of the file at uniref_path, either "csv" (default) or "tsv"

    Raises:
    - ValueError: if format is neither "csv" nor "tsv"
    """

    taxa_table = pd.read_csv(rel_abundance_path, index_col=0)

    # Choose the delimiter of the abundance table from the declared format.
    if format == "tsv":
        delimiter = '\t'
    elif format == "csv":
        delimiter = ','
    else:
        raise ValueError("Unsupported format. Use 'csv' or 'tsv'.")
    abundance_table = pd.read_csv(uniref_path, index_col=0, sep=delimiter)

    # Collect "g__<genus>.s__<species>" keys for every bacterial column that
    # has both levels.
    valid_taxa = set()
    for column in taxa_table.columns:
        if not column.startswith("k__Bacteria"):
            continue
        levels = column.split('|')
        genus = next((level for level in levels if level.startswith("g__")), "")
        species = next((level for level in levels if level.startswith("s__")), "")
        if genus and species:
            valid_taxa.add(f"{genus}.{species}")

    # Pull the taxon part out of every row label; labels without one become ""
    # and therefore never match a valid key.
    row_labels = abundance_table.index.to_series()
    extracted = row_labels.str.extract(r'\|(g__.*?\.s__.*)')
    row_taxa = extracted[0].fillna("")

    keep_row = row_taxa.isin(valid_taxa)
    abundance_table.loc[keep_row].to_csv(output_path)

In [6]:
# Keep only the bacterial rows of both abundance tables (gene families first,
# then pathways), matching against the taxa of the same relative-abundance file.
for source_table, bacteria_table in (
    (
        "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_absolute.csv",
        "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_absolute_bacteria.csv",
    ),
    (
        "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/data/pathway_abundance_03_08.csv",
        "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathway_abundance_bacteria.csv",
    ),
):
    filter_uniref_to_bacteria(
        rel_abundance_path="/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/data/relative_abundance.csv",
        uniref_path=source_table,
        output_path=bacteria_table,
    )

### Before running the HUMAnN `humann_regroup_table` script, the table must be converted to a TSV file

In [7]:
# Convert the bacteria-only gene abundance table from .csv to .tsv by
# replacing every comma with a tab.
# NOTE(review): `tr` also replaces commas that occur inside a field value —
# assumes no row label or value contains a comma; verify.
! tr ',' '\t' < /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_absolute_bacteria.csv > /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_absolute_bacteria.tsv

Note: when running the command below, an environment in which the HUMAnN package is installed must be active.

In [8]:
# Regroup gene abundance table to GO terms, using the custom mapping file
# map_go_uniref90.txt.gz.
# Requires an active environment in which the `humann_regroup_table` command
# is installed; the shell call fails otherwise.
!humann_regroup_table \
  --input /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_absolute_bacteria.tsv \
  --custom /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/humann_databases/utility_mapping/map_go_uniref90.txt.gz \
  --output /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_bacteria_regroup.tsv

Loading table from: /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_absolute_bacteria.tsv
  This is a large file, one moment please...
  Treating /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_absolute_bacteria.tsv as stratified output, e.g. ['UniRef90_A0A078R3T2', 'g__Bacteroides.s__Bacteroides_vulgatus']
Loading custom groups file: /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/humann_databases/utility_mapping/map_go_uniref90.txt.gz
Loading mapping file from: /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/humann_databases/utility_mapping/map_go_uniref90.txt.gz
  This is a large file, one moment please...
Original Feature Count: 1484697; Grouped 1+ times: 860753 (58.0%); Grouped 2+ times: 542281 (36.5%)


Now convert the regrouped table back from TSV to CSV.

In [9]:
# Convert the regrouped table from .tsv back to .csv by replacing every tab
# with a comma.
# NOTE(review): a field that itself contains a comma would be split into extra
# columns in the resulting CSV — assumes no such field exists; verify.
! tr '\t' ','  < /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_bacteria_regroup.tsv > /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_bacteria_regroup.csv

In [10]:
# Stage 1: Process gene abundance data
# Remove unclassified microbes and unmapped reads
# (presumably done inside load_gene_families, which lives in data_loader and is
# not visible in this notebook — verify against its implementation).
# NOTE(review): the recorded output of this cell prints sample sums of 0.6452,
# 0.6331, 1.1968, 1.2634, 1.0564 under the label "should all be 1.0" — the
# normalization does not appear to produce unit sums; confirm in data_loader.

input_path = "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_bacteria_regroup.csv"
# Despite the ".csv" suffix this path is used as a directory: later cells read
# bacteria_list.npy from inside it.
output_dir = "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_bacteria_regroup_normalized.csv"

load_gene_families(input_path, output_dir, threshold=False, top_k=1000)

Processing rows: 100%|██████████| 569906/569906 [02:16<00:00, 4163.48it/s]


Tensor shape before normalization: torch.Size([748, 600, 5042])
Tensor shape after normalization: torch.Size([748, 600, 5042])
Sample sums after normalization (should all be 1.0): tensor([0.6452, 0.6331, 1.1968, 1.2634, 1.0564], dtype=torch.float64)


In [11]:
# Dump the saved bacteria names to a plain-text file, one name per line.
# allow_pickle=True lets np.load unpickle objects; only use it on trusted files.
names = np.load("/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_bacteria_regroup_normalized.csv/bacteria_list.npy", allow_pickle=True)
with open("/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_bacteria_regroup_normalized.csv/bacteria_list.txt", "w") as f:
    f.writelines(f"{name}\n" for name in names)


In [12]:
# Stage 2: load appropriate pathways.
# Input is the bacteria-only pathway abundance CSV produced by
# filter_uniref_to_bacteria; later cells read tensor.npy, bacteria_list.npy,
# sample_list.npy and pathway_list.npy from output_dir.

input_path = "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathway_abundance_bacteria.csv"
output_dir = "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathways_processed"

load_pathway_data(input_path, output_dir)

torch.Size([748, 540, 380])


In [13]:
# Sanity check for the tensor: look up one known (sample, bacterium, pathway)
# entry by name and print its value for manual comparison with the source table.
# allow_pickle=True lets np.load unpickle objects; only use it on trusted files.
bacteria_list = np.load("/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathways_processed/bacteria_list.npy", allow_pickle=True)
people_list = np.load("/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathways_processed/sample_list.npy", allow_pickle=True)
pathway_list = np.load("/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathways_processed/pathway_list.npy", allow_pickle=True)
tensor = np.load("/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathways_processed/tensor.npy")

# find the indices for the sanity check
# (list.index raises ValueError if the name is not present)
t1 = list(people_list).index('SRS011061')
t2 = list(bacteria_list).index('g__Bacteroides.s__Bacteroides_cellulosilyticus')
t3 = list(pathway_list).index('DTDPRHAMSYN-PWY: dTDP-L-rhamnose biosynthesis I')

# Alternative lookup, kept for reference. It targets a gene-family tensor and
# needs a `gene_family_list` array that this cell does not load:
# t1 = np.where(people_list == 'ERR9830187')[0][0]
# t2 = np.where(bacteria_list == 'g__Escherichia.s__Escherichia_coli')[0][0]
# t3 = np.where(gene_family_list == 'GO:0000006')[0][0]

# The tensor is indexed as [sample, bacterium, pathway].
print(f"the value for the sanity check is: {tensor[t1, t2, t3]}")

the value for the sanity check is: 0.000110522


Not relevant for now; the next two stages are optional and only needed when combining multiple datasets.


In [None]:
# Stage 3 (optional): Given n tensors from the shape: [samples, bacteria, pathways] perform an union.

#input_path = ["data/data_files/pathways/AsnicarF_2017_march", "data/data_files/pathways/AsnicarF_2021_march"]
#output_dir = "data/processed_data/pathways/Union/"
#union(input_path, output_dir, is_pathway=True)

In [None]:
# Stage 4 (optional): Given n tensors from the shape: [samples, bacteria, gene_families] perform an union.

#input_path = ["data/data_files/gene_families/AsnicarF_2017_march", "data/data_files/gene_families/AsnicarF_2021_march"]
#output_dir = "data/data_files/gene_families/Union/"
#union(input_path, output_dir, is_pathway=False)

In [14]:
# Stage 5: intersect between pathway abundances and gene families through the samples and bacteria dimensions.
# The first two paths are the Stage 1 / Stage 2 output directories; the last two
# are the destinations passed to intersect() for the intersected results.
# NOTE(review): the recorded output also reports a "complementary" gene-family
# tensor; where intersect() stores it is not visible here — check data_loader.

raw_gene_families= "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_bacteria_regroup_normalized.csv"
raw_pathways= "/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathways_processed"
intersected_gene_families="/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/gene_abundance_bacteria_regroup_normalized.csv/after_intersection"
intersected_pathways="/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/pathways_processed/after_intersection"
intersect(raw_gene_families, raw_pathways, intersected_gene_families, intersected_pathways)

Intersected gene families tensor shape: (748, 540, 5042)
Complementary gene families tensor shape: (748, 60, 5042)
Intersected pathways tensor shape: (748, 540, 380)


### Not used at this stage: come back and run the following cells only after the model has been trained

In [2]:

def expand_taxonomies(npy_path, rel_abundance_path, output_path=None):
    """
    Translate short taxon names ("g__<genus>.s__<species>") into the full
    taxonomy strings used as column names of a relative-abundance table.

    Parameters:
    - npy_path: str – path to a .npy file holding the short taxon names
    - rel_abundance_path: str – path to the relative abundance CSV (first column is the index, full taxonomies in columns)
    - output_path: str or None – if given, the resulting array is saved there with np.save

    Returns:
    - numpy array with one entry per short name, in the same order: the full
      taxonomy when a column matches, otherwise the short name unchanged (a
      warning line is printed for each unmatched name).

    When several columns share the same genus and species, the last such
    column is the one used.
    """
    # allow_pickle=True lets np.load unpickle objects; only use it on trusted files.
    short_names = np.load(npy_path, allow_pickle=True)
    abundance_table = pd.read_csv(rel_abundance_path, index_col=0)

    def _first_with_prefix(levels, prefix):
        # First taxonomy level starting with `prefix`, or '' when there is none.
        return next((level for level in levels if level.startswith(prefix)), '')

    # Lookup table: "g__<genus>.s__<species>" -> full taxonomy column name.
    lookup = {}
    for full_taxonomy in abundance_table.columns.tolist():
        levels = full_taxonomy.split('|')
        genus = _first_with_prefix(levels, 'g__')
        species = _first_with_prefix(levels, 's__')
        if not (genus and species):
            continue
        lookup[f"{genus}.{species}"] = full_taxonomy

    # Resolve every short name, falling back to the short name itself.
    resolved_names = []
    for short_name in short_names:
        match = lookup.get(short_name)
        if not match:
            print(f"⚠️ No match found for: {short_name}")
            match = short_name
        resolved_names.append(match)

    expanded = np.array(resolved_names)

    # Persist the result only when a destination was supplied.
    if output_path:
        np.save(output_path, expanded)
        print(f"✅ Saved expanded taxonomy to {output_path}")

    return expanded


In [3]:
# Expand the short bacteria names stored in test_bacteria.npy (under
# eval_results) to full taxonomy strings and save them as a .npy file.
expanded_taxa = expand_taxonomies(
    npy_path="/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/eval_results/HMP_Kfir/Run_0/test_bacteria.npy",
    rel_abundance_path="/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/data/relative_abundance.csv",
    output_path="/home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/bacteria_names_full_taxonomy.npy"
)

✅ Saved expanded taxonomy to /home/dsi/pintokf/Projects/Microbium/Bacteria-Metric/processed_data/bacteria_names_full_taxonomy.npy
