In [8]:
from data_loader import load_pathway_data
from data_loader import load_gene_families
from data_loader import intersect
from data_loader import union
from data_loader import align_tensor
from data_loader import load_from_rds
from data_loader import load_from_mtx
import pandas as pd


In [10]:
from scipy.io import mmread

# Convert the HMP_2012 gene-abundance matrix from Matrix Market format into a
# labelled CSV. Every input and the output live in the same dataset directory,
# so that directory is spelled out once instead of in each path.
# NOTE(review): this is an absolute, machine-specific path while the other cells
# use paths relative to the working directory — confirm whether it should match.
hmp_dir = "/home/bcrlab/barsapi1/metric/Bacteria-Metric/data/HMP_2012"
cols_names_file = f"{hmp_dir}/gene_abundance_colnames.txt"
rows_names_file = f"{hmp_dir}/gene_abundance_rownames.txt"
output_path = f"{hmp_dir}/gene_abundance.csv"


def _read_names(path):
    """Return one entry per line of `path`, with surrounding whitespace stripped."""
    with open(path, "r") as f:
        return [line.strip() for line in f]


# .toarray() turns the loaded matrix into a dense array, which is what the
# DataFrame constructor below is given; the whole matrix is held in memory.
matrix = mmread(f"{hmp_dir}/gene_abundance.mtx").toarray()

# Read row and column names (one name per line in each file).
row_names = _read_names(rows_names_file)
col_names = _read_names(cols_names_file)

# Fail early if the label files do not line up with the matrix, rather than
# letting pandas raise a less specific shape error.
if matrix.shape[0] != len(row_names):
    raise ValueError(f"Number of rows in matrix ({matrix.shape[0]}) does not match number of row names ({len(row_names)})")

if matrix.shape[1] != len(col_names):
    raise ValueError(f"Number of columns in matrix ({matrix.shape[1]}) does not match number of column names ({len(col_names)})")

# Attach the labels: row names become the index, column names the header.
df = pd.DataFrame(matrix, index=row_names, columns=col_names)

# Save as CSV (the index is written as the first column by default).
df.to_csv(output_path)

In [11]:
# Stage 1: filter the gene families by frequency.
# input_path is the gene-family CSV; output_dir is the directory handed to the
# loader for its results (presumably where the filtered tensor is written).
input_path = "data/PRJEB53403_168_samples/genefamilies/humann_2_genefamilies.csv"
output_dir = "data/PRJEB53403_168_samples/genefamilies/before_intersection"

load_gene_families(
    input_path,
    output_dir,
    threshold=False,
    top_k=1000,
)

Processing rows: 100%|██████████| 323118/323118 [00:15<00:00, 20652.00it/s]


torch.Size([167, 430, 4693])


In [12]:
# Stage 2: load the appropriate pathways.
# NOTE(review): the gene-family file used in Stage 1 is named "humann_2_..."
# while this one is "humann_4_..." — confirm the two tables belong together.
input_path = "data/PRJEB53403_168_samples/pathabundance/humann_4_pathabundance.tsv"
output_dir = "data/PRJEB53403_168_samples/pathabundance/before_intersection"

load_pathway_data(
    input_path,
    output_dir,
)

torch.Size([167, 324, 389])


In [None]:
# Stage 3 (optional): given n tensors of shape [samples, bacteria, pathways],
# perform a union.
# input_path holds one directory per dataset to be merged.
input_path = [
    "data/data_files/pathways/AsnicarF_2017_march",
    "data/data_files/pathways/AsnicarF_2021_march",
]
output_dir = "data/data_files/pathways/Union/"

union(
    input_path,
    output_dir,
    is_pathway=True,
)

In [None]:
# Stage 4 (optional): given n tensors of shape [samples, bacteria, gene_families],
# perform a union.
# input_path holds one directory per dataset to be merged.
input_path = [
    "data/data_files/gene_families/AsnicarF_2017_march",
    "data/data_files/gene_families/AsnicarF_2021_march",
]
output_dir = "data/data_files/gene_families/Union/"

union(
    input_path,
    output_dir,
    is_pathway=False,
)

In [3]:
# Stage 5: intersect the pathway abundances and the gene families along the
# samples and bacteria dimensions.
# The "before_intersection" directories are the inputs (the output_dir values of
# Stages 1 and 2); the "after_intersection" directories are passed as destinations.
raw_gene_families = "data/PRJEB53403_168_samples/genefamilies/before_intersection"
raw_pathways = "data/PRJEB53403_168_samples/pathabundance/before_intersection"
intersected_gene_families = "data/PRJEB53403_168_samples/genefamilies/after_intersection"
intersected_pathways = "data/PRJEB53403_168_samples/pathabundance/after_intersection"

intersect(
    raw_gene_families,
    raw_pathways,
    intersected_gene_families,
    intersected_pathways,
)

Intersected gene families tensor shape: (167, 324, 4693)
Complementary gene families tensor shape: (167, 106, 4693)
Intersected pathways tensor shape: (167, 324, 389)


In [None]:
# Stage 6: load the intersection tensors and plot the data distribution.
# NOTE(review): data_distribution is not among the names imported in the import
# cell visible at the top of this notebook — confirm it is imported or defined
# before this cell is run on a fresh kernel.
# NOTE(review): these paths point under data/data_files/.../Intersection, not the
# after_intersection directories used in Stage 5 — confirm this is intended.
gene_tensor_path = "data/data_files/gene_families/Intersection/tensor.npy"
pathway_tensor_path = "data/data_files/pathways/Intersection/tensor.npy"
bacteria_list_path = "data/data_files/gene_families/Intersection/bacteria_list.npy"

data_distribution(
    gene_tensor_path,
    pathway_tensor_path,
    bacteria_list_path,
)