In [None]:
from data_loader import load_pathway_data
from data_loader import load_gene_families
from data_loader import intersect
from data_loader import union
from data_loader import align_tensor
from data_loader import data_distribution
import pandas as pd
import numpy as np

In [10]:
from scipy.io import mmread

# Convert the HMP_2012 gene-abundance Matrix Market file into a labelled CSV.
#
# All paths are built from one directory that is relative to the notebook's
# working directory, instead of pointing into a single user's home directory.
# NOTE(review): assumes the notebook is run from a sibling directory of data/,
# as the "../data/HMP_2012/..." paths used by the other cells do - confirm.
# NOTE(review): Stage 1 reads "../data/HMP_2012/raw/gene_abundance.tsv", not the
# CSV written here - confirm how this output feeds the rest of the pipeline.
data_dir = "../data/HMP_2012"
matrix_file = f"{data_dir}/gene_abundance.mtx"
cols_names_file = f"{data_dir}/gene_abundance_colnames.txt"
rows_names_file = f"{data_dir}/gene_abundance_rownames.txt"
output_path = f"{data_dir}/gene_abundance.csv"

# mmread returns a sparse matrix for "coordinate" files but a plain ndarray for
# dense "array" files; only densify when the result actually is sparse.
matrix = mmread(matrix_file)
if hasattr(matrix, "toarray"):
    matrix = matrix.toarray()

# Read row and column names (one name per line, surrounding whitespace removed)
with open(rows_names_file, "r") as f:
    row_names = [line.strip() for line in f]

with open(cols_names_file, "r") as f:
    col_names = [line.strip() for line in f]

# Check dimensions for safety: fail before building a mislabelled DataFrame
if matrix.shape[0] != len(row_names):
    raise ValueError(f"Number of rows in matrix ({matrix.shape[0]}) does not match number of row names ({len(row_names)})")

if matrix.shape[1] != len(col_names):
    raise ValueError(f"Number of columns in matrix ({matrix.shape[1]}) does not match number of column names ({len(col_names)})")

# Convert to DataFrame, labelling rows and columns with the names read above
df = pd.DataFrame(matrix, index=row_names, columns=col_names)

# Save as CSV (the row names are written as the first, unnamed column)
df.to_csv(output_path)

In [None]:
# Stage 1: build the gene-family data, filtering gene families by frequency.
# The raw abundance table is read from `input_path`; results are written
# under `output_dir` by data_loader.load_gene_families.

output_dir = "../data/HMP_2012/genefamilies/before_intersection"
input_path = "../data/HMP_2012/raw/gene_abundance.tsv"

load_gene_families(
    input_path,
    output_dir,
    threshold=False,
    top_k=1000,
)

642               GO:0000001|g__Candida.s__Candida_albicans
643           GO:0000001|g__Candida.s__Candida_dubliniensis
644          GO:0000001|g__Malassezia.s__Malassezia_globosa
647               GO:0000002|g__Candida.s__Candida_albicans
648           GO:0000002|g__Candida.s__Candida_dubliniensis
                                ...                        
632900      GO:2001295|g__Treponema.s__Treponema_sp_OMZ_838
632901       GO:2001295|g__Treponema.s__Treponema_vincentii
632902    GO:2001295|g__Turicibacter.s__Turicibacter_san...
632903       GO:2001295|g__Tyzzerella.s__Tyzzerella_nexilis
632904    GO:2001295|g__Victivallis.s__Victivallis_vadensis
Name: Unnamed: 0, Length: 616214, dtype: object


Processing rows: 100%|██████████| 616214/616214 [03:21<00:00, 3063.90it/s]


torch.Size([748, 640, 7867])


In [6]:
# Dump the bacteria names stored by Stage 1 into a plain-text file, one per line.
# allow_pickle=True lets np.load read object arrays; only use it on trusted files.
names = np.load(
    "../data/HMP_2012/genefamilies/before_intersection/bacteria_list.npy",
    allow_pickle=True,
)
with open("../data/HMP_2012/genefamilies/bacteria_list.txt", "w") as f:
    f.writelines(f"{name}\n" for name in names)


In [3]:
# Stage 2: load the appropriate pathways.
# The raw pathway-abundance table is read from `input_path`; results are
# written under `output_dir` by data_loader.load_pathway_data.

output_dir = "../data/HMP_2012/pathabundance/before_intersection"
input_path = "../data/HMP_2012/raw/pathway_abundance.csv"

load_pathway_data(
    input_path,
    output_dir,
)

torch.Size([748, 577, 399])


In [5]:
# Sanity check for the tensor: look up one known (sample, bacterium, pathway)
# entry by name and print the stored value so it can be compared by eye with
# the raw abundance table.
# allow_pickle=True lets np.load read object arrays; only use it on trusted files.
bacteria_list = np.load("../data/HMP_2012/pathabundance/before_intersection/bacteria_list.npy", allow_pickle=True)
people_list = np.load("../data/HMP_2012/pathabundance/before_intersection/sample_list.npy", allow_pickle=True)
pathway_list = np.load("../data/HMP_2012/pathabundance/before_intersection/pathway_list.npy", allow_pickle=True)
tensor = np.load("../data/HMP_2012/pathabundance/before_intersection/tensor.npy")

# Find the indices for the sanity check. list.index raises ValueError when the
# requested name is absent, which makes a wrong label fail loudly.
# The tensor is indexed as [sample, bacterium, pathway] below.
t1 = list(people_list).index('SRS011061')
t2 = list(bacteria_list).index('g__Bacteroides.s__Bacteroides_cellulosilyticus')
t3 = list(pathway_list).index('DTDPRHAMSYN-PWY: dTDP-L-rhamnose biosynthesis I')

# For a gene-family tensor the same check would use that tensor's own sample,
# bacteria and gene-family lists (the gene-family list is not loaded in this cell).

# One multi-axis lookup instead of chained [..][..][..] indexing.
print(f"the value for the sanity check is: {tensor[t1, t2, t3]}")

the value for the sanity check is: 0.000110522


In [None]:
# Stage 3 (optional): given n tensors of shape [samples, bacteria, pathways],
# perform a union.
# `input_path` holds one directory per dataset to combine; the result is
# written under `output_dir`.

input_path = [
    "data/data_files/pathways/AsnicarF_2017_march",
    "data/data_files/pathways/AsnicarF_2021_march",
]
output_dir = "data/data_files/pathways/Union/"

union(
    input_path,
    output_dir,
    is_pathway=True,
)

In [None]:
# Stage 4 (optional): given n tensors of shape [samples, bacteria, gene_families],
# perform a union.
# `input_path` holds one directory per dataset to combine; the result is
# written under `output_dir`.

input_path = [
    "data/data_files/gene_families/AsnicarF_2017_march",
    "data/data_files/gene_families/AsnicarF_2021_march",
]
output_dir = "data/data_files/gene_families/Union/"

union(
    input_path,
    output_dir,
    is_pathway=False,
)

In [7]:
# Stage 5: intersect the pathway abundances and the gene families along the
# samples and bacteria dimensions.

# Inputs: the "before_intersection" directories produced by Stages 1 and 2.
raw_pathways = "../data/HMP_2012/pathabundance/before_intersection"
raw_gene_families = "../data/HMP_2012/genefamilies/before_intersection"

# Outputs: where the intersected data are written.
intersected_pathways = "../data/HMP_2012/pathabundance/after_intersection"
intersected_gene_families = "../data/HMP_2012/genefamilies/after_intersection"

intersect(
    raw_gene_families,
    raw_pathways,
    intersected_gene_families,
    intersected_pathways,
)

Intersected gene families tensor shape: (748, 577, 7867)
Complementary gene families tensor shape: (748, 63, 7867)
Intersected pathways tensor shape: (748, 577, 399)
