In [5]:
# round(0.005*399)
2/399

0.005012531328320802

In [19]:
from json import dump
from pandas import read_csv, concat

df = read_csv("../data/Saltern_phylogeny.csv").set_index("User Genome")
df.index = list(map(lambda x: x.replace('.contigs__', ''), df.index))
df.index.name = "User Genome"

# Define the taxonomic ranks
ranks = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

# Split the Classification column into separate columns
# Assuming the Classification column contains semicolon-separated values
taxonomy_split = df['Classification'].str.split(';', expand=True)

# Ensure the number of columns matches the number of ranks
# If fewer ranks are present, fill missing ones with empty strings
taxonomy_split = taxonomy_split.reindex(columns=range(len(ranks)), fill_value='')

# Assign column names based on ranks
taxonomy_split.columns = ranks

# Clean up the values by removing the prefixes (e.g., 'd__', 'p__', etc.)
for rank in ranks:
    taxonomy_split[rank] = taxonomy_split[rank].str.replace(f'{rank[0]}__', '', regex=False)

# Combine the original dataframe with the new taxonomy columns
# taxonomy_split["MAG"] = df.index
# result = concat([df.index, taxonomy_split], axis=1)
dump(taxonomy_split.T.to_dict(), open("../data/disaggregated_taxonomy.json", "w"), indent=3)

# Save the result to a new CSV file
# result.to_csv('disaggregated_taxonomy.csv', index=False)

In [48]:
from json import load
from pandas import DataFrame, concat, set_option

set_option('display.max_columns', None)
set_option('display.max_rows', 20)

# create the merged DF
abundances_js = load(open("../data/mag_abundances.json", "r"))
taxa_js = load(open("../data/disaggregated_taxonomy.json", "r"))
# display(abundances_js)
# display(taxa_js)

abundances = DataFrame(abundances_js)
abundances.index.name = "MAG"
# display(abundances)
taxa = DataFrame(taxa_js).T
taxa.index.name = "MAG"
# display(taxa)

total_df = concat([taxa, abundances], axis=1)
display(total_df)
for col in ["domain", "phylum", "class", "order", "family", "genus", "species"]:
    total_df[col] = total_df[col].fillna("unbinned_reads")
total_df.to_csv("../data/mag_abundances_taxonomy.csv")


# calculate the total relative abundances per sample
r1_r2_columns = [col for col in total_df.columns if 'R1' in col or 'R2' in col]
total_df['unrestored'] = total_df[r1_r2_columns].sum(axis=1)
total_df['unrestored'] = total_df['unrestored'] / total_df['unrestored'].sum(axis=0)

r2a_columns = [col for col in total_df.columns if 'R2A' in col]
total_df['reference'] = total_df[r2a_columns].sum(axis=1)
total_df['reference'] = total_df['reference'] / total_df['reference'].sum(axis=0)

sf_columns = [col for col in total_df.columns if 'SF' in col ]
total_df['restored'] = total_df[sf_columns].sum(axis=1)
total_df['restored'] = total_df['restored'] / total_df['restored'].sum(axis=0)


total_df.drop(columns=r1_r2_columns + r2a_columns + sf_columns, inplace=True)
total_df = total_df.drop(["domain", "phylum", "class", "order", "genus", "species"], axis=1)
total_df = total_df.groupby("family").sum().sort_values(by=["unrestored"], ascending=False)
display(total_df.head(10)) #.plot.bar(rot=45, title="Top 10 most abundant families in unrestored samples")

total_df.drop("unbinned_reads", axis=0, inplace=True)
print(total_df['unrestored'].sum(axis=0))
total_df['unrestored'] = total_df['unrestored'] / total_df['unrestored'].sum(axis=0)
total_df['reference'] = total_df['reference'] / total_df['reference'].sum(axis=0)
total_df['restored'] = total_df['restored'] / total_df['restored'].sum(axis=0)
display(total_df.head(10)) #.plot.bar(rot=45, title="Top 10 most abundant families in unrestored samples")

Unnamed: 0_level_0,domain,phylum,class,order,family,genus,species,SF2_A_D1,SF2_A_D2,SF2_B_D1,SF2_B_D2,SF2_C_D1,SF2_C_D2,R1_A_D1,R1_A_D2,R1_B_D1,R1_B_D2,R1_C_D1,R1_C_D2,R2A_A_D1,R2A_A_D2,R2A_B_D1,R2A_B_D2,R2A_C_D1,R2A_C_D2,R2_A_D1,R2_A_D2,R2_B_D1,R2_B_D2,R2_C_D1,R2_C_D2
MAG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
Salt_Pond_MetaG_R1_A_D1_MG_DASTool_bins.concoct_out.44,Bacteria,Desulfobacterota_F,Desulfuromonadia,Desulfuromonadales,SZUA-401,,,1.615020e-05,2.603187e-05,2.649548e-04,0.000669,2.915223e-05,8.666736e-05,1.606082e-03,3.513144e-04,6.565851e-04,6.540739e-04,7.038066e-04,4.111214e-04,0.000013,9.373644e-06,1.927841e-05,1.825189e-05,2.435328e-05,2.403458e-05,1.730179e-04,1.025957e-04,4.709077e-04,1.602998e-04,2.026042e-04,0.000136
Salt_Pond_MetaG_R1_A_D1_MG_DASTool_bins.concoct_out.54,Bacteria,Proteobacteria,Gammaproteobacteria,Thiohalorhabdales,Thiohalorhabdaceae,Thiohalorhabdus,,6.572291e-06,1.965567e-05,1.479960e-04,0.000466,1.171466e-05,7.097312e-05,2.264572e-03,1.570283e-04,1.435145e-04,8.250577e-04,6.861925e-04,3.283171e-04,0.000010,1.148971e-05,1.939399e-05,1.403824e-05,2.524616e-05,2.162926e-05,4.307310e-04,4.551555e-04,1.541019e-04,7.440568e-05,5.364862e-05,0.000050
Salt_Pond_MetaG_R1_A_D1_MG_DASTool_bins.concoct_out.9,Bacteria,Firmicutes_A,Clostridia,Tissierellales,Dethiosulfatibacteraceae,UBA8670,,1.202248e-07,2.720508e-07,9.032961e-07,0.000002,5.761310e-07,2.139101e-06,2.183905e-03,2.689458e-03,1.329072e-03,1.536155e-03,5.272857e-04,1.353737e-03,0.000014,4.546917e-07,2.519600e-06,7.423017e-07,8.259131e-07,2.237510e-07,1.063832e-03,1.023537e-04,7.217087e-04,1.317825e-03,4.773552e-04,0.001232
Salt_Pond_MetaG_R1_A_D1_MG_DASTool_bins.metabat.15,Bacteria,Bacteroidota,Rhodothermia,Balneolales,Balneolaceae,Fodinibius,,2.123972e-06,8.977678e-06,5.627535e-05,0.000142,3.572012e-05,1.775904e-04,3.325915e-03,1.695130e-03,1.127024e-03,1.298435e-03,1.241317e-03,1.447438e-03,0.000002,1.556445e-06,8.252268e-06,2.074078e-06,2.678637e-06,2.237510e-06,2.647330e-04,1.012553e-04,7.059524e-04,3.099446e-04,6.079058e-04,0.000268
Salt_Pond_MetaG_R1_A_D1_MG_DASTool_bins.metabat.18,Bacteria,Bacteroidota,Rhodothermia,Balneolales,Balneolaceae,YR4-1,,3.787082e-06,1.163017e-05,1.219992e-04,0.000317,7.094093e-05,3.012079e-04,7.299975e-03,7.400481e-03,3.534851e-03,6.028596e-03,7.162436e-03,5.732411e-03,0.000006,1.797781e-05,4.373471e-05,2.962657e-05,1.292442e-05,8.670350e-06,1.166038e-03,2.792665e-04,2.674806e-03,1.453738e-03,2.918265e-03,0.001167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Salt_Pond_MetaG_R2_C_H2O_MG_DASTool_bins_metabat.29,,,,,,,,0.000000e+00,1.190222e-07,6.142413e-07,0.000001,1.152262e-07,5.854381e-07,2.741195e-06,2.381851e-06,2.205365e-06,1.409679e-06,6.630418e-06,4.415935e-06,0.000000,3.497628e-08,9.246239e-08,1.309944e-07,6.250153e-07,0.000000e+00,1.242236e-05,1.776314e-03,3.073645e-05,1.769427e-05,1.105792e-05,0.000027
Salt_Pond_MetaG_R2_restored_DShore_MG_DASTool_bins_concoct_out.12,,,,,,,,2.404497e-07,3.553664e-06,4.462283e-06,0.000008,2.112480e-06,1.265222e-04,2.068827e-07,1.912435e-07,1.823986e-07,2.062945e-07,1.785971e-07,2.196983e-07,0.000002,2.658198e-06,1.618092e-06,7.554012e-06,1.562538e-06,5.668358e-06,1.884556e-07,1.303157e-07,7.762791e-07,1.923290e-07,9.323708e-08,0.000004
Salt_Pond_MetaG_R2_restored_DShore_MG_DASTool_bins_concoct_out.37,,,,,,,,1.542885e-06,3.580869e-05,1.482490e-04,0.000420,1.926198e-05,4.484907e-04,9.516602e-05,4.384693e-05,1.266675e-04,4.268577e-05,1.687966e-04,2.084058e-04,0.000002,2.833079e-06,4.368848e-06,3.122034e-06,3.526872e-06,2.162926e-06,4.893564e-05,8.046063e-05,1.336459e-04,9.472844e-05,4.579805e-05,0.000041
Salt_Pond_MetaG_R2_restored_DShore_MG_DASTool_bins_concoct_out.40,,,,,,,,9.097013e-06,1.425036e-04,1.325208e-03,0.003780,1.794840e-04,4.094757e-03,2.354325e-04,1.313321e-04,6.002406e-04,1.137714e-04,9.196412e-04,9.447685e-04,0.000006,9.321180e-06,1.153468e-05,3.143866e-06,8.125200e-06,2.834179e-06,1.663435e-04,2.070716e-04,8.050224e-04,4.169052e-04,1.900918e-04,0.000131


Unnamed: 0_level_0,unrestored,reference,restored
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
unbinned_reads,0.678528,0.893159,0.88897
Rhodobacteraceae,0.066088,0.002533,0.000903
Balneolaceae,0.036025,0.000146,0.009446
Wenzhouxiangellaceae,0.03104,0.000204,0.000108
,0.02625,0.018155,0.018381
Saprospiraceae,0.024237,5.7e-05,2.9e-05
Woeseiaceae,0.010418,0.002033,0.000643
UBA12077,0.010224,3.1e-05,5.8e-05
Sediminispirochaetaceae,0.008751,3.4e-05,5.8e-05
Flavobacteriaceae,0.008356,0.006775,0.002139


0.32147215656753503


Unnamed: 0_level_0,unrestored,reference,restored
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rhodobacteraceae,0.205579,0.023704,0.008131
Balneolaceae,0.112063,0.00137,0.085078
Wenzhouxiangellaceae,0.096556,0.001906,0.000975
,0.081655,0.169926,0.165546
Saprospiraceae,0.075393,0.000535,0.000266
Woeseiaceae,0.032407,0.019029,0.005789
UBA12077,0.031803,0.000287,0.000523
Sediminispirochaetaceae,0.02722,0.000319,0.000519
Flavobacteriaceae,0.025992,0.063411,0.019261
Oleiphilaceae,0.023174,0.001592,0.000722


In [None]:
from pandas import read_csv, set_option

set_option('display.max_columns', None)
set_option('display.max_rows', None)

df = read_csv("../data/family_abundances.csv").set_index("bin")
df.drop([i for i in df.index if "H2O" in i], axis=0, inplace=True)
df2 = df.copy()
df.set_index("gtdb_o", inplace=True)
df.drop(["gtdb_d", "gtdb_p", "gtdb_c", "gtdb_f"], axis=1, inplace=True)
df = df.apply(lambda x: x.str.replace('%', '', regex=False) if x.dtype == "object" else x)
df = df.astype(float)
df = df.groupby("gtdb_o").sum()
df = df.sort_values("unrestored total", ascending=False)
# display(df)

df2.set_index("gtdb_f", inplace=True)
df2.drop(["gtdb_d", "gtdb_p", "gtdb_c", "gtdb_o"], axis=1, inplace=True)
df2 = df2.apply(lambda x: x.str.replace('%', '', regex=False) if x.dtype == "object" else x)
df2 = df2.astype(float)
df2 = df2.groupby("gtdb_f").sum()
df2 = df2.sort_values("unrestored total", ascending=False)
display(df2.head(10))

condensedDF = df2.drop([col for col in df2.columns if "total" not in col], axis=1)
condensedDF.head(10).to_csv("total_family_abundances.csv")

# Correlate the family-level abundances with the methane emissions for each sample

In [49]:
import matplotlib.pyplot as plt
from seaborn import color_palette
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline

from matplotlib.ticker import LogLocator, LogFormatterMathtext, NullFormatter
from sklearn.metrics import r2_score
from adjustText import adjust_text
import json
from pandas import DataFrame

# display(DataFrame(methane_sols).T.sum())

methane_dic = json.load(open("averaged_normalized_methane_unrestored.json", 'r'))
display(methane_dic)
# total_df = read_csv("../Cliff/data/Cliff_Sample_Metadata_BGC_NMR.csv").set_index("Sample")
# emissions = total_df["CH4_umol_m2_d"].loc[unrestored_cols].to_list()

combined = {ID: {"fluxes": emissions, "methane": methane_dic[ID]} for ID, emissions in fluxes_in_emissions.items()}
combined_df = DataFrame(combined).T
methane_ary = np.array(list(combined_df["methane"].values))
emission_fluxes = np.array(list(combined_df["fluxes"].values))

# plot the figure
fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
xs = np.array(methane_ary).reshape(-1, 1)
ys = np.array(emission_fluxes)
if log:
    xs = np.log(xs)
    ys = np.log(ys)
    title = r"Methane emissions $\left[LOG\left(\frac{umol~~CH_4}{m^2*day*\left(\frac{ug DNA}{g soil}\right)}\right)\right]$"
else:
    ax.set_xscale('log'); ax.set_yscale('log')

    # Major ticks = decades
    ax.xaxis.set_major_locator(LogLocator(base=10, subs=(1.0,)))
    ax.yaxis.set_major_locator(LogLocator(base=10, subs=(1.0,)))

    # 9 log-spaced minor ticks per decade
    ax.xaxis.set_minor_locator(LogLocator(base=10, subs=np.arange(2, 10), numticks=100))
    ax.yaxis.set_minor_locator(LogLocator(base=10, subs=np.arange(2, 10), numticks=100))

    # Label only 10^n
    fmt = LogFormatterMathtext(base=10, labelOnlyBase=True)
    ax.xaxis.set_major_formatter(fmt)
    ax.yaxis.set_major_formatter(fmt)
    ax.xaxis.set_minor_formatter(NullFormatter())
    ax.yaxis.set_minor_formatter(NullFormatter())
    plt.setp(ax.get_xticklabels(), rotation=90)
    title = r"Methane emissions $\left[\left(\frac{umol~~CH_4}{m^2*day*\left(\frac{ug DNA}{g soil}\right)}\right)\right]$"

print(xs, ys)

mdl = LinearRegression()
mdl.fit(xs, ys)
ax.scatter(xs, ys)






for family, row in total_df.iterrows():
    
    print(family, row['unrestored'], row['reference'], row['restored'])

{'R1_C': 263.0210073267493,
 'R2_B': 5308.591017144301,
 'R2_C': 5214.314339711316,
 'R2_A': 589.3269001387226,
 'R1_A': 154.04429155649183,
 'R1_B': 350.7323023585634}

NameError: name 'fluxes_in_emissions' is not defined