In [8]:
import pandas as pd
import altair as alt
import numpy as np

In [9]:
# Root directory of the results being plotted; the Bracken input tables and the
# HTML charts written by the cells below are all addressed relative to it.
prefix="../../results/23_12_18/"

In [10]:
def bracken2df(infile, threshold):
    """Load a merged Bracken table and reshape it to one row per sample.

    Only the taxon name and the per-sample ``*.bracken_frac`` columns are
    kept. For every sample, fractions below ``threshold`` are summed into an
    extra taxon called ``"other"`` and blanked out (NaN); taxa that end up
    NaN for all samples are removed.

    Parameters
    ----------
    infile : str, path object or file-like
        Tab-separated table readable by ``pd.read_table``. It must contain a
        ``name`` column, which has to be the first column that survives the
        column filter because the ``"other"`` row is assigned positionally.
    threshold : float
        Fractions strictly below this value are pooled into ``"other"``.

    Returns
    -------
    pandas.DataFrame
        One row per sample (row labels start at 1). The first column is
        ``"sample"``, followed by one column per retained taxon in
        alphabetical order and finally ``"other"``.
    """
    table = pd.read_table(infile)

    # Drop read counts and taxonomy metadata. ``drop`` returns a new frame,
    # so the assignments below never write into a view of ``table``.
    unwanted = list(table.filter(regex="bracken_num$|^taxonomy"))
    fractions = table.drop(columns=unwanted)

    sample_ls = list(fractions.filter(regex="bracken_frac"))
    other_row = ["other"]
    for sample in sample_ls:
        below = fractions[sample] < threshold
        # Pool the low-abundance taxa of this sample, then blank them out.
        other_row.append(fractions.loc[below, sample].sum())
        fractions.loc[below, sample] = np.nan

    fractions = fractions.sort_values(["name"])
    # Label -1 cannot clash with the row labels produced by read_table and
    # places "other" after the alphabetically sorted taxa.
    fractions.loc[-1] = other_row

    # Taxa below the threshold in every sample carry no information any more.
    fractions = fractions.dropna(subset=sample_ls, how="all")
    fractions.columns = fractions.columns.str.replace(
        ".bracken_frac", "", regex=False
    )

    # Transpose so samples become rows; the former "name" row becomes the
    # header. The label is written with a single .iloc call: chained
    # assignment (frame["index"][0] = ...) does nothing under Copy-on-Write.
    df_plot = fractions.transpose().reset_index()
    df_plot.iloc[0, 0] = "sample"
    df_plot.columns = df_plot.iloc[0]
    df_plot = df_plot.drop(df_plot.index[0])
    return df_plot

# Class-level Bracken table of this run; fractions below 0.01 are pooled into
# "other" by bracken2df.
# Alternative input, presumably a small local test table (TODO confirm):
#infile="test/merged.bracken_class.txt"
infile=f"{prefix}report/bracken/merged.bracken_class.txt"
bracken_df = bracken2df(infile, 0.01)
# Last expression of the cell: display the reshaped table.
bracken_df

Unnamed: 0,sample,Actinomycetes,Alphaproteobacteria,Bacilli,Bacteroidia,Betaproteobacteria,Caudoviricetes,Clostridia,Coriobacteriia,Epsilonproteobacteria,Flavobacteriia,Gammaproteobacteria,Negativicutes,Verrucomicrobiae,other
1,F01,0.03363,0.01237,0.01372,0.19754,0.06399,,0.11258,0.0173,0.31733,0.02011,0.16096,,0.01562,0.03477
2,S04,0.03109,0.01315,0.01545,0.18385,0.07103,,0.15867,0.01213,0.1484,0.01205,0.31518,0.01234,,0.02671
3,R06,0.02944,0.01202,0.02099,0.23145,0.05713,0.01488,0.18952,0.01153,0.24256,0.01608,0.12312,0.01932,,0.03202
4,J11,0.0525,0.01658,0.01496,0.3714,0.08664,,0.20161,0.02076,0.03364,0.01866,0.14201,0.01297,,0.02823
5,D05,0.01734,0.01112,,0.13087,0.09898,,0.06408,,0.11439,0.06357,0.46485,,,0.03474
6,F07,0.04652,0.0158,0.04393,0.12315,0.09591,,0.2951,0.01784,0.12701,0.01337,0.1599,0.01875,,0.04271
7,J01,0.02462,0.01019,0.01255,0.23769,0.08073,,0.1336,,0.26331,0.02323,0.16453,,,0.04951
8,S09,0.03763,0.015,0.01957,0.22524,0.09631,,0.20862,0.01388,0.09477,0.01169,0.23216,0.01519,,0.02978
9,J02,0.04392,0.02276,0.02901,0.11783,0.17474,,0.13017,0.01206,0.13067,0.03587,0.24688,0.01157,,0.04442
10,S01,0.03944,0.01784,0.01582,0.1613,0.12134,,0.1117,0.01477,0.16876,0.01355,0.29501,0.01091,,0.02949


In [None]:
# Long format for plotting: one row per (sample, Class) pair with the
# fraction in the "share" column.
melt_df=bracken_df.melt(id_vars=['sample'],var_name='Class', value_name='share')
print(melt_df)

In [12]:
# Stacked bar chart of the class-level composition: one bar per sample, one
# coloured segment per class, each bar normalised to 100 %.
bars = (
    alt.Chart(melt_df, title="Relative abundance of bacterial classes")
    .mark_bar()
    # Build the tooltip text ("<Class>: <share as percent>") inside Vega.
    .transform_calculate(
        combined_tooltip="datum.Class + ': ' + format(datum.share, '.2%')"
    )
    .encode(
        alt.X('sample:N').axis(labelFontSize=12, titleFontSize=15).title('Sample'),
        alt.Y('sum(share)', stack='normalize').axis(format='%', labelFontSize=12, titleFontSize=15).title('Relative abundance'),
        color=alt.Color('Class'),
        tooltip='combined_tooltip:N',
    )
    .configure_legend(titleFontSize=15, labelFontSize=12, labelFontStyle="italic")
    .configure_title(fontSize=18)
)

# Write the chart as a standalone HTML file next to the other report files.
bars.save(f'{prefix}report/class_distribution.html')

In [13]:
# Phylum-level Bracken table; fractions below 0.01 are pooled into "other".
infile=f"{prefix}report/bracken/merged.bracken_phylum.txt"
bracken_df = bracken2df(infile, 0.01)
# Long format for plotting: one row per (sample, Phylum) pair, fraction in "share".
melt_df=bracken_df.melt(id_vars=['sample'],var_name='Phylum', value_name='share')
print(melt_df)

    sample          Phylum    share
0      F01  Actinomycetota  0.05112
1      S04  Actinomycetota   0.0434
2      R06  Actinomycetota  0.04106
3      J11  Actinomycetota  0.07349
4      D05  Actinomycetota  0.02231
..     ...             ...      ...
403    R04           other   0.0213
404    R03           other  0.02843
405    J04           other   0.0223
406    R11           other  0.02024
407    D06           other  0.01379

[408 rows x 3 columns]


In [14]:
# Stacked bar chart of the phylum-level composition: one bar per sample, one
# coloured segment per phylum, each bar normalised to 100 %.
bars = (
    alt.Chart(melt_df, title="Relative abundance of bacterial phyla")
    .mark_bar()
    # Build the tooltip text ("<Phylum>: <share as percent>") inside Vega.
    .transform_calculate(
        combined_tooltip="datum.Phylum + ': ' + format(datum.share, '.2%')"
    )
    .encode(
        alt.X('sample:N').axis(labelFontSize=12, titleFontSize=15).title('Sample'),
        alt.Y('sum(share)', stack='normalize').axis(format='%', labelFontSize=12, titleFontSize=15).title('Relative abundance'),
        color=alt.Color('Phylum'),
        tooltip='combined_tooltip:N',
    )
    .configure_legend(titleFontSize=15, labelFontSize=12, labelFontStyle="italic")
    .configure_title(fontSize=18)
)

# Write the chart as a standalone HTML file next to the other report files.
bars.save(f'{prefix}report/phylum_distribution.html')

In [15]:
# Genus-level Bracken table; fractions below 0.01 are pooled into "other".
infile=f"{prefix}report/bracken/merged.bracken_genus.txt"
bracken_df = bracken2df(infile, 0.01)
# Long format for plotting: one row per (sample, Genus) pair, fraction in "share".
melt_df=bracken_df.melt(id_vars=['sample'],var_name='Genus', value_name='share')
print(melt_df)

     sample       Genus    share
0       F01  Acidovorax  0.02958
1       S04  Acidovorax  0.03108
2       R06  Acidovorax  0.02531
3       J11  Acidovorax  0.03479
4       D05  Acidovorax  0.03749
...     ...         ...      ...
2188    R04       other  0.27176
2189    R03       other  0.31427
2190    J04       other  0.26653
2191    R11       other   0.3265
2192    D06       other  0.28683

[2193 rows x 3 columns]


In [16]:
# Stacked bar chart of the genus-level composition: one bar per sample, one
# coloured segment per genus, each bar normalised to 100 %.
bars = (
    alt.Chart(melt_df, title="Relative abundance of bacterial genera")
    .mark_bar()
    # Build the tooltip text ("<Genus>: <share as percent>") inside Vega.
    .transform_calculate(
        combined_tooltip="datum.Genus + ': ' + format(datum.share, '.2%')"
    )
    .encode(
        alt.X('sample:N').axis(labelFontSize=12, titleFontSize=15).title('Sample'),
        alt.Y('sum(share)', stack='normalize').axis(format='%', labelFontSize=12, titleFontSize=15).title('Relative abundance'),
        color=alt.Color('Genus'),
        tooltip='combined_tooltip:N',
    )
    .configure_legend(titleFontSize=15, labelFontSize=12, labelFontStyle="italic")
    .configure_title(fontSize=18)
)

# Write the chart as a standalone HTML file next to the other report files.
bars.save(f'{prefix}report/genus_distribution.html')

In [17]:
# Family-level Bracken table; fractions below 0.01 are pooled into "other".
infile=f"{prefix}report/bracken/merged.bracken_family.txt"
bracken_df = bracken2df(infile, 0.01)
# Long format for plotting: one row per (sample, Family) pair, fraction in "share".
melt_df=bracken_df.melt(id_vars=['sample'],var_name='Family', value_name='share')
print(melt_df)

     sample          Family    share
0       F01  Aeromonadaceae  0.03074
1       S04  Aeromonadaceae   0.0758
2       R06  Aeromonadaceae  0.03295
3       J11  Aeromonadaceae  0.02138
4       D05  Aeromonadaceae  0.06407
...     ...             ...      ...
1321    R04           other   0.1472
1322    R03           other  0.18003
1323    J04           other  0.15273
1324    R11           other  0.17709
1325    D06           other  0.15278

[1326 rows x 3 columns]


In [18]:
# Stacked bar chart of the family-level composition: one bar per sample, one
# coloured segment per family, each bar normalised to 100 %.
bars = (
    alt.Chart(melt_df, title="Relative abundance of bacterial families")
    .mark_bar()
    # Build the tooltip text ("<Family>: <share as percent>") inside Vega.
    .transform_calculate(
        combined_tooltip="datum.Family + ': ' + format(datum.share, '.2%')"
    )
    .encode(
        alt.X('sample:N').axis(labelFontSize=12, titleFontSize=15).title('Sample'),
        alt.Y('sum(share)', stack='normalize').axis(format='%', labelFontSize=12, titleFontSize=15).title('Relative abundance'),
        color=alt.Color('Family'),
        tooltip='combined_tooltip:N',
    )
    .configure_legend(titleFontSize=15, labelFontSize=12, labelFontStyle="italic")
    .configure_title(fontSize=18)
)

# Write the chart as a standalone HTML file next to the other report files.
bars.save(f'{prefix}report/family_distribution.html')

In [5]:
samples = ["I15566-L1", "115_L001", "139_L001"]
# Warning level for human contamination, in percent of all sequences.
cont_warning = 50

# Per-sample summary parsed from the "SN" lines of each stats file
# (presumably `samtools stats` output -- TODO confirm).
sum_dict = {}
for sample in samples:
    sample_sum_dict = {}
    stats_path = f"../../results/23_12_18_test/contamination/stats_{sample}.txt"
    # Reset per sample so a file without a "sequences" line cannot silently
    # reuse the total of the previous sample.
    total = None
    with open(stats_path, "r") as stats:
        for line in stats:
            if line.startswith("SN\tsequences"):
                total = int(line.split(":")[-1].strip())
            elif line.startswith("SN\treads mapped"):
                mapped = int(line.split(":")[-1].strip())
                if not total:
                    raise ValueError(
                        f"{stats_path}: no non-zero 'sequences' count found "
                        "before 'reads mapped'"
                    )
                sample_sum_dict["reads_mapped"] = mapped

                # Fraction (0-1) of sequences that mapped; kept numeric so it
                # can be plotted as a quantitative field.
                prc = mapped / total
                sample_sum_dict["contamination"] = round(prc, 6)

                # cont_warning is a percentage, prc a fraction: compare in
                # the same unit.
                if prc * 100 >= cont_warning:
                    sample_sum_dict["color"] = "#9A0430"
                else:
                    sample_sum_dict["color"] = "#6ea165"

                # Stop at the first match so later lines that also start
                # with "SN\treads mapped" are not read.
                break

    sum_dict[sample] = sample_sum_dict

# One row per sample, with the sample name as a regular column.
sum_df = (
    pd.DataFrame.from_dict(sum_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "sample"})
)
print(sum_df)

      sample  reads_mapped contamination    color
0  I15566-L1        107808      0.005900  #6ea165
1   115_L001          1068      0.000051  #6ea165
2   139_L001           260      0.000013  #6ea165


In [6]:
# Slider that sets the contamination threshold, expressed in percent (0-100).
slider = alt.binding_range(min=0, max=100, step=0.5, name='max human contamination')
selector = alt.param(name='SelectorName', value=50, bind=slider)

# Shared encodings for both layers: contamination fraction on x (rendered as
# a percentage), one row per sample on y.
base_chart = alt.Chart(sum_df, title="% human contamination").encode(
    alt.X('contamination:Q').axis(format='%', labelFontSize=12, titleFontSize=15).title('human contamination'),
    alt.Y('sample:N').axis(labelFontSize=12, titleFontSize=15),
).add_params(
    selector
).properties(width=600).interactive()

# `contamination` is a 0-1 fraction while the slider is in percent, so scale
# the fraction before comparing; bars at or above the threshold turn red.
bars = base_chart.mark_bar().encode(
    color=alt.condition(
        alt.datum.contamination * 100 >= selector,
        alt.value('#9A0430'),
        alt.value('#6ea165'),
    )
)

# Value labels at the end of each bar, formatted as percentages.
chart_text = base_chart.mark_text(
    align='center',
    baseline='middle',
    dx=20,
    fontSize=12,
).encode(
    text=alt.Text("contamination:Q", format='.2%'),
)

charty = bars + chart_text
jchart = alt.JupyterChart(charty)
jchart


JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'layer': [{'mark': {…

In [7]:
#chart.save('chart.html')