In [14]:
import numpy as np
import pandas as pd 
import bokeh.io
import bokeh.plotting
import scikit_posthocs as posthoc

bokeh.io.output_notebook()

import urllib.request
# urllib.request.urlretrieve('https://raw.githubusercontent.com/charlesincharge/Caltech-CS155-2022/main/sets/set3/boosting_helper.py', 'boosting_helper.py')

# Analysis required to plot figure 2



In [15]:
df_infant_pool_raw = pd.read_csv("raw-data/fig2.csv")
df_infant_pool_raw.head()

Unnamed: 0,variant,animal id,age,tissue id,xna,route,tissue,count
0,AAV9,RM-001,infant,RM-001_4,DNA,iv,brain,25653
1,AAV9,RM-001,infant,RM-001_8,DNA,iv,brain,33812
2,AAV9,RM-001,infant,RM-001_11,DNA,iv,brain,52424
3,AAV9,RM-001,infant,RM-001_4,RNA,iv,brain,4066
4,AAV9,RM-001,infant,RM-001_8,RNA,iv,brain,18210


Normalize the read count relative to the sum of all counts in each tissue sample. This includes the `virus` sample, which is the recovered viral genomes taken from the pooled virus used for injection for this particular experiment. 

In [16]:
df_infant_pool_raw['normalized count'] = df_infant_pool_raw.groupby(['animal id', 'xna', 'tissue', 'tissue id'])['count'].transform(lambda x: x / x.sum())
enrich_grouped=df_infant_pool_raw.loc[df_infant_pool_raw['tissue']=='virus'][['variant', 'normalized count']]
enrich_grouped.rename(columns={'normalized count': 'virus normalized count'}, inplace=True)

df_infant_pool = pd.merge(df_infant_pool_raw, enrich_grouped,  on=['variant'])
df_infant_pool.head()

Unnamed: 0,variant,animal id,age,tissue id,xna,route,tissue,count,normalized count,virus normalized count
0,AAV9,RM-001,infant,RM-001_4,DNA,iv,brain,25653,0.103894,0.146356
1,AAV9,RM-001,infant,RM-001_8,DNA,iv,brain,33812,0.076046,0.146356
2,AAV9,RM-001,infant,RM-001_11,DNA,iv,brain,52424,0.078973,0.146356
3,AAV9,RM-001,infant,RM-001_4,RNA,iv,brain,4066,0.047577,0.146356
4,AAV9,RM-001,infant,RM-001_8,RNA,iv,brain,18210,0.054123,0.146356


Calculate enrichment as the ratio of `normalized count` over `virus normalized count`

In [17]:
df_infant_pool['enrichment'] = df_infant_pool['normalized count'] / df_infant_pool['virus normalized count']
df_infant_pool.head()

Unnamed: 0,variant,animal id,age,tissue id,xna,route,tissue,count,normalized count,virus normalized count,enrichment
0,AAV9,RM-001,infant,RM-001_4,DNA,iv,brain,25653,0.103894,0.146356,0.709874
1,AAV9,RM-001,infant,RM-001_8,DNA,iv,brain,33812,0.076046,0.146356,0.5196
2,AAV9,RM-001,infant,RM-001_11,DNA,iv,brain,52424,0.078973,0.146356,0.539596
3,AAV9,RM-001,infant,RM-001_4,RNA,iv,brain,4066,0.047577,0.146356,0.32508
4,AAV9,RM-001,infant,RM-001_8,RNA,iv,brain,18210,0.054123,0.146356,0.369802


Express the enrichment as the fold-change relative to the mean AAV9 enrichment of the entire tissue for either DNA or RNA.

In [18]:
# Initialize empty DataFrame that we will populate
df_fold_change = pd.DataFrame()

for _, group in df_infant_pool.groupby(['tissue', 'xna']):
    mean_aav9_enrichment = group.loc[group['variant']=='AAV9']['enrichment'].mean()
    group['fold change'] = group['enrichment'] / mean_aav9_enrichment
    
    # df_delta = df_delta.append(group)
    df_fold_change = pd.concat([df_fold_change, group])

######## Make some changes for neatness ############
# Specify columns as categorical and set order
df_fold_change['variant'] = pd.Categorical(df_fold_change['variant'], ['AAV9', 'AAV-PHP.eB', 'AAV.CAP-B2', 'AAV.CAP-B10',
                                                                       'AAV.CAP-B22', 'AAV9.452sub-LUNG1', 'AAV.CAP-Mac', 'AAV.CAP-C2'])
df_fold_change['animal id'] = pd.Categorical(df_fold_change['animal id'], ['RM-001', 'RM-002', 'virus'])
df_fold_change['xna'] = pd.Categorical(df_fold_change['xna'], ['DNA', 'RNA'])

# Sort and reset index
df_fold_change = df_fold_change.sort_values(['variant', 'animal id']).reset_index(drop=True)
######## Make some changes for neatness ############

df_fold_change.head()

Unnamed: 0,variant,animal id,age,tissue id,xna,route,tissue,count,normalized count,virus normalized count,enrichment,fold change
0,AAV9,RM-001,infant,RM-001_4,DNA,iv,brain,25653,0.103894,0.146356,0.709874,1.440737
1,AAV9,RM-001,infant,RM-001_8,DNA,iv,brain,33812,0.076046,0.146356,0.5196,1.054564
2,AAV9,RM-001,infant,RM-001_11,DNA,iv,brain,52424,0.078973,0.146356,0.539596,1.095147
3,AAV9,RM-001,infant,RM-001_4,RNA,iv,brain,4066,0.047577,0.146356,0.32508,0.820363
4,AAV9,RM-001,infant,RM-001_8,RNA,iv,brain,18210,0.054123,0.146356,0.369802,0.933223


Calculate the mean and standard error.

In [19]:
cols = ['variant', 'animal id', 'tissue', 'tissue id', 'xna', 'fold change']
df_fold_change = df_fold_change[(df_fold_change['animal id']!='virus')][cols].reset_index(drop=True)


df_fold_change['cats'] = df_fold_change.apply(lambda x: (x['xna'], x['variant']), axis=1)
df_stats = df_fold_change.groupby(['tissue', 'variant', 'xna']).agg(mean=('fold change', 'mean'), std=('fold change', 'std'), n=('fold change', 'count'))
df_stats['mean fold change'] = df_stats['mean']
df_stats['upper'] = df_stats['mean'] + df_stats['std'] / np.sqrt(df_stats['n'])
df_stats['lower'] = df_stats['mean'] - df_stats['std'] / np.sqrt(df_stats['n'])

df_plot = pd.merge(df_fold_change, df_stats[['mean fold change', 'upper', 'lower']], on=['tissue', 'variant', 'xna'])
df_plot = df_plot.sort_values(by=["xna", "variant"])
df_plot.head()

Unnamed: 0,variant,animal id,tissue,tissue id,xna,fold change,cats,mean fold change,upper,lower
0,AAV9,RM-001,brain,RM-001_4,DNA,1.440737,"(DNA, AAV9)",1.0,1.105906,0.894094
1,AAV9,RM-001,brain,RM-001_8,DNA,1.054564,"(DNA, AAV9)",1.0,1.105906,0.894094
2,AAV9,RM-001,brain,RM-001_11,DNA,1.095147,"(DNA, AAV9)",1.0,1.105906,0.894094
3,AAV9,RM-002,brain,RM-002_4,DNA,0.765429,"(DNA, AAV9)",1.0,1.105906,0.894094
4,AAV9,RM-002,brain,RM-002_6,DNA,0.745074,"(DNA, AAV9)",1.0,1.105906,0.894094


In [42]:
df_brain_plot = df_plot[df_plot['tissue']=="brain"]
source_error = bokeh.models.ColumnDataSource(data=df_brain_plot)

x_range = list(df_brain_plot['cats'].unique())
animal_id = list(df_brain_plot['animal id'].unique())
marker_colors = list(bokeh.palettes.Colorblind[5])
markers = ['circle', 'triangle', 'square', 'diamond', 'plus']

############### PLOT SETTINGS ###################
figure_width = 700
figure_height = 350
bar_width = .75
bar_fill_color='gray'
jitter_width = 0.3
marker_size = 10
bar_line_width = 1

error_size=5
error_line_width=.25
marker_line_width=1
############### PLOT SETTINGS ###################


p = bokeh.plotting.figure(x_range=bokeh.models.FactorRange(*x_range), title="Brain AAV barcode quantification", 
                          height=figure_height, width=figure_width)

############### PLOT SETTINGS ###################
p.xgrid.visible=False
p.axis.minor_tick_line_width=0
p.xaxis.major_label_orientation=np.pi/4
p.axis.major_tick_in = 0
p.axis.major_label_text_color="#000000"
p.axis.major_label_text_font_size="12pt"
p.xaxis.group_text_font_size="12pt"
p.xaxis.group_text_color="#000000"
p.axis.major_label_standoff=5
p.add_layout(bokeh.models.Legend(), "right")
############### PLOT SETTINGS ###################

w = bokeh.models.Whisker(source=source_error, base='cats', upper='upper', lower='lower', level='overlay', line_width=error_line_width)

w.upper_head.line_width=error_line_width
w.upper_head.size=error_size
w.lower_head.line_width=error_line_width
w.lower_head.size=error_size

p.vbar(source=df_brain_plot[["cats", "mean fold change"]].drop_duplicates(), x='cats', top='mean fold change',
       width=bar_width, line_color='black', fill_color=bar_fill_color, line_width=bar_line_width)

p.scatter(source=df_brain_plot, x=bokeh.transform.jitter('cats', width=jitter_width, range=p.x_range), y='fold change', 
          marker=bokeh.transform.factor_mark('animal id', markers, animal_id), size=marker_size,
          color=bokeh.transform.factor_cmap('animal id', marker_colors, animal_id), legend_field="animal id")

p.add_layout(w)

y_max = np.max([df_brain_plot['upper'], df_brain_plot['fold change']])
y_upper = np.floor(y_max) - np.floor(y_max)%2 + 2

p.y_range = bokeh.models.Range1d(0, y_upper)
bokeh.io.show(p)

In [43]:
df_liver_plot = df_plot[df_plot['tissue']=="liver"]
source_error = bokeh.models.ColumnDataSource(data=df_liver_plot)


p = bokeh.plotting.figure(x_range=bokeh.models.FactorRange(*x_range), title="Liver AAV barcode quantification", 
                          height=figure_height, width=figure_width)

#################
p.xgrid.visible=False
p.axis.minor_tick_line_width=0
p.xaxis.major_label_orientation=np.pi/4
p.axis.major_tick_in = 0
p.axis.major_label_text_color="#000000"
p.axis.major_label_text_font_size="12pt"
p.xaxis.group_text_font_size="12pt"
p.xaxis.group_text_color="#000000"
p.axis.major_label_standoff=5
p.add_layout(bokeh.models.Legend(), "right")
#################

w = bokeh.models.Whisker(source=source_error, base='cats', upper='upper', lower='lower', level='overlay', line_width=error_line_width)

w.upper_head.line_width=error_line_width
w.upper_head.size=error_size
w.lower_head.line_width=error_line_width
w.lower_head.size=error_size

p.vbar(source=df_liver_plot[["cats", "mean fold change"]].drop_duplicates(), x='cats', top='mean fold change',
       width=bar_width, line_color='black', fill_color=bar_fill_color, line_width=bar_line_width)

p.scatter(source=df_liver_plot, x=bokeh.transform.jitter('cats', width=jitter_width, range=p.x_range), y='fold change', 
          marker=bokeh.transform.factor_mark('animal id', markers, animal_id), size=marker_size,
          color=bokeh.transform.factor_cmap('animal id', marker_colors, animal_id), legend_field="animal id")

p.add_layout(w)

y_max = np.max([df_liver_plot['upper'], df_liver_plot['fold change']])
y_upper = np.floor(y_max) - np.floor(y_max)%2 + 2

p.y_range = bokeh.models.Range1d(0, y_upper)
bokeh.io.show(p)

In [32]:
df_brain_dna = df_brain_plot.loc[df_brain_plot["xna"]=="DNA"]

print("n = %i samples across %i animals. " % (len(df_brain_dna["tissue id"].unique()), len(df_brain_dna["animal id"].unique())))

p_val_brain_dna = posthoc.posthoc_tamhane(a=df_brain_dna, val_col="fold change", group_col="variant")
p_val_brain_dna.loc["AAV9", "AAV.CAP-Mac"]
p_val_brain_dna.loc["AAV9", "AAV.CAP-C2"]

n = 6 samples across 2 animals. 


1.0550107452100832e-06

In [33]:
df_brain_rna = df_brain_plot.loc[df_brain_plot["xna"]=="RNA"]

print("n = %i samples across %i animals. " % (len(df_brain_rna["tissue id"].unique()), len(df_brain_rna["animal id"].unique())))

p_val_brain_rna = posthoc.posthoc_tamhane(a=df_brain_rna, val_col="fold change", group_col="variant")
p_val_brain_rna.loc["AAV9", "AAV.CAP-Mac"]
p_val_brain_rna.loc["AAV9", "AAV.CAP-C2"]

n = 6 samples across 2 animals. 


0.0018957612622073627

In [35]:
df_liver_dna = df_liver_plot.loc[df_liver_plot["xna"]=="DNA"]

print("n = %i samples across %i animals. " % (len(df_liver_dna["tissue id"].unique()), len(df_liver_dna["animal id"].unique())))

p_val_liver_dna = posthoc.posthoc_tamhane(a=df_liver_dna, val_col="fold change", group_col="variant")
p_val_liver_dna.loc["AAV9", "AAV.CAP-Mac"]
p_val_liver_dna.loc["AAV9", "AAV.CAP-C2"]

n = 2 samples across 2 animals. 


0.00490684855031398

In [36]:
df_liver_rna = df_liver_plot.loc[df_liver_plot["xna"]=="RNA"]

print("n = %i samples across %i animals. " % (len(df_liver_rna["tissue id"].unique()), len(df_liver_rna["animal id"].unique())))

p_val_liver_rna = posthoc.posthoc_tamhane(a=df_liver_rna, val_col="fold change", group_col="variant")
p_val_liver_rna.loc["AAV9", "AAV.CAP-Mac"]
p_val_liver_rna.loc["AAV9", "AAV.CAP-C2"]

n = 2 samples across 2 animals. 


0.21339813612277359