In [1]:
import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting
import bokeh.transform
import numpy as np
import pandas as pd
import scikit_posthocs as posthoc
import scipy
import scipy.stats

# Render Bokeh figures inline in the notebook's output cells.
bokeh.io.output_notebook()

import urllib.request
# urllib.request.urlretrieve('https://raw.githubusercontent.com/charlesincharge/Caltech-CS155-2022/main/sets/set3/boosting_helper.py', 'boosting_helper.py')

# Figure 6: Characterization in adult rhesus macaque

## Figure 6c - _Ex vivo_ adult macaque brain viral genomes and transcripts

In [2]:
# Load the per-sample measurements for Figure 6c.
df_slice = pd.read_csv("raw-data/fig6c.csv")

# Ratio of the eGFP value to the genomic-control value for every sample.
df_slice["egfp/genomic"] = df_slice["eGFP"] / df_slice["genomic control"]

# Reference level: the mean AAV9 ratio within each nucleic-acid type (xna).
aav9_reference = (
    df_slice.loc[df_slice["variant"] == "AAV9"]
    .groupby("xna", as_index=False)["egfp/genomic"]
    .mean()
    .rename(columns={"egfp/genomic": "mean AAV9 egfp/genomic"})
)

# Express every sample as a fold change over that AAV9 mean. The inner merge
# drops any xna type that has no AAV9 samples to normalize against.
fold_col = "fold-change over mean AAV9"
df_slice = pd.merge(df_slice, aav9_reference, on=["xna"])
df_slice[fold_col] = df_slice["egfp/genomic"] / df_slice["mean AAV9 egfp/genomic"]
df_slice = df_slice.drop(columns="mean AAV9 egfp/genomic")

# Per (variant, xna) summary: mean, sample standard deviation, count, and
# standard error of the mean (std / sqrt(n)).
df_stats = (
    df_slice.groupby(["variant", "xna"])[[fold_col]]
    .agg(mean_fold=(fold_col, "mean"), std_fold=(fold_col, "std"), n=(fold_col, "count"))
    .reset_index()
)
df_stats["error"] = df_stats["std_fold"] / np.sqrt(df_stats["n"])

# One row per sample, carrying its group's summary statistics alongside.
df_mean = pd.merge(df_slice, df_stats, on=["variant", "xna"])

# Error-bar bounds are mean +/- SEM for groups with more than two samples.
# Smaller groups get -1 for both bounds; the plotting cell starts its y-range
# at 0, so presumably this keeps those whiskers out of view.
has_error_bar = df_mean["n"] > 2
df_mean["upper"] = np.where(has_error_bar, df_mean["mean_fold"] + df_mean["error"], -1)
df_mean["lower"] = np.where(has_error_bar, df_mean["mean_fold"] - df_mean["error"], -1)

# Horizontal jitter for the individual points. The generator is seeded so that
# a fresh "Restart & Run All" reproduces exactly the same figure.
JITTER_SEED = 0
rng = np.random.default_rng(JITTER_SEED)
df_mean["jitter"] = rng.normal(0, 0.05, len(df_mean))

# Bokeh categorical coordinate with an offset: (factor, offset within factor).
df_mean["cats"] = list(zip(df_mean["xna"], df_mean["jitter"]))

df_mean.head()

Unnamed: 0,Sample,variant,xna,eGFP,genomic control,egfp/genomic,fold-change over mean AAV9,mean_fold,std_fold,n,error,upper,lower,jitter,cats
0,Sample2_AAV9,AAV9,DNA,5068156.0,6667.496618,760.128776,1.353507,1.0,0.306248,3,0.176812,1.176812,0.823188,0.035503,"(DNA, 0.035503090801642184)"
1,Sample3_AAV9,AAV9,DNA,3400115.0,7425.49871,457.897145,0.815345,1.0,0.306248,3,0.176812,1.176812,0.823188,-0.015929,"(DNA, -0.015928762203478355)"
2,Sample4_AAV9,AAV9,DNA,3287786.0,7043.659635,466.77244,0.831148,1.0,0.306248,3,0.176812,1.176812,0.823188,-0.001652,"(DNA, -0.001651740960831617)"
3,Sample8_CAP.C1,CAP-Mac,DNA,382839.7,6890.747525,55.558517,0.098929,0.093978,0.016315,3,0.00942,0.103397,0.084558,-0.029122,"(DNA, -0.029122005077531113)"
4,Sample9_CAP.C1,CAP-Mac,DNA,359686.7,5972.090626,60.227941,0.107244,0.093978,0.016315,3,0.00942,0.103397,0.084558,-0.022408,"(DNA, -0.022407750096382184)"


In [3]:
# Categorical x-axis factors: one slot per nucleic-acid type.
x_range = ["DNA", "RNA"]
# Bar fill looked up from the 'variant' column (AAV9 -> gray, CAP-Mac -> white).
variant_color = bokeh.transform.factor_cmap('variant', palette=["gray", "white"], factors=["AAV9", "CAP-Mac"])

############### PLOT SETTINGS ###################
figure_width = 400
figure_height = 300
width = 0.35
dodge = width/2  # half a bar width, so the two variants sit side by side

marker_size = 5
error_size=4
error_line_width=0.5
############### PLOT SETTINGS ###################

p = bokeh.plotting.figure(
    x_range=x_range,
    height=figure_height,
    width=figure_width,
    title="Ex vivo adult macaque brain viral genomes and transcripts",
)

############### PLOT SETTINGS ###################
p.xgrid.visible=False
p.axis.minor_tick_line_width=0
p.xaxis.major_label_orientation=45
p.axis.major_tick_in = 0
p.axis.major_label_text_color = "#000000"
p.axis.axis_label_text_align = "right"
p.add_layout(bokeh.models.Legend(), "right")
############### PLOT SETTINGS ###################


def _rows_for(variant):
    """Return the rows of df_mean that belong to one variant."""
    return df_mean.loc[df_mean['variant'] == variant]


def _dodged(field, offset):
    """Shift a categorical field sideways by `offset` on the figure's x-range."""
    return bokeh.transform.dodge(field, offset, range=p.x_range)


def _error_whisker(variant, offset):
    """Build the data source and the styled Whisker for one variant's error bars."""
    source = bokeh.models.ColumnDataSource(data=_rows_for(variant))
    whisker = bokeh.models.Whisker(source=source, base=_dodged('xna', offset),
                                   upper='upper', lower='lower', level='overlay', line_width=error_line_width)
    for head in (whisker.upper_head, whisker.lower_head):
        head.line_width = error_line_width
        head.size = error_size
    return source, whisker


# Mean bars: AAV9 to the left of each factor's centre, CAP-Mac to the right.
for variant, offset in (("AAV9", -dodge), ("CAP-Mac", dodge)):
    p.vbar(source=_rows_for(variant), x=_dodged('xna', offset),
           top='mean_fold', width=width, fill_color=variant_color, line_color='black', legend_label=variant)

# Error bars, read from the 'upper' / 'lower' columns.
source_error_AAV9, w_AAV9 = _error_whisker("AAV9", -dodge)
source_error_C1, w_C1 = _error_whisker("CAP-Mac", dodge)

for whisker in (w_AAV9, w_C1):
    p.add_layout(whisker)

# Individual samples. The trailing space in each legend label keeps the point
# entries separate from the bar entries of the same variant.
point_styles = [
    ("AAV9", -dodge, dict(legend_label="AAV9 ", marker="circle", color="black", line_width=0.25)),
    ("CAP-Mac", dodge, dict(legend_label="CAP-Mac ", marker="square", color=None, line_width=1)),
]
for variant, offset, style in point_styles:
    p.scatter(x=_dodged('cats', offset), y='fold-change over mean AAV9',
              source=_rows_for(variant), size=marker_size, line_color='black', **style)

# Leave 10% headroom above the tallest error bar or data point.
tallest = np.max([df_mean["upper"], df_mean["fold-change over mean AAV9"]])
p.y_range = bokeh.models.Range1d(0, tallest*1.1)
bokeh.io.show(p)

### Statistics for ex vivo macaque brain viral genomes and transcripts

In [4]:
# Two-sample t-test without the equal-variance assumption (equal_var=False),
# AAV9 vs. CAP-Mac, run separately for each nucleic-acid type.
# NOTE(review): the output saved with this cell (P = 0.03571 and P = 0.05714)
# equals 1/28 and 2/35, which looks like exact rank-test p-values rather than
# t-test results -- confirm the saved output was produced by this code.
xna = []
p_val = []

for xna_type, group in df_slice.groupby("xna"):
    a = group.loc[group["variant"] == "AAV9", "fold-change over mean AAV9"].to_numpy()
    b = group.loc[group["variant"] == "CAP-Mac", "fold-change over mean AAV9"].to_numpy()

    xna.append(xna_type)
    p_val.append(scipy.stats.ttest_ind(a, b, equal_var=False).pvalue)

df_slice_stats = pd.DataFrame({"xna": xna, "p value": p_val})

# Format scalars, not one-element Series: implicit float() conversion of a
# single-element Series is deprecated in pandas.
p_by_xna = df_slice_stats.set_index("xna")["p value"]
print("DNA: AAV9 vs. AAV.CAP-Mac, P = %.5f." % p_by_xna["DNA"])
print("RNA: AAV9 vs. AAV.CAP-Mac, P = %.5f." % p_by_xna["RNA"])

DNA: AAV9 vs. AAV.CAP-Mac, P = 0.03571.
RNA: AAV9 vs. AAV.CAP-Mac, P = 0.05714.


## Figure 6d - _In vivo_ adult rhesus macaque pool DNA in brain (intravenous administration; 1 x 10<sup>14</sup> vg/kg total)

In [8]:
df_adult_pool_raw = pd.read_csv("raw-data/fig6d.csv")

# Each variant's count as a fraction of the total count within its
# (animal id, xna, tissue, tissue id) group.
df_adult_pool_raw['normalized count'] = df_adult_pool_raw.groupby(['animal id', 'xna', 'tissue', 'tissue id'])['count'].transform(lambda x: x / x.sum())

# Normalized counts of the rows whose tissue is 'virus', keyed by variant.
enrich_grouped = (
    df_adult_pool_raw.loc[df_adult_pool_raw['tissue'] == 'virus', ['variant', 'normalized count']]
    .rename(columns={'normalized count': 'virus normalized count'})
)

# Make enrichment DataFrame: normalized count relative to the 'virus' rows.
# NOTE(review): assumes a single 'virus' row per variant; more than one would
# duplicate sample rows in this merge -- confirm against the CSV.
df_enrichment = pd.merge(df_adult_pool_raw, enrich_grouped, on=['variant'])
df_enrichment['enrichment'] = df_enrichment['normalized count'] / df_enrichment['virus normalized count']


def _fold_change_over_aav9(group):
    """Return a copy of `group` with enrichment divided by the group's mean AAV9 enrichment."""
    mean_aav9_enrichment = group.loc[group['variant'] == 'AAV9', 'enrichment'].mean()
    return group.assign(**{'fold change': group['enrichment'] / mean_aav9_enrichment})


# Normalize within each (tissue, xna) group. The pieces are built as new frames
# and concatenated once, instead of growing a DataFrame inside the loop and
# assigning into the per-group frames handed out by groupby.
df_delta = pd.concat([_fold_change_over_aav9(group) for _, group in df_enrichment.groupby(['tissue', 'xna'])])

# Specify columns as categorical and set order
df_delta['variant'] = pd.Categorical(df_delta['variant'], ['AAV9', 'AAV-PHP.eB', 'AAV.CAP-B2', 'AAV.CAP-B10',
                                                           'AAV.CAP-B22', 'AAV9.452sub-LUNG1', 'AAV.CAP-Mac', 'AAV.CAP-C2'])
df_delta['animal id'] = pd.Categorical(df_delta['animal id'], ['RMN-001', 'RMN-002', 'virus'])
df_delta['xna'] = pd.Categorical(df_delta['xna'], ['DNA', 'RNA'])

# Sort and reset index
df_delta = df_delta.sort_values(['variant', 'animal id']).reset_index(drop=True)

# Keep only the animal samples (drop the 'virus' rows) and the plotting columns.
cols = ['variant', 'animal id', 'tissue', 'tissue id', 'xna', 'fold change']
df_iv_adult = df_delta[df_delta['animal id'] != 'virus'][cols].reset_index(drop=True)

# (xna, variant) tuple per row.
df_iv_adult['cats'] = list(zip(df_iv_adult['xna'], df_iv_adult['variant']))

# Mean and standard error per (tissue, variant, xna). observed=True restricts
# the result to category combinations that actually occur; the inner merge
# below only ever matches those, so df_plot is the same either way.
df_iv_mean = df_iv_adult.groupby(['tissue', 'variant', 'xna'], observed=True).agg(
    mean=('fold change', 'mean'), std=('fold change', 'std'), n=('fold change', 'count'))
sem = df_iv_mean['std'] / np.sqrt(df_iv_mean['n'])
df_iv_mean['mean fold change'] = df_iv_mean['mean']
df_iv_mean['upper'] = df_iv_mean['mean'] + sem
df_iv_mean['lower'] = df_iv_mean['mean'] - sem

# Attach each group's summary statistics to its individual samples.
df_plot = pd.merge(df_iv_adult, df_iv_mean[['mean fold change', 'upper', 'lower']], on=['tissue', 'variant', 'xna'])
df_plot.head()

Unnamed: 0,variant,animal id,tissue,tissue id,xna,fold change,cats,mean fold change,upper,lower
0,AAV9,RMN-001,brain,RMN-001_b,DNA,0.805147,"(DNA, AAV9)",1.0,1.183425,0.816575
1,AAV9,RMN-001,brain,RMN-001_dcortex,DNA,0.565092,"(DNA, AAV9)",1.0,1.183425,0.816575
2,AAV9,RMN-001,brain,RMN-001_dcaudate,DNA,0.586254,"(DNA, AAV9)",1.0,1.183425,0.816575
3,AAV9,RMN-001,brain,RMN-001_dtemporal,DNA,0.52256,"(DNA, AAV9)",1.0,1.183425,0.816575
4,AAV9,RMN-001,brain,RMN-001_dputamen,DNA,0.470458,"(DNA, AAV9)",1.0,1.183425,0.816575


In [17]:
# Data source for the error bars, and the x-axis factors in order of appearance.
source_error = bokeh.models.ColumnDataSource(data=df_plot)
x_range_brain = list(df_plot['variant'].unique())

# Per-animal marker shape and colour (the last two entries of the
# 8-colour Colorblind palette).
animal_id = ["RMN-002", "RMN-001"]
marker_colors = list(bokeh.palettes.Colorblind[8][6:])
markers = ['diamond', 'plus']

############### PLOT SETTINGS ###################
figure_width = 1000
figure_height = 300
width = 0.75
bar_fill_color='gray'
dodge = width/2  # used below as the jitter width for the individual points

marker_size = 10
error_size=4
error_line_width=0.5
############### PLOT SETTINGS ###################

p = bokeh.plotting.figure(
    x_range=x_range_brain,
    height=figure_height,
    width=figure_width,
    title="In vivo adult rhesus macaque pool DNA in brain (intravenous delivery)",
)

############### PLOT SETTINGS ###################
p.xgrid.visible=False
p.axis.minor_tick_line_width=0
p.xaxis.major_label_orientation=45
p.axis.major_tick_in = 0
p.axis.major_label_text_color = "#000000"
p.axis.axis_label_text_align = "right"
p.add_layout(bokeh.models.Legend(), "right")
############### PLOT SETTINGS ###################


# Mean bars, one per variant.
p.vbar(source=df_plot, x='variant', top='mean fold change',
       width=width, line_color='black', fill_color=bar_fill_color)

# Individual samples, jittered horizontally and styled by animal.
jittered_x = bokeh.transform.jitter('variant', width=dodge, range=p.x_range)
animal_marker = bokeh.transform.factor_mark('animal id', markers, animal_id)
animal_color = bokeh.transform.factor_cmap('animal id', marker_colors, animal_id)
p.scatter(source=df_plot, x=jittered_x, y='fold change',
          marker=animal_marker, size=marker_size, legend_group="animal id",
          color=animal_color)

# Error bars from the 'upper' / 'lower' columns, drawn on the overlay level.
w = bokeh.models.Whisker(source=source_error, base='variant', upper='upper', lower='lower', level='overlay', line_width=error_line_width)
for head in (w.upper_head, w.lower_head):
    head.line_width = error_line_width
    head.size = error_size
p.add_layout(w)

# Leave 10% headroom above the tallest error bar or data point.
y_max = np.max([df_plot['upper'], df_plot['fold change']])
y_upper = y_max*1.1

p.y_range = bokeh.models.Range1d(0, y_upper)
bokeh.io.show(p)

### Statistics for in vivo adult rhesus macaque pool DNA in brain

In [15]:
# Tamhane post hoc test (scikit-posthocs) on fold change, grouped by variant;
# the result is indexed by variant on both axes.
p_val_adult = posthoc.posthoc_tamhane(a=df_plot, val_col="fold change", group_col="variant")

# Number of distinct tissue samples and distinct animals in the plotted data.
n_samples = len(df_plot["tissue id"].unique())
n_animals = len(df_plot["animal id"].unique())
print("n = %i samples across %i animals. " % (n_samples, n_animals))

# Report the two comparisons against AAV9.
p_cap_mac = p_val_adult.loc["AAV9", "AAV.CAP-Mac"]
p_cap_c2 = p_val_adult.loc["AAV9", "AAV.CAP-C2"]
print("AAV9 vs. AAV.CAP-Mac: P = %.3e." % p_cap_mac)
print("AAV9 vs. AAV.CAP-C2: P = %.5f." % p_cap_c2)

n = 12 samples across 2 animals. 
AAV9 vs. AAV.CAP-Mac: P = 2.740e-08.
AAV9 vs. AAV.CAP-C2: P = 0.97500.
