In [1]:
import pandas as pd

import bokeh
from bokeh.models.widgets import Panel, Tabs
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.sampledata.commits import data
from bokeh.transform import jitter

import numpy as np
import scipy
from scipy import stats
from scipy import optimize

# Configure Bokeh to render figures inline in the notebook output cells
bokeh.io.output_notebook()

The purpose of this notebook is to generate the plots comparing transgene expression in alveolar type II (ATII) pneumocytes between serotypes and engineered variants.

We can start by importing the CSV file containing the data. Note that AAV9.452sub.LUNG1 was previously named AAV.CAP-A4, so the older name is used throughout this file:

In [2]:
# Path to the quantification CSV.
# THIS WILL NEED TO BE UPDATED TO THE FILE LOCATION IN YOUR PATH
fname = 'CAPA4_ATII_quant.csv'

# Read the table; '#' is passed as the comment character for the parser
df = pd.read_csv(filepath_or_buffer=fname, comment='#')

# Show only the rows without missing values (df itself is left unchanged)
df.dropna(axis=0, how='any')

Unnamed: 0,Virus,Animal,Section,Replicate,Positive,ATII
0,AAV5,1,1,1,1,1
1,AAV5,1,1,2,3,0
2,AAV5,1,1,3,0,0
3,AAV5,1,1,4,0,0
4,AAV5,1,2,1,0,0
...,...,...,...,...,...,...
139,AAV.CAP-A4,6,1,4,78,46
140,AAV.CAP-A4,6,2,1,70,18
141,AAV.CAP-A4,6,2,2,74,37
142,AAV.CAP-A4,6,2,3,65,21


We can start by grouping data by virus and animal, getting the mean of the number of ATII cells transduced. 

In [3]:
# One group per (Virus, Animal) pair
grouped = df.groupby(by=['Virus', 'Animal'])

# Mean ATII count per animal, flattened back into columns; groups whose
# mean is missing are discarded
df_ATII = grouped['ATII'].mean().reset_index().dropna()

We can normalize the transduction value to the amount of transduction by AAV9.

In [4]:
# Boolean mask selecting the AAV9 rows, which act as the reference group
inds = df_ATII['Virus'].eq('AAV9')

# Mean ATII value across the AAV9 animals
aav9_mean = df_ATII.loc[inds, 'ATII'].mean()

# Express every animal's ATII value relative to the AAV9 mean
df_ATII['Normalized Transduction'] = df_ATII['ATII'].div(aav9_mean)

We can then group the data by virus and animal to combine replicates for each animal and compare them to each other.  

In [5]:
# Collapse to one normalized value per (Virus, Animal) pair
grouped = df_ATII.groupby(by=['Virus', 'Animal'])
df_ATII_grouped = grouped['Normalized Transduction'].agg('mean').reset_index()

We can then plot the data.

In [7]:
# Store the per-animal normalized transduction values for plotting
source = ColumnDataSource(df_ATII_grouped)

# Categories for the x axis, in order of first appearance in the raw data
catagories = list(df['Virus'].dropna().unique())

# Initialize the figure at its final size
p = figure(plot_width=350, plot_height=350, x_range=catagories, y_axis_label = 'GFP+ / ATII+ normalized to AAV9')

# Plot each animal as a jittered point within its virus category
p.circle(x=jitter('Virus', width=0.2, range=p.x_range), y='Normalized Transduction', source=source, alpha=1)

mean_width = 0.25     # half-width of the mean bar, in category units
whisker_width = 0.05  # half-width of the SEM whisker caps, in category units

# Add mean and standard-error-of-the-mean (SEM) markers for every virus.
# The markers for the i-th category are drawn at x = i + 0.5 (the same
# 0.5 / 1.5 / 2.5 positions used previously), but the position is now derived
# from the category list itself so markers cannot drift out of step with the
# axis order.
for i, virus in enumerate(catagories):
    values = df_ATII_grouped.loc[df_ATII_grouped['Virus'] == virus, 'Normalized Transduction'].dropna()
    if values.empty:
        # Nothing to summarize for this category
        continue

    center = i + 0.5
    mean = values.mean()
    p.hbar(y=mean, height=0, left=center - mean_width, right=center + mean_width, color='black')

    if len(values) < 2:
        # SEM is undefined for a single observation; draw the mean only
        continue

    # Series.sem() uses the sample standard deviation (ddof=1) divided by the
    # square root of this group's own size, rather than a fixed n of 6.
    std_eot_mean = values.sem()
    upper = mean + std_eot_mean
    lower = mean - std_eot_mean

    p.hbar(y=upper, height=0, left=center - whisker_width, right=center + whisker_width, color='black')
    p.hbar(y=lower, height=0, left=center - whisker_width, right=center + whisker_width, color='black')
    p.segment(center, lower, center, upper, color='black')

# Remove grid lines and minor y ticks for a cleaner figure
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.yaxis.minor_tick_line_color = None

# Font sizes for tick labels and axis labels
p.xaxis.major_label_text_font_size = '12pt'
p.yaxis.major_label_text_font_size = '12pt'
p.xaxis.axis_label_text_font_size = '18pt'
p.yaxis.axis_label_text_font_size = '18pt'

# No background or border fill
p.background_fill_color = None
p.border_fill_color = None

# Render as SVG
p.output_backend = "svg"

show(p)

We can define populations for comparison.

In [10]:
# Split the per-animal table into one sub-table per virus
virus_col = df_ATII_grouped['Virus']

pop_AAV5 = df_ATII_grouped.loc[virus_col == 'AAV5']
pop_AAV9 = df_ATII_grouped.loc[virus_col == 'AAV9']
pop_CAPA4 = df_ATII_grouped.loc[virus_col == 'AAV.CAP-A4']

Next, we can compare those populations using a Welch's t-test:

In [11]:
# Column holding the values being compared
value_col = 'Normalized Transduction'

# Pairwise Welch's t-tests (equal_var=False: variances are not assumed equal)
stat_5_9 = stats.ttest_ind(pop_AAV5[value_col], pop_AAV9[value_col], equal_var=False)
stat_A4_5 = stats.ttest_ind(pop_CAPA4[value_col], pop_AAV5[value_col], equal_var=False)
stat_A4_9 = stats.ttest_ind(pop_CAPA4[value_col], pop_AAV9[value_col], equal_var=False)

# Report the p-value of each comparison, in the order named in the message
p_values = (stat_5_9.pvalue, stat_A4_5.pvalue, stat_A4_9.pvalue)
print("P values between groups are:\n AAV5 and AAV9: %f \n AAV5 and CAPA4: %f \n AAV9 and CAPA4: %f" % p_values)

P values between groups are:
 AAV5 and AAV9: 0.002491 
 AAV5 and CAPA4: 0.000738 
 AAV9 and CAPA4: 0.000813


We can group all the data together to determine the effect size between AAV9.452sub.LUNG1 (AAV.CAP-A4) and AAV9. 

In [15]:
# Average across animals to obtain one normalized transduction value per virus.
# The result is stored under its own name so that the per-animal table
# (df_ATII_grouped) is left intact: the plotting and t-test cells that read it
# keep working if they are re-run after this cell.
df_ATII_by_virus = (
    df_ATII_grouped
    .groupby('Virus')['Normalized Transduction']
    .mean()
    .reset_index()
    .dropna()
)

# Display the per-virus summary
df_ATII_by_virus

Unnamed: 0,Virus,Normalized Transduction
0,AAV.CAP-A4,32.628571
1,AAV5,0.342857
2,AAV9,1.0


We can group the replicates within the same animal and determine the fraction of cells expressing transgene that were positively stained as ATII cells. 

In [16]:
# One group per (Virus, Animal) pair
grouped = df.groupby(by=['Virus', 'Animal'])

# Total the 'Positive' and 'ATII' counts over all rows of each animal
df_positive = grouped[['Positive', 'ATII']].sum().reset_index()

# Fraction ATII positive: ATII count divided by the count of positive cells
df_positive['Fraction ATII Positive'] = df_positive['ATII'].div(df_positive['Positive'])

# Discard animals for which the fraction is missing
df_positive = df_positive.dropna()

# Display the dataframe
df_positive

Unnamed: 0,Virus,Animal,Positive,ATII,Fraction ATII Positive
0,AAV.CAP-A4,1,540,256,0.474074
1,AAV.CAP-A4,2,499,182,0.364729
2,AAV.CAP-A4,3,438,205,0.468037
3,AAV.CAP-A4,4,392,145,0.369898
4,AAV.CAP-A4,5,1157,97,0.083838
5,AAV.CAP-A4,6,527,257,0.487666
6,AAV5,1,8,1,0.125
7,AAV5,2,8,2,0.25
8,AAV5,3,6,3,0.5
9,AAV5,4,7,0,0.0


Now we can plot the data:

In [18]:
# Store the per-animal ATII fractions for plotting
source = ColumnDataSource(df_positive)

# Categories for the x axis, in order of first appearance in the raw data
catagories = list(df['Virus'].dropna().unique())

# Initialize the figure at its final size
p = figure(plot_width=350, plot_height=350, x_range=catagories, y_axis_label = 'Fraction of GFP+ co-localized with proSPC expression')

# Plot each animal as a jittered point within its virus category
p.circle(x=jitter('Virus', width=0.2, range=p.x_range), y='Fraction ATII Positive', source=source, alpha=1)

mean_width = 0.25     # half-width of the mean bar, in category units
whisker_width = 0.05  # half-width of the SEM whisker caps, in category units

# Add mean and standard-error-of-the-mean (SEM) markers for every virus.
# The markers for the i-th category are drawn at x = i + 0.5 (the same
# 0.5 / 1.5 / 2.5 positions used previously), but the position is now derived
# from the category list itself so markers cannot drift out of step with the
# axis order.
for i, virus in enumerate(catagories):
    values = df_positive.loc[df_positive['Virus'] == virus, 'Fraction ATII Positive'].dropna()
    if values.empty:
        # Nothing to summarize for this category
        continue

    center = i + 0.5
    mean = values.mean()
    p.hbar(y=mean, height=0, left=center - mean_width, right=center + mean_width, color='black')

    if len(values) < 2:
        # SEM is undefined for a single observation; draw the mean only
        continue

    # df_positive has had rows with missing fractions dropped, so a group can
    # hold fewer animals than originally dosed. Series.sem() uses the sample
    # standard deviation (ddof=1) and this group's actual size instead of a
    # fixed n of 6.
    std_eot_mean = values.sem()
    upper = mean + std_eot_mean
    lower = mean - std_eot_mean

    p.hbar(y=upper, height=0, left=center - whisker_width, right=center + whisker_width, color='black')
    p.hbar(y=lower, height=0, left=center - whisker_width, right=center + whisker_width, color='black')
    p.segment(center, lower, center, upper, color='black')

# Remove grid lines and minor y ticks for a cleaner figure
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.yaxis.minor_tick_line_color = None

# Font sizes for tick labels and axis labels
p.xaxis.major_label_text_font_size = '12pt'
p.yaxis.major_label_text_font_size = '12pt'
p.xaxis.axis_label_text_font_size = '18pt'
p.yaxis.axis_label_text_font_size = '18pt'

# No background or border fill
p.background_fill_color = None
p.border_fill_color = None

# Render as SVG
p.output_backend = "svg"

show(p)

We can set populations to perform stats with:

In [19]:
# Split the per-animal fraction table into one sub-table per virus
virus_col = df_positive['Virus']

pop_AAV5 = df_positive.loc[virus_col == 'AAV5']
pop_AAV9 = df_positive.loc[virus_col == 'AAV9']
pop_CAPA4 = df_positive.loc[virus_col == 'AAV.CAP-A4']

Next, we can compare those populations using a Welch's t-test:

In [20]:
# Column holding the values being compared
value_col = 'Fraction ATII Positive'

# Pairwise Welch's t-tests (equal_var=False: variances are not assumed equal)
stat_5_9 = stats.ttest_ind(pop_AAV5[value_col], pop_AAV9[value_col], equal_var=False)
stat_A4_5 = stats.ttest_ind(pop_CAPA4[value_col], pop_AAV5[value_col], equal_var=False)
stat_A4_9 = stats.ttest_ind(pop_CAPA4[value_col], pop_AAV9[value_col], equal_var=False)

# Report the p-value of each comparison, in the order named in the message
p_values = (stat_5_9.pvalue, stat_A4_5.pvalue, stat_A4_9.pvalue)
print("P values between groups are:\n AAV5 and AAV9: %f \n AAV5 and CAPA4: %f \n AAV9 and CAPA4: %f" % p_values)

P values between groups are:
 AAV5 and AAV9: 0.859568 
 AAV5 and CAPA4: 0.471008 
 AAV9 and CAPA4: 0.400319


We can see none of these values are statistically significant (as suggested by the data plot). Therefore, the AAV9.452sub.LUNG1 (also called AAV.CAP-A4) variant doesn't seem to be any more specific to ATII cells despite higher transduction levels. 