In [None]:
import base
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rushd as rd
import scipy as sp
import seaborn as sns

# enables concurrent editing of base.py
from importlib import reload
reload(base)

### Load data

Two-gene data (`data`)

In [None]:
# Locations of the exported flow cytometry files for exp93
base_path = rd.datadir / 'instruments' / 'data' / 'attune' / 'kasey'
exp93_path = base_path / '2024.04.14_exp93' / 'export'

# One row per plate: each plate is recorded as its own biorep, and all three
# share the same well-layout yaml
plate_dirs = [exp93_path / 'plate1', exp93_path / 'plate2', exp93_path / 'plate3']
plates = pd.DataFrame({
    'data_path': plate_dirs,
    'yaml_path': [exp93_path / 'exp93_wells.yaml'] * 3,
    'biorep': [1, 2, 3],
    'exp': ['exp93'] * 3,
})

cache_path = rd.rootdir / 'output' / 'fig_architecture' / 'data.gzip'

# Load data: reuse the parquet cache when it exists, otherwise build it from the raw exports
data = pd.DataFrame()
if cache_path.is_file():
    data = pd.read_parquet(cache_path)
else:
    channel_list = ['mRuby2-A','FSC-A','SSC-A','tagBFP-A','mGL-A','SNAP-647-A']
    data = rd.flow.load_groups_with_metadata(plates, columns=channel_list)

    # Remove negative channel values: keep only events that are strictly
    # positive in every channel, then drop rows containing any missing value
    all_positive = (data[channel_list] > 0).all(axis=1)
    data = data[all_positive].dropna()

    data.to_parquet(rd.outfile(cache_path))

In [None]:
# Add metadata for constructs
# Both metadata tables are left-joined onto the events: `metadata` is keyed on
# 'construct' and `metadata_construct2` on 'construct2'.
plasmid_dir = rd.datadir / 'projects' / 'miR-iFFL' / 'plasmids'
metadata_path = plasmid_dir / 'construct-metadata.xlsx'
metadata = base.get_metadata(metadata_path)
metadata_construct2 = pd.read_excel(plasmid_dir / 'construct2-metadata.xlsx')

data = (
    data
    .merge(metadata, how='left', on='construct')
    .merge(metadata_construct2, how='left', on='construct2')
)

# A condition is the pairing of the two constructs
data['condition'] = data['construct'] + '_' + data['construct2']

# Rename far-red channel
data = data.rename(columns={'SNAP-647-A': 'iRFP-A'})
display(data)

In [None]:
# Gate cells
# Per-experiment gate for each channel: the 99.9th percentile of that channel
# among events from the GEEC555 construct.
channel_list = ['mGL-A', 'mRuby2-A']
gating_sample = data[data['construct']=='GEEC555']  # filter once rather than once per channel
gates = pd.DataFrame()
for channel in channel_list:
    gates[channel] = gating_sample.groupby(['exp'])[channel].apply(lambda x: x.quantile(0.999))
gates.reset_index(inplace=True)

# Add missing iRFP gate
# Assigned as a scalar so the value broadcasts to every experiment row; a
# one-element list only fits when `gates` has exactly one row.
gate_iRFP = 2.5e2
gates['iRFP-A'] = gate_iRFP

# Indicate which channels are relevant for each experiment
gates.sort_values(['exp'], inplace=True)
gates['marker'] = 'iRFP-A'
gates['output'] = 'mRuby2-A'

# Gate data by transfection marker expression
# NOTE(review): base.gate_data is defined in base.py; it presumably adds the
# boolean 'expressing' column used below -- confirm against base.py.
data = data.groupby('exp')[data.columns].apply(lambda x: base.gate_data(x,gates))
data.reset_index(inplace=True, drop=True)

# Events kept for analysis: flagged as expressing and not from the 'UT' construct
df = data[(data['expressing']) & (data['construct']!='UT')]

In [None]:
# Bin data and calculate statistics
by = ['condition','construct','construct2','biorep','exp']
df_quantiles, stats, _, fits = base.calculate_bins_stats(df, by=by)

sort_keys = ['design','ts_kind','ts_num']
df_quantiles = df_quantiles.sort_values(sort_keys)


def _with_construct_metadata(table):
    """Left-join both construct metadata tables onto `table`.

    The table is sorted by design / ts_kind / ts_num after the first join;
    the second (left) join keeps that row order.
    """
    return (
        table
        .merge(metadata, how='left', on='construct')
        .sort_values(sort_keys)
        .merge(metadata_construct2, how='left', on='construct2')
    )


stats = _with_construct_metadata(stats)
fits = _with_construct_metadata(fits)


In [None]:
# output range of 5-95 percentile
def _output_percentile(q, name):
    """Return one row per `by` group holding the q-th percentile of 'output' in column `name`."""
    per_group = df.groupby(by)[['output']].apply(lambda x: np.percentile(x, q))
    return per_group.rename(name).reset_index()


p_5 = _output_percentile(5, 'output_5th')
p_95 = _output_percentile(95, 'output_95th')

# Both merges join on the columns the tables share with `stats`
stats = stats.merge(p_5, how='left').merge(p_95, how='left')

# Width of the 5th-95th percentile window, in linear units and in decades
low, high = stats['output_5th'], stats['output_95th']
stats['output_range'] = high - low
stats['output_range_log'] = high.apply(np.log10) - low.apply(np.log10)

# fraction within 1 order of magnitude (10x) around median
def get_high_low(df):
    """Return the fraction of 'output' values lying within a 10x window centred on the median.

    The window runs from median * 10**-0.5 to median * 10**0.5 (both bounds
    exclusive). The denominator is the number of non-missing 'output' values.
    """
    output = df['output']
    center = output.median()
    lower = center * 10**(-0.5)
    upper = center * 10**0.5
    in_window = (output > lower) & (output < upper)
    return in_window.sum() / output.count()

# Apply get_high_low to every `by` group and attach the result to the summary table
fraction = (
    df.groupby(by)[df.columns]
    .apply(get_high_low)
    .rename('fraction_within_10x')
    .reset_index()
)
stats = stats.merge(fraction, how='left')

In [None]:
# Build one metadata row per condition (construct + construct2 pair), then derive
# color / marker lookup dicts keyed by gene, by kind, and by condition.
# Statement order matters: each dict is a snapshot of the 'color' / 'markers'
# columns at the point where it is created.
metadata_comb = data.drop_duplicates('condition')[['construct','construct2','condition']]
metadata_comb = metadata_comb.merge(metadata, how='left', on='construct')
metadata_comb = metadata_comb.merge(metadata_construct2, how='left', on='construct2')

# Create color palette by architecture
# (rows whose 'gene' is none of 1T / 2T / 2V are left without a color here)
metadata_comb.loc[metadata_comb['gene']=='1T', 'color'] = base.colors['teal']
metadata_comb.loc[metadata_comb['gene']=='2T', 'color'] = base.colors['green']
metadata_comb.loc[metadata_comb['gene']=='2V', 'color'] = base.colors['purple']

# markers: default 'X', overridden per architecture
metadata_comb['markers'] = 'X'
metadata_comb.loc[metadata_comb['gene']=='1T', 'markers'] = 'o'
metadata_comb.loc[metadata_comb['gene']=='2T', 'markers'] = 'D'
metadata_comb.loc[metadata_comb['gene']=='2V', 'markers'] = 's'

# Short display labels for the 'ts_kind' values
gene_label = {'na': 'base', 'NT': 'OL', 'T': 'CL', 'none': '–'}
metadata_comb['ts_label'] = metadata_comb['ts_kind'].replace(gene_label)

# Lookups keyed by gene. to_dict('dict') returns {column: {index: value}}.
# NOTE(review): 'gene' repeats across rows, so presumably the last row per gene
# wins -- harmless here because color/markers depend only on gene at this point.
metadata_dict = metadata_comb.set_index('gene').to_dict('dict')
gene_palette = metadata_dict['color']
gene_markers = metadata_dict['markers']

# Lookups keyed by kind = '<gene>_<design>'
metadata_comb['kind'] = metadata_comb['gene'] + '_' + metadata_comb['design'].astype(str)
metadata_dict = metadata_comb.set_index('kind').to_dict('dict')
kind_palette = metadata_dict['color']
kind_markers = metadata_dict['markers']

# Create color palette by condition
# These overrides are applied after the gene/kind palettes were captured, so they
# only affect condition_palette. Later lines take precedence over earlier ones.
metadata_comb.loc[(metadata_comb['gene']=='2V') & (metadata_comb['construct2_promoter']=='U6'), 'color'] = base.colors['blue']
metadata_comb.loc[(metadata_comb['ts_kind']=='NT'), 'color'] = base.colors['gray']
metadata_comb.loc[metadata_comb['group'].isin(['base','marker']), 'color'] = 'black'
metadata_comb['condition'] = metadata_comb['construct'] + '_' + metadata_comb['construct2']
metadata_dict = metadata_comb.set_index('condition').to_dict('dict')
condition_palette = metadata_dict['color']
condition_markers = metadata_dict['markers']

# Plotting order of the architectures
architecture_order = ['1T', '2T', '2V']

In [None]:
# Inspect the kind -> marker lookup (keys are '<gene>_<design>' strings)
display(kind_markers)

Load stochastic simulations (`data_sim`)

In [None]:
cache_path = rd.datadir/'projects'/'miR-iFFL'/'modeling'/'julia_stochastic_simulations'/'stochastic_sims.gzip'
data_sim = pd.read_parquet(cache_path)

# Rename simulation columns to the names used for the experimental data
data_sim = data_sim.rename(columns={'copynum': 'copy_num', 'reg_gene': 'output', 'unreg_gene': 'marker'})

# Simulation design label -> (architecture code, numeric design index)
design_key = {
    'Design 1': ('1T', 1),
    'Design 2': ('1T', 2),
    'Design 3': ('1T', 3),
    'Dual Vector': ('2V', 0),
    'Dual Transcript': ('2T', 0),
}
gene_by_design = {label: gene for label, (gene, number) in design_key.items()}
number_by_design = {label: number for label, (gene, number) in design_key.items()}

# 'gene' must be derived before 'design' is overwritten with its numeric index
data_sim['gene'] = data_sim['design'].map(gene_by_design)
data_sim['design'] = data_sim['design'].map(number_by_design)
display(data_sim)

In [None]:
# Bin data and calculate statistics
by = ['design','moi','risc','gene']
_, stats_sim, _, fits_sim = base.calculate_bins_stats(data_sim, by=by)

# Tag every simulation table with kind = '<gene>_<design>'
for table in (data_sim, stats_sim, fits_sim):
    table['kind'] = table['gene'] + '_' + table['design'].astype(str)

### Set up figure

In [None]:
# Font sizes are defined once in base.py
base_size = base.font_sizes['base_size']
smaller_size = base.font_sizes['smaller_size']

# seaborn theme: ticks style, paper context, Arial sans-serif at the base size
sns.set_style('ticks')
font_rc = {'font.size': base_size, 'font.family': 'sans-serif', 'font.sans-serif':['Arial']}
sns.set_context('paper', font_scale=1.0, rc=font_rc)

# matplotlib overrides applied on top of the seaborn theme
rc_overrides = {
    # text sizes
    'axes.titlesize': base_size,
    'axes.labelsize': base_size,
    'xtick.labelsize': smaller_size,
    'ytick.labelsize': smaller_size,
    # PDF font type
    'pdf.fonttype': 42,
    # tick geometry
    'ytick.major.size': 3,
    'xtick.major.size': 3,
    'ytick.minor.size': 2,
    'ytick.major.pad': 2,
    'xtick.major.pad': 2,
    # lines and axes
    'lines.linewidth': 1,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.labelpad': 2,
}
plt.rcParams.update(rc_overrides)

In [None]:
# Create the overall figure, gridspec, and add subfigure labels
fig = plt.figure(figsize=(6.8504,5.375))
fig_gridspec = matplotlib.gridspec.GridSpec(
    2, 4, figure=fig,
    wspace=0.4, hspace=0.4,
    height_ratios=[2,3.25],
    width_ratios=[0.27,0.23,0.1,0.4],
)

# Grid cells occupied by each lettered panel
panel_slots = {
    'A': fig_gridspec[0,0],
    'B': fig_gridspec[0,3:],
    'C': fig_gridspec[1,:2],
    'D': fig_gridspec[1,2:],
}
subfigures = {panel: fig.add_subfigure(slot) for panel, slot in panel_slots.items()}

# Bold panel letter anchored at the top-left corner of each subfigure
for label, subfig in subfigures.items():
    panel_letter = matplotlib.text.Text(
        x=0, y=1, text=f'{label}',
        fontsize=base.font_sizes['subpanel_label'], fontweight='bold',
        verticalalignment='top', transform=subfig.transSubfigure,
    )
    subfig.add_artist(panel_letter)

scatter_kwargs = dict(s=4, jitter=0.1, linewidth=0.5, edgecolor='white')

# Save to the project output folder and to the manuscript figure links folder
output_path = rd.rootdir/'output'/'fig_architecture'
fig_name = 'fig_architecture.pdf'
for destination in (output_path/fig_name,
                    rd.datadir/'manuscripts'/'2024_miR-iFFL'/'figures'/'links'/fig_name):
    fig.savefig(rd.outfile(destination))

In [None]:
# two-gene architectures with 3'UTR target sites
def _select_panel_conditions(frame):
    """Return a copy of the rows of `frame` belonging to the conditions compared in this panel.

    A row is kept when it matches any of:
      - gene '1T', design 1, group 'controller'
      - gene '2T', group 'dual', ts_loc "3'"
      - gene '2V', group 'ts3', construct2_promoter 'EF1a'
      - group 'base' with construct2_promoter other than 'U6'
    """
    keep = (
        ((frame['gene']=='1T') & (frame['design']==1) & (frame['group']=='controller')) |
        ((frame['gene']=='2T') & (frame['group']=='dual') & (frame['ts_loc']=='3\'')) |
        ((frame['gene']=='2V') & (frame['group']=='ts3') & (frame['construct2_promoter']=='EF1a')) |
        ((frame['group']=='base') & (frame['construct2_promoter']!='U6'))
    )
    return frame[keep].copy()


# The same selection is applied to all three tables so they stay in sync
stats_subset = _select_panel_conditions(stats)
stats_subset.sort_values(['gene','construct2_promoter','group','ts_kind'], inplace=True)
fits_subset = _select_panel_conditions(fits)
fits_subset.sort_values(['gene','construct2_promoter','group','ts_kind'], inplace=True)
df_quantiles_subset = _select_panel_conditions(df_quantiles)
df_quantiles_subset.sort_values(['gene','construct2_promoter'], inplace=True)

# shift xticks to add more space between architecture groups
buffer = 0.6
gene_order = ['1T', '2T', '2V']
xtick_locs = [0,1,2, 3+buffer,4+buffer, 5+buffer*2,6+buffer*2,7+buffer*2,]

# Conditions are assigned x positions in their sorted order of appearance.
# `.map` is used (as for metadata_comb below) so the result is always a numeric
# column; `.replace` from strings to floats depends on pandas' deprecated silent
# downcasting and may leave an object-dtype column, which breaks native_scale plots.
condition_loc = dict(zip(stats_subset['condition'].unique(), xtick_locs))
stats_subset['condition_loc'] = stats_subset['condition'].map(condition_loc)

condition_loc = dict(zip(fits_subset['condition'].unique(), xtick_locs))
fits_subset['condition_loc'] = fits_subset['condition'].map(condition_loc)

# Uses the mapping derived from fits_subset (the last one assigned above)
metadata_comb['condition_loc'] = metadata_comb['condition'].map(condition_loc)
# Rows of metadata_comb without any missing value, with the x position as a string;
# built with .assign to avoid assigning into the result of dropna()
m = metadata_comb.dropna().assign(condition_loc=lambda t: t['condition_loc'].astype(str))

# x-limits: half a unit of padding on each side plus one buffer per gap between groups
xlim_adjusted = (-0.5, len(stats_subset['condition'].unique())-0.5+buffer*(len(gene_order)-1))
scatter_kwargs2 = dict(s=4, jitter=0.1, linewidth=0.5, edgecolor='white', native_scale=True)

In [None]:
# Panel B: per-condition output geometric mean (left) and fitted slope (right),
# drawn at the manually spaced x positions in 'condition_loc'.
subfig = subfigures['B']
# NOTE(review): margins are presumably given in inches (per the helper's name) -- confirm in rushd
rd.plot.adjust_subplot_margins_inches(subfig, left=0.35, bottom=0.45, top=0.35, right=0.1)
axes = subfig.subplots(1,2, gridspec_kw=dict(width_ratios=(1,1), wspace=0.3))

plot_df = stats_subset
plot_df2 = fits_subset

# stat gmean
ax = axes[0]
# One stripplot call per condition so each condition can use its own marker;
# the loop variable `construct` actually holds the condition key.
for construct, group in plot_df.groupby('condition', sort=False):
    sns.stripplot(data=group, x='condition_loc', y='output_gmean', hue='condition', palette=condition_palette,
                    legend=False, ax=ax, marker=condition_markers[construct], **scatter_kwargs2)
ax.set(title='Output mean', xlim=xlim_adjusted, xlabel='', ylabel='', yscale='log', xticks=xtick_locs)
# Dotted reference line: mean output_gmean over all `stats` rows in group 'marker'
marker_baseline = stats.loc[(stats['group']=='marker'), 'output_gmean'].mean()
ax.axhline(marker_baseline, color='black', ls=':')

# slope
ax = axes[1]
for construct, group in plot_df2.groupby('condition', sort=False):
    sns.stripplot(data=group, x='condition_loc', y='slope', hue='condition', palette=condition_palette,
                    legend=False, ax=ax, marker=condition_markers[construct], **scatter_kwargs2)
ax.set(title='Slope', xlim=xlim_adjusted, xlabel='', ylabel='', xticks=xtick_locs,)

# Shared decoration for both axes
for i,ax in enumerate(axes):
    # y offset of the architecture labels, in points relative to the bottom of the axes
    yloc = -6
    # Shade the x range covering the middle architecture group
    ax.axvspan(2.5+buffer/2, 4.5+buffer*1.5, color=base.get_light_color(base.colors['gray']), alpha=0.2,)
    # Architecture names centered under each group (x in data coordinates, y in axes points)
    ax.annotate(architecture_order[0], (1,yloc), xycoords=('data','axes points'), ha='center', va='top', ma='center', fontsize=smaller_size)
    ax.annotate(architecture_order[1], (3.5+buffer,yloc), xycoords=('data','axes points'), ha='center', va='top', ma='center', fontsize=smaller_size)
    ax.annotate(architecture_order[2], (6+buffer*2,yloc), xycoords=('data','axes points'), ha='center', va='top', ma='center', fontsize=smaller_size)
    # Keep the ticks but blank out their labels
    ax.set_xticklabels(['']*len(ax.get_xticklabels()))
    ax.set_xlabel(ax.get_xlabel(), labelpad=10)
    sns.despine(ax=ax)

# Re-save the figure now that this panel has been drawn
fig.savefig(rd.outfile(output_path/fig_name))
fig.savefig(rd.outfile(rd.datadir/'manuscripts'/'2024_miR-iFFL'/'figures'/'links'/fig_name))

In [None]:
# Panel D: stochastic simulations. One row per moi value; left column shows output
# vs copy number, right column shows the output distribution.
subfig = subfigures['D']
rd.plot.adjust_subplot_margins_inches(subfig, left=1, bottom=0.3, top=0.3, right=0.1)
axes = subfig.subplots(3,2, gridspec_kw=dict(width_ratios=(0.8,1), wspace=0.2, hspace=0.4))

plot_df = data_sim[(data_sim['risc']==10000) & (data_sim['design']<=1)]
# Per-row tick positions; there is one entry per subplot row
# NOTE(review): assumes plot_df contains exactly three moi values, in this row order -- confirm
yticks = [[0,1e3,2e3], [0,2e3,4e3,6e3], [0,2e3,4e3,6e3]]
hist_xticks = [[0,300,600,900,1200,1500], [0,1e3,2e3,3e3], [0,1e3,2e3,3e3,4e3,5e3]]


def gmean_gstd_interval(x):
    """Return (gmean / gstd, gmean * gstd) of `x`, used as the seaborn `errorbar` callable."""
    center = sp.stats.gmean(x)
    spread = sp.stats.gstd(x)
    return (center / spread, center * spread)


# Keyword arguments shared by the solid (two-gene) and dashed (1T) curves
line_kwargs = dict(x='copy_num', y='output', hue='kind', palette=kind_palette,
                   legend=False, dashes=False, style='kind', markers=kind_markers, markersize=4, markeredgewidth=0.5,
                   estimator=sp.stats.gmean, errorbar=gmean_gstd_interval)
kde_kwargs = dict(x='output', hue='kind', palette=kind_palette, legend=False, common_norm=False,
                  bw_adjust=3)

for i,moi in enumerate(plot_df['moi'].unique()):

    # line plot
    ax = axes[i,0]

    # drop data with only one point at a given copy number (for this plot):
    # for moi 3, '2V' rows are limited to copy_num <= 10; all other rows are kept
    d = plot_df[(plot_df['moi']==moi)]
    if moi==3: d = d[((d['copy_num']<=10) & (d['gene']=='2V')) | (d['gene']!='2V')]

    # Two-gene architectures are drawn solid, the single-transcript ('1T') design dashed
    d_two_gene = d[d['gene']!='1T']
    d_one_gene = d[d['gene']=='1T']

    sns.lineplot(data=d_two_gene, ax=ax, **line_kwargs)
    sns.lineplot(data=d_one_gene, ax=ax, ls='--', **line_kwargs)
    sns.despine(ax=ax)
    ax.minorticks_off()
    # y tick labels in thousands, e.g. 2000 -> '2k'
    ax.set(xlabel='', yticks=yticks[i], yticklabels=[f'{float(l)/1000:.0f}k' for l in yticks[i]],)

    if i==2: ax.set(xlabel='copy #')

    # histogram
    ax = axes[i,1]
    sns.kdeplot(data=d_two_gene, ax=ax, **kde_kwargs)
    sns.kdeplot(data=d_one_gene, ax=ax, ls='--', **kde_kwargs)
    sns.despine(ax=ax, left=True)

    ax.set(xlabel='', xlim=(hist_xticks[i][0], hist_xticks[i][-1]), xticks=hist_xticks[i], ylabel='', yticklabels=[],)

    # x tick labels in thousands, built directly from the tick positions (as done
    # for the y ticks above) rather than parsed back from the Text objects, whose
    # strings are not guaranteed to be populated before the figure is drawn.
    # The first row has sub-1000 ticks, so it keeps one decimal place.
    decimals = 1 if i==0 else 0
    xticklabels_k = [f'{tick/1000:.{decimals}f}' for tick in hist_xticks[i]]
    xticklabels_k[-1] += 'k'  # unit suffix on the last label only
    ax.set(xticklabels=xticklabels_k)
    ax.get_yaxis().set_visible(False)
    if i==2: ax.set(xlabel='protein')

for ax in axes.flatten(): ax.minorticks_off()

# Re-save the figure now that this panel has been drawn
fig.savefig(rd.outfile(output_path/fig_name))
fig.savefig(rd.outfile(rd.datadir/'manuscripts'/'2024_miR-iFFL'/'figures'/'links'/fig_name))