In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import h5py
import pandas as pd
import numpy as np

from pathlib import Path
import re

import itertools
# Notebook-wide plotting defaults: the 'ticks' axes style and the 'talk'
# context, with sans-serif text resolved to Helvetica Neue.
sns.set_style('ticks')
sns.set_context(
    'talk',
    rc={
        'font.family': 'sans-serif',
        'font.sans-serif': ['Helvetica Neue'],
    },
)

In [None]:
# Hex colour assigned to each condition label (the values stored in the
# 'condition' column of the datasets).
main_condition_palette = dict([
    ('1-gene', '#808080'),
    ('2-gene.Tandem', '#A58245'),
    ('3-gene.Tandem', '#F1BE64'),
    ('2-gene.Divergent', '#18B7CE'),
    ('3-gene.Divergent', '#1DE2FF'),
    ('2-gene.Convergent', '#AC4140'),
    ('3-gene.Convergent', '#F85D5C'),
    ('EGFP', '#29f43d'),
    ('mRuby2', '#f42d29'),
])

# Same colours with '30' appended to the hex string (read as #RRGGBBAA this is
# a low alpha value); used for facet backgrounds.
light_background_palette = {
    condition: f'{hex_color}30'
    for condition, hex_color in main_condition_palette.items()
}

# Row order for the n-gene conditions; the EGFP/mRuby2 entries of the palette
# are not part of this ordering.
plot_order = [
    '1-gene',
    '2-gene.Tandem',
    '3-gene.Tandem',
    '2-gene.Divergent',
    '3-gene.Divergent',
    '2-gene.Convergent',
    '3-gene.Convergent',
]

In [None]:
def dox_string_to_float(s):
    """Convert a dox-concentration token (as found in file names) to a float.

    Accepted spellings (surrounding whitespace is ignored):
      * '0'                 -> 0.0
      * 'E-1.5' / 'e-1.5'   -> 10 ** -1.5 (bare exponent, base of 1 implied)
      * '1e-1.5', '2E3'     -> BASE * 10 ** EXP; unlike float(), EXP may be
                               fractional
      * '0.5', '0.0', '3'   -> plain decimal numbers

    Args:
        s: the dox token as a string.

    Returns:
        The concentration as a float.

    Raises:
        RuntimeError: if the token matches none of the forms above, including
            malformed numbers such as 'E' or '1e2e3'.
    """
    token = s.strip()
    try:
        # Handle '0'
        if token == '0':
            return 0.0
        # Handle E-1.5 and friends (either case of the leading 'e')
        if token[:1] in ('E', 'e'):
            return 10**float(token[1:])
        # Handle BASEeEXP (1e-1.5)
        e_split = token.lower().split('e')
        if len(e_split) == 2:
            return float(e_split[0]) * (10**float(e_split[1]))
        # No exponent marker at all: plain decimal such as '0.5'
        if len(e_split) == 1:
            return float(token)
    except (ValueError, OverflowError) as err:
        # Surface every parse failure as the one documented error type rather
        # than leaking float()'s ValueError for inputs like 'E' or 'abc'.
        raise RuntimeError(f"Unable to convert dox string: '{s}' to float.") from err
    raise RuntimeError(f"Unable to convert dox string: '{s}' to float.")

### Initial data import and postprocessing

In [None]:
# Import our four datasets (first transfection, single-colour controls,
# cell-type transfection, PiggyBac integration). Each CSV is one exported
# sample; the metadata (cell type, condition, dox level, replicate) is encoded
# in the file name and recovered with the regexes below.
n_gene_regex = re.compile(r'^293T\.(?P<condition>\d-gene(?:\.\w+)?)\.(?P<dox>.*)-dox\.(?P<rep>\d)-rep_Single Cells')
control_regex = re.compile(r'^293T\.(?P<condition>[^.]+)\.(?P<rep>\d)-rep_Single Cells')
cell_transfection_regex = re.compile(r'^export_(?P<celltype>[^.]+)\.(?P<condition>\dg\.\w+)\.(?P<dox>.*)dox\.r(?P<rep>\d)_Single Cells')
# NOTE(review): the '.' in '(.\d)?' is unescaped, so it matches any character
# rather than only a literal dot - confirm against the real file names before
# tightening it.
pb_regex = re.compile(r'^export_(?P<celltype>[^.]+)\.(?P<condition>(\d[gc]\.\w+|\w+))(.\d)?\.(?P<dox>.*)dox\.r(?P<rep>\d)_Single Cells')

first_transfection_dir = Path('../../output/flow/2021.05.07-T2.0/')
cell_transfection_dir = Path('../../output/flow/2021.10.18-transfection/export/')
pb_dir = Path('../../output/flow/2021.11.07-flow_tangles_integration/')


def _concat_matched(frames, label, directory):
    """Concatenate per-file frames, failing with a readable message if none were loaded."""
    if not frames:
        raise ValueError(
            f"No {label} CSV files matched in '{directory}' - check the path and the file naming."
        )
    return pd.concat(frames, ignore_index=True)


first_transfection_frames = []
control_frames = []
cell_transfection_frames = []
pb_frames = []

# The n-gene samples and the controls live in the same directory, so scan it
# once and test each file name against both patterns.
for file in first_transfection_dir.glob('*.csv'):
    exp_match = n_gene_regex.match(file.stem)
    if exp_match is not None:
        frame = pd.read_csv(file)
        frame['celltype'] = '293T'
        frame['condition'] = exp_match.group('condition')
        frame['dox'] = dox_string_to_float(exp_match.group('dox'))
        frame['replicate'] = int(exp_match.group('rep'))
        first_transfection_frames.append(frame)
    control_match = control_regex.match(file.stem)
    if control_match is not None:
        # The replicate number is matched by the regex but not stored for controls.
        frame = pd.read_csv(file)
        frame['celltype'] = '293T'
        frame['condition'] = control_match.group('condition')
        control_frames.append(frame)

for file in cell_transfection_dir.glob('*.csv'):
    match = cell_transfection_regex.match(file.stem)
    if match is not None:
        frame = pd.read_csv(file)
        # Only exports whose mRuby2-A column parsed as int64 are used; report
        # the ones that are dropped instead of skipping them silently.
        if frame['mRuby2-A'].dtype != np.int64:
            print(f"Skipping '{file.name}': 'mRuby2-A' has dtype {frame['mRuby2-A'].dtype}, expected int64")
            continue
        frame['celltype'] = match.group('celltype')
        frame['condition'] = match.group('condition').replace('g.','-gene.')
        frame['dox'] = dox_string_to_float(match.group('dox'))
        frame['replicate'] = int(match.group('rep'))
        cell_transfection_frames.append(frame)

for file in pb_dir.glob('*.csv'):
    match = pb_regex.match(file.stem)
    if match is not None:
        frame = pd.read_csv(file)
        frame['celltype'] = match.group('celltype')
        # Fixup condition: 'Nc.xxx' is rewritten to 'Ng.xxx' before the common
        # 'g.' -> '-gene.' expansion. Slicing (rather than indexing) keeps a
        # one-character condition from raising IndexError.
        condition = match.group('condition')
        if condition[1:2] == 'c':
            condition = condition[0] + 'g' + condition[2:]
        frame['condition'] = condition.replace('g.','-gene.')
        frame['dox'] = dox_string_to_float(match.group('dox'))
        frame['replicate'] = int(match.group('rep'))
        pb_frames.append(frame)

first_transfection_dataset = _concat_matched(first_transfection_frames, 'first-transfection', first_transfection_dir)
control_dataset = _concat_matched(control_frames, 'control', first_transfection_dir)
cell_transfection_dataset = _concat_matched(cell_transfection_frames, 'cell-type transfection', cell_transfection_dir)
pb_dataset = _concat_matched(pb_frames, 'PiggyBac', pb_dir)

In [None]:
# Postprocess: gate every dataset on iRFP670-A > 2e2; the cell-type
# transfection data is additionally gated on mRuby2-A.
#
# The mRuby2 gate is the value found 95% of the way through the sorted
# mRuby2-A readings of iRFP670-gated cells from the 'iRFP670' control.
irfp_control_cells = control_dataset[
    (control_dataset['condition'] == 'iRFP670')
    & (control_dataset['iRFP670-A'] > 2e2)
]
mRuby2_control_intensity = sorted(irfp_control_cells['mRuby2-A'])
gate_index = int(len(mRuby2_control_intensity) * 0.95)
mRuby2_gate = mRuby2_control_intensity[gate_index]

passes_irfp_gate = cell_transfection_dataset['iRFP670-A'] > 2e2
passes_mRuby2_gate = cell_transfection_dataset['mRuby2-A'] > mRuby2_gate
gated_transfection = cell_transfection_dataset[passes_irfp_gate & passes_mRuby2_gate].copy()

gated_initial_dataset = first_transfection_dataset.loc[
    first_transfection_dataset['iRFP670-A'] > 2e2
].copy()
gated_pb = pb_dataset.loc[pb_dataset['iRFP670-A'] > 2e2].copy()

In [None]:
# Quick check: the distinct condition labels present after gating.
gated_initial_dataset['condition'].unique()

### PB integration efficiency analysis

In [None]:
# PiggyBac integration: per-replicate fraction of cells above the marker
# threshold (top panel) and the log10 marker distribution (bottom panel).
sns.set_style('ticks')
fig, axes = plt.subplots(2,1,sharex=True, figsize=(8,8))
pb_order = [
    '3-gene.Tandem',
    '2-gene.Convergent', '3-gene.Convergent',
    '3-gene.Divergent',
    'EGFP','mRuby2'
]
integration_threshold = 2e2

pb_int_dataset = pb_dataset.copy()
# iRFP670 is the integration marker by default; the EGFP and mRuby2 conditions
# use their own channel instead.
pb_int_dataset['Integration Marker'] = pb_int_dataset['iRFP670-A']
is_egfp = pb_int_dataset['condition'] == 'EGFP'
is_mruby2 = pb_int_dataset['condition'] == 'mRuby2'
pb_int_dataset.loc[is_egfp, 'Integration Marker'] = pb_int_dataset.loc[is_egfp, 'eGFP-A']
pb_int_dataset.loc[is_mruby2, 'Integration Marker'] = pb_int_dataset.loc[is_mruby2, 'mRuby2-A']
# Drop non-positive values so log10 is defined.
pb_int_dataset = pb_int_dataset[pb_int_dataset['Integration Marker'] > 0].copy()
pb_int_dataset['log Integration Marker'] = np.log10(pb_int_dataset['Integration Marker'])

# Aggregate only the marker column. Applying the comparison to every column
# also hits the string 'celltype' column, which raises TypeError on
# pandas >= 2.0 (older versions silently dropped such columns).
integration_efficiency = (
    pb_int_dataset
    .groupby(['condition','replicate'])['Integration Marker']
    .aggregate(lambda x: np.mean(x > integration_threshold))
    .reset_index()
)
integration_efficiency['efficiency'] = integration_efficiency['Integration Marker']

sns.stripplot(x='condition',y='efficiency',order=pb_order,
    data=integration_efficiency, palette=main_condition_palette, ax=axes[0])
sns.despine()
sns.violinplot(x='condition', y='log Integration Marker', order=pb_order, palette=main_condition_palette,
    data=pb_int_dataset, ax=axes[1])
# Horizontal line marking the integration threshold on the log axis.
axes[1].plot([-0.35,5.35], [np.log10(integration_threshold),np.log10(integration_threshold)], 'k')
sns.despine()
axes[0].set_xlabel('')
# NOTE(review): no axis= is given, so the rotation applies to the y tick
# labels as well as the x ones - confirm that is intended.
axes[1].tick_params(rotation=90)
axes[0].set_yticks(np.arange(0,0.30,0.05))
axes[1].set_yticks(np.arange(0,6,1))
axes[0].set_ylabel('Integration Efficiency')
fig.align_ylabels(axes)
plt.savefig('../../img/t2.0/piggybac_integration.svg', bbox_inches='tight')
plt.show()

### Trend analysis: celltypes

In [None]:
# mRuby2 response to dox per condition (rows) and cell type (columns).
# Keep events with signal > 1 in both reporters so log10 is well behaved.
plot_df = gated_transfection[(gated_transfection['mRuby2-A'] > 1) & (gated_transfection['eGFP-A'] > 1)].copy()
# Keep the uninduced sample plus every other dox level. The levels are sorted
# first: unique() returns them in order of appearance, which depends on the
# order the files were read, whereas the legend labels below assume the
# selection 10^-3.5, 10^-2.5, 10^-1.5, 10^-0.5.
filter_set = np.sort(plot_df['dox'].unique())[1::2]
plot_df = plot_df[(plot_df['dox'] == 0) | plot_df['dox'].isin(filter_set)]
# Filter out the EGFP/mRuby2 conditions (n-gene labels have '-' as 2nd character);
# copy so the column assignments below do not target a slice.
plot_df = plot_df[plot_df['condition'].str[1] == '-'].copy()
plot_df['log mRuby2-A'] = np.log10(plot_df['mRuby2-A'])
plot_df['log eGFP-A'] = np.log10(plot_df['eGFP-A'])
present_conditions = set(plot_df['condition'].unique())
subsetted_order = [x for x in plot_order if x in present_conditions]

sns.set_style('ticks')
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was removed in Matplotlib 3.9.
grid = sns.FacetGrid(plot_df, row='condition', col='celltype',
        hue='dox', palette=plt.get_cmap('viridis',6)(range(6)),
        row_order=subsetted_order, margin_titles=True, despine=False, aspect=3, height=1.8)
grid.map(sns.kdeplot, 'log mRuby2-A', linewidth=3.5)
grid.fig.subplots_adjust(wspace=0, hspace=0)
for (row_val, col_val), ax in grid.axes_dict.items():
    # Tint each facet with its condition's light background colour.
    ax.set_facecolor(light_background_palette[row_val])
    ax.set_yticks([0.0, 0.5])
# Adjustment from https://cduvallet.github.io/posts/2018/11/facetgrid-ylabel-access
# Replace the 'condition = X' margin titles with just 'X'.
for axis in grid.axes.flat:
    axis.set_ylabel('Density')
    if axis.texts:
        txt = axis.texts[0]
        axis.text(*txt.get_unitless_position(),
                txt.get_text().split('=')[1].strip(),
                transform=axis.transAxes,
                va='center')
        axis.texts[0].remove()
# Create the legend once and relabel that same object. Calling plt.legend()
# a second time builds a new legend and discards the bbox_to_anchor placement.
legend = plt.legend(bbox_to_anchor=(-1, 2))
for text, s in zip(legend.get_texts(), ['Uninduced', '10^-3.5 dox', '10^-2.5 dox', '10^-1.5 dox', '10^-0.5 dox']):
    text.set_text(s)
plt.savefig('../../img/t2.0/celltype_tangles_mRuby2_response.svg', bbox_inches='tight')
plt.show()

In [None]:
# Bivariate KDE of EGFP vs mRuby2 for the '3-gene.Divergent' condition,
# comparing uninduced cells (dox == 0) with the highest dox level.
# This cell reuses (and adds columns to) the plot_df left by an earlier cell.
sns.set_style('ticks')
sns.set_context('paper')

# Shift the log intensities by fixed offsets (2 for mRuby2, 1.2 for EGFP).
plot_df['recentered_mRuby2'] = plot_df['log mRuby2-A'] - 2
plot_df['recentered_EGFP'] = plot_df['log eGFP-A'] - 1.2

max_dox = np.max(plot_df.dox)
is_divergent = plot_df.condition == '3-gene.Divergent'
is_extreme_dox = (plot_df.dox == 0.0) | (plot_df.dox == max_dox)

g = sns.jointplot(
    data=plot_df[is_divergent & is_extreme_dox],
    x='recentered_EGFP',
    y='recentered_mRuby2',
    hue='dox',
    fill=True,
    kind='kde',
    alpha=0.5,
    legend=None,
    palette={0.0: '#555555', max_dox: '#25ada3'},
)
# Drop the top marginal axis; the right-hand marginal is kept.
g.ax_marg_x.remove()
g.ax_joint.set_xlabel('Exp. log EGFP')
g.ax_joint.set_ylabel('Exp. log mRuby2')

current_fig = plt.gcf()
current_fig.set_figwidth(2)
current_fig.set_figheight(1)
sns.despine()
plt.savefig('../../img/experimental_divergent_transfection.svg', bbox_inches='tight')
plt.show()

### Trend analysis: PiggyBac

In [None]:
# PiggyBac-integrated cells: mRuby2 density per condition, followed by
# eGFP-vs-mRuby2 dot plots per condition and dox level.
# Keep events with signal > 1 in both reporters so log10 is well behaved.
plot_df = gated_pb[(gated_pb['mRuby2-A'] > 1) & (gated_pb['eGFP-A'] > 1)].copy()
# Keep the uninduced sample plus every other dox level. The levels are sorted
# first: unique() returns them in order of appearance, which depends on the
# order the files were read, whereas the legend labels below assume the
# selection 10^-3.5, 10^-2.5, 10^-1.5, 10^-0.5.
filter_set = np.sort(plot_df['dox'].unique())[1::2]
plot_df = plot_df[(plot_df['dox'] == 0) | plot_df['dox'].isin(filter_set)]
# Filter out the EGFP/mRuby2 conditions (n-gene labels have '-' as 2nd character);
# copy so the column assignments below do not target a slice.
plot_df = plot_df[plot_df['condition'].str[1] == '-'].copy()
plot_df['log mRuby2-A'] = np.log10(plot_df['mRuby2-A'])
plot_df['log eGFP-A'] = np.log10(plot_df['eGFP-A'])
present_conditions = set(plot_df['condition'].unique())
subsetted_order = [x for x in plot_order if x in present_conditions]

sns.set_style('ticks')
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was removed in Matplotlib 3.9.
grid = sns.FacetGrid(plot_df, row='condition',
        hue='dox', palette=plt.get_cmap('viridis',6)(range(6)),
        row_order=subsetted_order, margin_titles=True, despine=False, aspect=3, height=1.8)
grid.map(sns.kdeplot, 'log mRuby2-A', linewidth=3.5)
grid.fig.subplots_adjust(wspace=0, hspace=0)
# Only a row variable is faceted here, so axes_dict is keyed by the row value alone.
for row_val, ax in grid.axes_dict.items():
    ax.set_facecolor(light_background_palette[row_val])
# Adjustment from https://cduvallet.github.io/posts/2018/11/facetgrid-ylabel-access
# Replace the 'condition = X' margin titles with just 'X'.
for axis in grid.axes.flat:
    axis.set_ylabel('Density')
    if axis.texts:
        txt = axis.texts[0]
        axis.text(*txt.get_unitless_position(),
                txt.get_text().split('=')[1].strip(),
                transform=axis.transAxes,
                va='center')
        axis.texts[0].remove()
# Create the legend once and relabel that same object. Calling plt.legend()
# a second time builds a new legend and discards the bbox_to_anchor placement.
legend = plt.legend(bbox_to_anchor=(-1, 2))
for text, s in zip(legend.get_texts(), ['Uninduced', '10^-3.5 dox', '10^-2.5 dox', '10^-1.5 dox', '10^-0.5 dox']):
    text.set_text(s)
plt.savefig('../../img/t2.0/pb_integration_mRuby2_response.svg', bbox_inches='tight')
plt.show()

sns.set_style('whitegrid')
# Single parameter set for now; the loop allows further
# (column prefix, column suffix, xlim, ylim) variants to be added.
for log_str, normed_str, xlim, ylim in [
        ('log ', '-A', None, None)
        ]:
    kwargs = {}
    if xlim is not None:
        kwargs['xlim'] = xlim
    if ylim is not None:
        kwargs['ylim'] = ylim
    grid = sns.FacetGrid(plot_df, col='dox', row='condition',
            row_order=subsetted_order,
            margin_titles=True, despine=False, **kwargs)
    grid.map(sns.scatterplot,
        log_str + 'eGFP' + normed_str,
        log_str + 'mRuby2' + normed_str, color='#303030', alpha=0.4, s=10)
    grid.fig.subplots_adjust(wspace=0, hspace=0)
    # Rewrite the 'dox = <value>' column titles on the top row.
    for axis in grid.axes[0]:
        dox = float(axis.get_title().split('=')[1].strip())
        if dox == 0:
            axis.set_title('Uninduced')
        else:
            # ':g' hides floating-point noise in the exponent
            # (e.g. -3.4999999999999996 is shown as -3.5).
            axis.set_title('10^{:g} induction'.format(np.log10(dox)))
    # Adjustment from https://cduvallet.github.io/posts/2018/11/facetgrid-ylabel-access
    for axis in grid.axes.flat:
        if axis.texts:
            txt = axis.texts[0]
            axis.text(*txt.get_unitless_position(),
                    txt.get_text().split('=')[1].strip(),
                    transform=axis.transAxes,
                    va='center')
            axis.texts[0].remove()

    for (row_val,col_val), ax in grid.axes_dict.items():
        ax.set_facecolor(light_background_palette[row_val])
    plt.savefig('../../img/t2.0/pb_integration_dotplot.png', bbox_inches='tight', dpi=300)
    plt.show()

### Resource competition
Plot the behavior of the iRFP signal as a function of induction level in the integrated and non-integrated cases.

In [None]:
# Change in median iRFP670 signal relative to the uninduced sample, per
# delivery method (rows) and condition (columns), in 293T cells.
sns.set_style("ticks")
# Conditions compared here; also the column order of the grid. Listed
# explicitly so this cell does not depend on pb_order from another cell.
rc_conditions = ['3-gene.Tandem', '3-gene.Divergent', '2-gene.Convergent', '3-gene.Convergent']
combined_resource_competition = pd.concat(
    [gated_transfection.assign(method='plasmid'), gated_pb.assign(method='piggybac')],
    ignore_index=True)
combined_resource_competition = combined_resource_competition[combined_resource_competition['celltype']=='293T']
combined_resource_competition = combined_resource_competition[combined_resource_competition['condition'].isin(rc_conditions)]
# Despite the name, this holds per-group medians. numeric_only=True keeps the
# string columns (e.g. 'celltype') out of the reduction, which would otherwise
# raise TypeError on pandas >= 2.0.
mean_data = combined_resource_competition.groupby(['method','condition','dox','replicate']).median(numeric_only=True)
# Subtract each (method, condition, replicate)'s uninduced (dox == 0) median;
# the two Series are aligned on their shared index levels.
rc_data = (mean_data['iRFP670-A'] - mean_data.xs(0.0, level='dox')['iRFP670-A']).reset_index()
# Small offset so the uninduced sample (dox == 0) has a finite log value (-3).
rc_data['log dox'] = np.log10(rc_data['dox'] + .001)
grid = sns.FacetGrid(data=rc_data,row='method', col='condition', margin_titles=True, col_order=rc_conditions)
grid.map(sns.scatterplot,'log dox', 'iRFP670-A', color='#999999')
grid.fig.subplots_adjust(wspace=0.1, hspace=0.1)
# Strip the 'condition = ' prefix from the column titles.
for axis in grid.axes[0]:
    axis.set_title(axis.get_title().split('=')[1].strip())
for axis in grid.axes.flat:
    # Zero line: no change relative to the uninduced sample.
    axis.plot([-3,-.2],[0,0], 'k')
    if axis.texts:
        txt = axis.texts[0]
        axis.text(*txt.get_unitless_position(),
                txt.get_text().split('=')[1].strip(),
                transform=axis.transAxes,
                va='center')
        axis.texts[0].remove()
# Saved under '../../img/t2.0/' like every other figure in this notebook
# (this cell previously used '../img/t2.0/').
plt.savefig('../../img/t2.0/resource_competition.svg', bbox_inches='tight')
plt.show()

In [None]:
# log10 iRFP670 distributions per dox level for the '3-gene.Divergent'
# condition in 293T cells, plasmid (top) vs PiggyBac (bottom). Drawn twice:
# without and with the red reference lines.
sns.set_style('ticks')
plot_df = pd.concat(
    [gated_transfection.assign(method='plasmid'), gated_pb.assign(method='piggybac')],
    ignore_index=True)
# Copy after filtering so the column assignment below does not target a slice.
plot_df = plot_df[(plot_df['celltype']=='293T') & (plot_df['condition'] == '3-gene.Divergent')].copy()
plot_df['log iRFP670-A'] = np.log10(plot_df['iRFP670-A'])
# Fix the category order explicitly: FacetGrid.map with a categorical plot and
# no `order` can place the same dox level at different x positions in
# different facets when a level is missing from one of them.
dox_levels = sorted(plot_df['dox'].unique())
for with_dashes in (False, True):
    grid = sns.FacetGrid(data=plot_df, row='method', margin_titles=True, row_order=['plasmid', 'piggybac'], height=4, aspect=2.5)
    grid.map(sns.violinplot,'dox', 'log iRFP670-A', order=dox_levels, color='#999999')
    # NOTE(review): the line heights and offsets are hard-coded per method;
    # presumably a reference level with a band around it - confirm their origin.
    for axis, line_height in zip(grid.axes.flat, [4.15, 3.41]):
        if with_dashes:
            axis.plot([0,7],[line_height,line_height], '#a31f34')
            axis.plot([0,7],[line_height + 0.096, line_height + 0.096], '#a31f34')[0].set_dashes([5,5])
            axis.plot([0,7],[line_height - 0.12, line_height - 0.12], '#a31f34')[0].set_dashes([5,5])
            axis.set_xlim([-0.5, 7.5])
        if axis.texts:
            # Replace the 'method = X' margin title with just 'X'.
            txt = axis.texts[0]
            axis.text(*txt.get_unitless_position(),
                    txt.get_text().split('=')[1].strip(),
                    transform=axis.transAxes,
                    va='center')
            axis.texts[0].remove()
    # Labels assume the eight dox levels 0 and 10^-3.5 ... 10^-0.5.
    grid.axes.flat[1].set_xticklabels(['0', '10^-3.5', '10^-3.0', '10^-2.5', '10^-2.0', '10^-1.5', '10^-1.0', '10^-0.5'])
    # Saved under '../../img/t2.0/' like every other figure in this notebook
    # (this cell previously used '../img/t2.0/').
    plt.savefig(f'../../img/t2.0/resource_competition_violins_{"dashes" if with_dashes else "clean"}.svg', bbox_inches='tight')
    plt.show()