# Set up

### Gate data in FlowJo

See `flow-tutorial.ipynb` for a brief description. You should now have `.csv` files (one per sample/well) in Smithsonian.

### Set up repository

See `flow-tutorial.ipynb` for more information.

In [None]:
# Import all our favorite packages
import matplotlib       
import matplotlib.pyplot as plt
import numpy as np      
import pandas as pd     
import rushd as rd      
import scipy as sp      
import seaborn as sns   

In [None]:
# Path to example data
# call it 'base_path' because there are several plates (several folders of .csv files)
# NOTE(review): rd.datadir is presumably the data root configured for rushd -- confirm
base_path = rd.datadir/'data'/'attune'/'kasey'/'2024.04.05_exp89'/'export'

# Path to directory to save analysis outputs
# (the figures and the data cache written later in this notebook go under this directory)
output_path = rd.rootdir/'output'/'flow-example'

### Create a `.yaml` file with well metadata 

See `flow-tutorial.ipynb` for more information.

Contents of one `.yaml` file for this experiment:

```yaml
metadata:
  construct:
    - RC124: A1-H1
    - RC125: A2-H2
    - RC126: A3-H3
    - RC127: A4-H4
    - RC128: A5-H5
    - RC129: A6-H6
    - RC130: A7-H7
    - RC131: A8-H8
    - RC132: A9-H9
    - RC133: A10-H10
    - RC134: A11-H11
    - RC135: A12-H12
  dox:
    - 0: A1-D12
    - 1000: E1-H12
  cell:
    - 293T: A1-H12
  biorep:
    - 1: A1-H12
  replicate:
    - 1: A1-A12, E1-E12
    - 2: B1-B12, F1-F12
    - 3: C1-C12, G1-G12
    - 4: D1-D12, H1-H12
```

In [None]:
# Visualize well metadata
# Only the first plate's .yaml file is plotted here, as a quick sanity check
# of the well-to-condition mapping before loading any data.
yaml_path = base_path/'kasey_yaml'/'plate01.yaml'
rd.plot.plot_well_metadata(yaml_path)

# Load data

### Set up plates to load

In [None]:
# Describe every plate to load, one row per plate:
#   data_path: folder holding that plate's exported .csv files
#   yaml_path: .yaml file with that plate's well metadata
#   plate:     integer plate ID (0 is the control plate)
data_dirs = ['293T_control', '293T_plate1', '293T_plate2', '293T_plate3']
yaml_files = ['kasey_yaml/plate_control.yaml']
yaml_files += [f'kasey_yaml/plate0{number}.yaml' for number in range(1, 4)]

plates = pd.DataFrame({
    'data_path': data_dirs,
    'yaml_path': yaml_files,
    'plate': range(4),
})

display(plates)

In [None]:
# Confirm that all the .yaml files look good:
# draw the well layout of each distinct metadata file exactly once
for yaml_file in plates['yaml_path'].drop_duplicates():
    rd.plot.plot_well_metadata(base_path / yaml_file)

### Load using `rushd` and save to local cache

In [None]:
# Specify which channels to load
# (raw channel names as exported; they are given friendlier aliases later in the notebook)
channel_list = ['mRuby2-A','mGL-A']

# Load data
# `plates` supplies the per-plate 'data_path' and 'yaml_path' entries.
# NOTE(review): those paths are presumably resolved relative to base_path by the loader -- confirm
data = rd.flow.load_groups_with_metadata(plates, base_path, columns=channel_list)

display(data)

In [None]:
# Path to local cache of data
# (for faster loading later, not required)
# Note: the cache is written and read in Parquet format even though the
# file name ends in '.gzip'.
cache_path = output_path/'data.gzip'

if cache_path.is_file():
    # Cache hit: read the previously saved table instead of re-parsing the .csv files
    data = pd.read_parquet(cache_path)
else:
    # Cache miss: load from datadir, then save a copy for next time.
    # channel_list is re-declared so this cell also works when the direct-load
    # cell above was skipped.
    channel_list = ['mRuby2-A','mGL-A']
    data = rd.flow.load_groups_with_metadata(plates, base_path, columns=channel_list)
    data.to_parquet(rd.outfile(cache_path))

### Add condition-level metadata

For instance, information about plasmids based on plasmid ID.

In [None]:
# Load metadata
# One table of condition-level annotations, read from the repository's inputs folder.
# It is joined onto the flow data by its 'construct' column in the next cell.
metadata_path = rd.rootdir/'inputs'/'plasmid-metadata.csv'
metadata = pd.read_csv(metadata_path)
display(metadata)

In [None]:
# Attach the condition-level metadata to every event.
# Left join on 'construct': all rows of `data` are kept, and constructs that
# are absent from the metadata table get missing values in the new columns.
data = pd.merge(data, metadata, on='construct', how='left')
display(data)

In [None]:
# Give the raw Attune channels interpretable names by copying each one
# into a new, descriptively named column.
channel_aliases = {
    'marker': 'mGL-A',      # delivery marker
    'output': 'mRuby2-A',   # circuit output
}
for alias, channel in channel_aliases.items():
    data[alias] = data[channel]

# Draw gates

### Set up for plotting

In [None]:
# Global plot appearance: 'ticks' style, 'talk' context, Helvetica Neue as the sans-serif font
font_rc = {'font.family': 'sans-serif', 'font.sans-serif': ['Helvetica Neue']}
sns.set_style('ticks')
sns.set_context('talk', rc=font_rc)

In [None]:
# Colors for plots whose hue is 'ts_kind'
main_palette = dict(
    na='black',  # base gene, untransfected
    NT='grey',   # OL circuit
    T='teal',    # CL circuit
)

# Colors for plots whose hue is the 'dox' value
dox_palette = {0: 'lightgrey', 1000: 'darkorange'}

# A modified continuous color palette, removing the hard-to-see yellow end:
# viridis is sampled at 256 evenly spaced points over [0, 0.82] only
viridis = matplotlib.colormaps['viridis']
truncated_colors = viridis(np.linspace(0, 0.82, 256))
no_yellow_viridis = matplotlib.colors.ListedColormap(truncated_colors)

### Gate transduced cells

In [None]:
# Compute one gate per channel per biorep: the 99.9th percentile of the
# 'UT' construct's events in that biorep
untransduced = data[data['construct'] == 'UT']
channel_quantiles = untransduced.groupby('biorep')[['marker', 'output']].apply(
    lambda cells: cells.quantile(0.999)
)
gates = channel_quantiles.reset_index()
display(gates)

In [None]:
# Plot the 2D marker/output density of the 'UT' events to confirm the gates look reasonable
x = 'marker'
y = 'output'

# Only strictly positive values can be drawn on log axes
is_untransduced = data.construct == 'UT'
is_positive = (data[x] > 0) & (data[y] > 0)
plot_df = data[is_untransduced & is_positive].groupby('biorep').sample(1000)

g = sns.displot(
    data=plot_df, x=x, y=y, col='biorep', kind='kde',
    facet_kws=dict(margin_titles=True), log_scale=True, common_norm=False,
)

# Add reference lines corresponding to gates, one pair per biorep panel
for biorep, ax in g.axes_dict.items():
    biorep_gate = gates[gates.biorep == biorep]
    ax.axvline(biorep_gate[x].mean(), color='black', zorder=0)
    ax.axhline(biorep_gate[y].mean(), color='black', zorder=0)

g.figure.savefig(rd.outfile(output_path / 'gates_untransduced_marker-output.png'))

In [None]:
# Add missing gate for biorep 3
# The new row uses the mean of the existing gates for each channel.
# The row is appended at position len(gates.index), so re-running this cell
# appends another biorep 3 row each time.
gates.loc[len(gates.index)] = [3, gates['marker'].mean(), gates['output'].mean()] 
display(gates)

### Apply gates

This is probably not the optimal way to do this! Be on the lookout for a gating function in `rushd`.

In [None]:
# Gate on marker expression, separately for each biorep
def gate_data(df, gate_table=None):
    """Flag which cells of a single biorep pass that biorep's marker gate.

    Parameters
    ----------
    df : DataFrame
        Events from ONE biorep (e.g. one group of ``groupby('biorep')``).
        Must contain the columns 'biorep', 'construct' and 'mGL-A'.
    gate_table : DataFrame, optional
        Table with 'biorep' and 'marker' columns holding one gate per biorep.
        Defaults to the notebook-level ``gates`` table.

    Returns
    -------
    DataFrame
        A copy of ``df`` with an added boolean 'gated_marker' column that is
        True for events above the gate that are not from the 'UT' construct.

    Raises
    ------
    KeyError
        If ``gate_table`` has no row for this biorep.
    """
    if gate_table is None:
        gate_table = gates

    # All rows of df share one biorep, so the first row identifies the group
    biorep = df.biorep.values[0]
    thresholds = gate_table.loc[gate_table.biorep == biorep, 'marker']
    if thresholds.empty:
        raise KeyError(f'No marker gate defined for biorep {biorep!r}')
    gate = thresholds.values[0]

    # Work on the group that was passed in (not the full table), and return a
    # new frame so the caller's data is never modified in place
    return df.assign(gated_marker=(df['mGL-A'] > gate) & (df.construct != 'UT'))

# Flag gated cells biorep by biorep, then stitch the groups back into one table
per_biorep = data.groupby('biorep')[data.columns]
data = per_biorep.apply(gate_data).reset_index(drop=True)

# Keep events that pass the marker gate, have positive output, and are not 'UT'
passes_gate = data['gated_marker'] & (data['output'] > 0) & (data.construct != 'UT')
data_gated = data[passes_gate].copy()
display(data_gated)

In [None]:
# Alternatively, gate every biorep with one shared threshold:
# the mean of the per-biorep marker gates
shared_gate = gates['marker'].mean()
above_gate = data['marker'] > shared_gate
keep = above_gate & (data['output'] > 0) & (data.construct != 'UT')
data_gated = data[keep].copy()
display(data_gated)

## Visualize distributions

In [None]:
# For each biorep: marker-vs-output density per construct, colored by dox,
# with that biorep's gates overlaid as reference lines
x = 'marker'
y = 'output'
for biorep, group in data.groupby('biorep'):

    # Positive values only (log axes), restricted to the two dox conditions
    keep = (group[x] > 0) & (group[y] > 0) & group.dox.isin([0, 1000])
    plot_df = group[keep].groupby(['construct', 'dox']).sample(1000)

    g = sns.displot(
        data=plot_df, x=x, y=y,
        col='construct', col_wrap=4,
        hue='dox', palette=dox_palette,
        kind='kde', fill=False, levels=7,
        log_scale=True, common_norm=False,
    )

    # Bioreps without a gate are skipped here, so their figure is not saved
    gate = gates[gates['biorep'] == biorep]
    if gate.empty:
        continue

    gate_x, gate_y = gate[x].values[0], gate[y].values[0]
    for ax in g.axes_dict.values():
        ax.axvline(gate_x, color='black', zorder=0)
        ax.axhline(gate_y, color='black', zorder=0)

    g.figure.savefig(rd.outfile(output_path / f'kde_marker-output_biorep{biorep}.png'))

In [None]:
# Marker distribution of gated cells at dox == 1000, one panel per biorep,
# one curve per construct (1000 sampled events per construct)
x = 'marker'
induced = data_gated[data_gated.dox == 1000]
plot_df = induced.groupby('construct').sample(1000)

g = sns.displot(
    data=plot_df, x=x,
    col='biorep', hue='construct',
    kind='kde', fill=False,
    log_scale=True, common_norm=False,
)

g.figure.savefig(rd.outfile(output_path / f'hist_{x}_by-biorep.png'))

In [None]:
# Output distribution of gated cells at dox == 1000, one panel per biorep,
# one curve per construct (1000 sampled events per construct)
x = 'output'
is_induced = data_gated.dox == 1000
plot_df = data_gated[is_induced].groupby('construct').sample(1000)

kde_options = dict(kind='kde', log_scale=True, common_norm=False, fill=False)
g = sns.displot(data=plot_df, x=x, col='biorep', hue='construct', **kde_options)

g.figure.savefig(rd.outfile(output_path / f'hist_{x}_by-biorep.png'))

In [None]:
# List the metadata column names, i.e. the annotations available for grouping and faceting plots
display(metadata.columns)

In [None]:
# One row per construct (its first occurrence), showing its name and group
one_per_construct = data.drop_duplicates('construct')
display(one_per_construct[['construct', 'name', 'group']])

In [None]:
# Output distribution of the 'controller' constructs in biorep 2 at dox == 1000,
# one panel per 'design', colored by 'ts_kind'
x = 'output'

# Every term of the row mask is taken from data_gated itself, so the mask
# shares data_gated's index and selects exactly the intended rows
is_selected = (data_gated.dox==1000) & (data_gated.biorep==2) & (data_gated.group=='controller')
plot_df = data_gated[is_selected].groupby('construct').sample(1000)

g = sns.displot(data=plot_df, x=x, col='design', kind='kde',
                log_scale=True, common_norm=False, fill=False,
                hue='ts_kind', palette=main_palette)

g.figure.savefig(rd.outfile(output_path/f'hist_{x}_by-circuit.png'))

## Compute summary statistics

In [None]:
# Bin by marker quantiles
# Within each (construct, dox, biorep) group, split the events into up to
# num_bins equal-count bins of marker expression; duplicates='drop' merges
# bins whose edges coincide, so a group can end up with fewer bins.
by = ['construct','dox','biorep']
num_bins = 15
data_gated['bin_marker_quantiles'] = data_gated.groupby(by)['marker'].transform(lambda x: pd.qcut(x, q=num_bins, duplicates='drop'))
# Median marker value of each bin, attached to every event in that bin
quantiles = data_gated.groupby(by+['bin_marker_quantiles'])['marker'].median().rename('bin_marker_quantiles_median').reset_index()
# NOTE(review): re-running this cell merges the median column in a second time -- confirm this cell is run only once
data_gated = data_gated.merge(quantiles, how='left', on=by+['bin_marker_quantiles'])
display(data_gated)

In [None]:
# Plot quantiles as a line
#  x: bin median (marker)
#  y: gmean (output)
#  error: gmean/gstd to gmean*gstd
# Rows: 'controller' constructs at dox == 1000. Every term of the row mask is
# taken from data_gated itself so that the mask shares data_gated's index.
plot_df = data_gated[(data_gated.dox==1000) & (data_gated.group=='controller')]
g = sns.relplot(data=plot_df, x='bin_marker_quantiles_median', y='output', 
                hue='ts_kind', palette=main_palette,
                row='biorep', col='design', facet_kws=dict(margin_titles=True),
                kind='line', marker='o', estimator=sp.stats.gmean, 
                errorbar=lambda x: (sp.stats.gmean(x) / sp.stats.gstd(x), sp.stats.gmean(x) * sp.stats.gstd(x)))
g.set(xscale='log', yscale='log')

In [None]:
# Compute summary stats, per condition and per marker bin
by = ['construct','dox','biorep']
by_bin = by + ['bin_marker_quantiles']
stat_list = [sp.stats.gmean, np.std, sp.stats.variation]

# Per-condition stats; the two-level (channel, statistic) column labels are
# flattened to names such as 'output_gmean'
stats = data_gated.groupby(by=by)[['marker','output']].agg(stat_list).reset_index().dropna()
stats.columns = ['_'.join(c).rstrip('_') for c in stats.columns.to_flat_index()]

# Same stats, computed within each marker bin of each condition
grouped_bins = data_gated.groupby(by=by_bin)
stats_quantiles = grouped_bins[['marker','output']].agg(stat_list).reset_index().dropna()
stats_quantiles.columns = ['_'.join(c).rstrip('_') for c in stats_quantiles.columns.to_flat_index()]

# Median marker value of each bin, computed from the events in that bin and
# joined on the group keys so that every bin row gets its own median
bin_medians = grouped_bins['marker'].median().rename('bin_marker_quantiles_median').reset_index()
stats_quantiles = stats_quantiles.merge(bin_medians, how='left', on=by_bin)

display(stats)

In [None]:
# Inspect the per-bin summary statistics computed in the previous cell
display(stats_quantiles)

In [None]:
def get_slope(df, x='bin_marker_quantiles_median_log', y='output_gmean_log'):
    """Fit a straight line of ``df[y]`` against ``df[x]``.

    Parameters
    ----------
    df : DataFrame
        Table containing the two columns to regress.
    x, y : str
        Names of the predictor and response columns.

    Returns
    -------
    DataFrame
        A single row (index 0) with the columns 'slope', 'intercept',
        'r_value', 'p_value' and 'stderr' from ``scipy.stats.linregress``.
    """
    fields = ['slope', 'intercept', 'r_value', 'p_value', 'stderr']
    # The regression result unpacks to exactly these five values, in this order
    fit = tuple(sp.stats.linregress(df[x], df[y]))
    return pd.DataFrame([fit], columns=fields, index=[0])

# Fit in log-log space: add log10 versions of the bin median and the output gmean
for column in ('bin_marker_quantiles_median', 'output_gmean'):
    stats_quantiles[f'{column}_log'] = np.log10(stats_quantiles[column])

# Calculate slope: one linear fit per (construct, dox, biorep) across its
# marker bins, then attach the fit results to the per-condition stats
per_condition = stats_quantiles.groupby(by)[stats_quantiles.columns]
fits = per_condition.apply(get_slope).reset_index()
stats = pd.merge(stats, fits, how='left', on=by)

display(stats)