In [1]:
import itertools as itt
import pathlib as pl
from configparser import ConfigParser

import numpy as np
import pandas as pd
import joblib as jl

import plotly.express as px
import plotly.graph_objects as go

from src.root_path import config_path

In [2]:
config = ConfigParser()
config.read_file(open(config_path / 'settings.ini'))
meta = {'reliability': 0.1,  # r value
        'smoothing_window': 0,  # ms
        'raster_fs': 30,
        'montecarlo': 1000,
        'zscore': True,
        'dprime_absolute': None,
        'stim_type': 'permutations',
        'alpha':0.05}
# todo, if batch analysis rerun, use the anotated line instead
# summary_DF_file = pl.Path(config['paths']['analysis_cache']) / f'211221_cxt_metrics_summary_DF_alpha_{meta}'
summary_DF_file = pl.Path(config['paths']['analysis_cache']) / '211221_cxt_metrics_summary_DF_alpha_0.05'

### same example cell as in figure 1 ###
prb_idx = 3 - 1# selected probe. the -1 is to acount for 0 not being used
ctx_pair = [0,1] # pair of contexts to compare and exemplify d'
cellid = 'ARM021b-36-8'



DF = jl.load(summary_DF_file)
def format_dataframe(DF):

    ff_analylis = DF.analysis.isin(['SC', 'fdPCA'])
    ff_corr = DF.mult_comp_corr == 'consecutive_3'

    good_cols =['analysis', 'mult_comp_corr', 'region', 'siteid',  'cellid', 'context_pair',
                'probe', 'metric', 'value']
    filtered = DF.loc[ff_analylis & ff_corr, good_cols]

    filtered['probe'] = [int(p) for p in filtered['probe']]
    filtered['context_pair'] = [f"{int(cp.split('_')[0]):02d}_{int(cp.split('_')[1]):02d}"
                                for cp in filtered['context_pair']]

    # rename metrics and analysis for ease of ploting
    filtered['metric'] = filtered['metric'].replace({'significant_abs_mass_center': 'center of mass (ms)',
                                                     'significant_abs_mean': "mean d'",
                                                     'significant_abs_sum': "integral (d'*ms)"})
    filtered['analysis'] = filtered['analysis'].replace({'SC': 'single cell',
                                                         'fdPCA': 'population',
                                                         'pdPCA': 'probewise pop',
                                                         'LDA': 'pop ceiling'})

    filtered['id'] = filtered['cellid'].fillna(value=filtered['siteid'])
    filtered = filtered.drop(columns=['cellid', 'siteid'])

    filtered['value'] = filtered['value'].fillna(value=0)

    # permutation related preprocesing.
    # creates a new column relating probe with  context pairs
    ctx = np.asarray([row.split('_') for row in filtered.context_pair], dtype=int)
    prb = np.asarray(filtered.probe, dtype=int)

    silence = ctx == 0
    same = ctx == prb[:,None]
    different = np.logical_and(~silence, ~same)

    name_arr = np.full_like(ctx, np.nan, dtype=object)
    name_arr[silence] = 'silence'
    name_arr[same] = 'same'
    name_arr[different] = 'diff'
    comp_name_arr = np.apply_along_axis('_'.join, 1, name_arr)

    # swaps clasification names to not have repetitions i.e. diff_same == same_diff
    comp_name_arr[np.where(comp_name_arr == 'same_silence')] = 'silence_same'
    comp_name_arr[np.where(comp_name_arr == 'diff_silence')] = 'silence_diff'
    comp_name_arr[np.where(comp_name_arr == 'diff_same')] = 'same_diff'
    comp_name_arr[np.where(comp_name_arr == 'same_silence')] = 'silence_same'

    filtered['trans_pair'] = comp_name_arr

    ord_cols = ['analysis', 'region', 'id', 'context_pair', 'trans_pair', 'probe', 'metric', 'value']
    pivot_idx = [col for col in ord_cols if col not in ['value', 'metric']]
    pivoted = filtered.pivot_table(index=pivot_idx, columns='metric', values='value', aggfunc='first').reset_index()

    full_long = filtered # saves long format for subsamplig analysis

    return pivoted, full_long
pivoted, filtered = format_dataframe(DF)

In [3]:
def nozero_percentage(arr):
    return np.sum(arr>0) / np.size(arr) * 100

# filteres dataframe and adds required columns
filtered = pivoted.loc[pivoted.analysis=='single cell', :]
filtered['site'] = filtered.id.apply(lambda x: x[:7])

ctx_pairs = [pair.split('_') for pair in filtered.context_pair]
ctx_pairs = np.stack(ctx_pairs, axis=0)
filtered['ctx_0'] = ctx_pairs[:, 0]
filtered['ctx_1'] = ctx_pairs[:, 1]

# function to aggregate each single cell. gives the proportion of significant instances
agg_funcs = {"signif_proportion": ("integral (d'*ms)", nozero_percentage)}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['site'] = filtered.id.apply(lambda x: x[:7])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['ctx_0'] = ctx_pairs[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['ctx_1'] = ctx_pairs[:, 1]


In [4]:
# iterates over all sizes of context subsampling, i.e. 2, 3, 4 and 5 contexts.
ctx_subsampling_path = pl.Path(config['paths']['analysis_cache']) / '220203_ctx_subsampling_DF'
recache_ctx = False
if (not ctx_subsampling_path.exists()) or recache_ctx:

    all_contexts = np.unique(ctx_pairs)
    ctx_subsamp_signif = pd.DataFrame()

    for num_ctx in range(2, len(all_contexts)+1):
        n_ctx_groups  = list(itt.combinations(all_contexts, num_ctx))
        # iterates over all possible combinations of n probes
        for cg, ctx_group in enumerate(n_ctx_groups):
            ctx_df = filtered.loc[(filtered.ctx_0.isin(ctx_group)) &
                                     ((filtered.ctx_1.isin(ctx_group))), :].set_index('region', 'site', 'id')
            grouped  = ctx_df.groupby(['region', 'site', 'id']).agg(**agg_funcs).copy()
            grouped.loc[grouped.signif_proportion == 0] = np.nan
            grouped['n_contexts'] = num_ctx
            grouped['ctx_group'] = '_'.join(ctx_group)
            ctx_subsamp_signif = ctx_subsamp_signif.append(grouped.reset_index())

    jl.dump(ctx_subsamp_signif, ctx_subsampling_path)
    print(f'ctx_subsamp_signif chached at {ctx_subsampling_path}')
else:
    print(f'loading ctx_subsamp_signif from {ctx_subsampling_path}')
    ctx_subsamp_signif = jl.load(ctx_subsampling_path)

display(ctx_subsampling_path)

ctx_subsamp_signif chached at /auto/users/mateo/code/context_probe_analysis/data/220203_ctx_subsampling_DF


PosixPath('/auto/users/mateo/code/context_probe_analysis/data/220203_ctx_subsampling_DF')

In [5]:
# iterates over all sizes of probe subsampling, i.e. 1, 2, 3 and 4 probes.
prb_subsampling_path = pl.Path(config['paths']['analysis_cache']) / '220203_prb_subsampling_DF'
recache_prb = False
if (not prb_subsampling_path.exists())  or recache_prb:
    print(f'creating prb_subsamp_signif dataframe')
    all_probes = filtered.probe.sort_values().unique()
    prb_subsamp_signif = pd.DataFrame()

    for num_probe in range(len(all_probes)):
        num_probe += 1

        n_probe_groups  = list(itt.combinations(all_probes, num_probe))
        # iterates over all possible combinations of n probes
        for pg, probe_group in enumerate(n_probe_groups):
            probe_group = [int(p) for p in probe_group]
            probes_df = filtered.loc[filtered.probe.isin(probe_group), :].set_index('region', 'site', 'id')
            grouped  = probes_df.groupby(['region', 'site', 'id']).agg(**agg_funcs).copy()
            grouped.loc[grouped.signif_proportion == 0] = np.nan
            grouped['n_probes'] = num_probe
            grouped['prb_group'] = '_'.join([str(p) for p in probe_group])
            prb_subsamp_signif = prb_subsamp_signif.append(grouped.reset_index())


    jl.dump(prb_subsamp_signif, prb_subsampling_path)
    print(f'prb_subsamp_signif chached at {prb_subsampling_path}')

else:
    print(f'loading prb_subsamp_signif from {prb_subsampling_path}')
    prb_subsamp_signif = jl.load(prb_subsampling_path)

display(prb_subsamp_signif)

creating prb_subsamp_signif dataframe
prb_subsamp_signif chached at /auto/users/mateo/code/context_probe_analysis/data/220203_prb_subsampling_DF


Unnamed: 0,region,site,id,signif_proportion,n_probes,prb_group
0,A1,AMT020a,AMT020a-02-1,,1,1
1,A1,AMT020a,AMT020a-04-1,,1,1
2,A1,AMT020a,AMT020a-07-1,,1,1
3,A1,AMT020a,AMT020a-08-1,,1,1
4,A1,AMT020a,AMT020a-13-1,,1,1
...,...,...,...,...,...,...
1064,PEG,CRD014b,CRD014b-24-2,7.5,10,1_2_3_4_5_6_7_8_9_10
1065,PEG,CRD014b,CRD014b-27-1,10.0,10,1_2_3_4_5_6_7_8_9_10
1066,PEG,CRD014b,CRD014b-27-2,12.5,10,1_2_3_4_5_6_7_8_9_10
1067,PEG,CRD014b,CRD014b-27-3,7.5,10,1_2_3_4_5_6_7_8_9_10


In [19]:
agg_funcs = {"contextual space coverage": ("integral (d'*ms)", nozero_percentage)}

ctx_prb_count = pivoted.set_index(['analysis', 'region', 'id']
                              ).groupby(['analysis', 'region', 'id']
                              ).agg({"integral (d'*ms)": nozero_percentage}).reset_index()

print(ctx_prb_count)

         analysis region            id  integral (d'*ms)
0      population     A1       AMT020a              52.5
1      population     A1       AMT021b              45.0
2      population     A1       AMT026a              37.5
3      population     A1       ARM005e              30.0
4      population     A1       ARM029a              55.0
...           ...    ...           ...               ...
1108  single cell    PEG  CRD014b-24-2               7.5
1109  single cell    PEG  CRD014b-27-1              10.0
1110  single cell    PEG  CRD014b-27-2              12.5
1111  single cell    PEG  CRD014b-27-3               7.5
1112  single cell    PEG  CRD014b-29-1              20.0

[1113 rows x 4 columns]


In [22]:

hist_data = ctx_prb_count.loc[ctx_prb_count.analysis =='single cell', :]

fig = px.histogram(data_frame=hist_data, x="integral (d'*ms)", histnorm='percent')
# fig.update_xaxes(title=dict(text='contextual space coverage (%)'))
fig.show()