In [None]:
import re
import numpy as np
import pandas as pd

# Lift the column limit so that wide DataFrames are displayed without truncation.
pd.set_option("display.max_columns", None)

def min_max_norm(data):
    """Rescale ``data`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    data : array-like
        Numeric values to rescale (e.g. a NumPy array).

    Returns
    -------
    Same container type as ``data - np.min(data)`` produces, holding
    ``(data - min) / (max - min)``.

    Notes
    -----
    When every value is identical the range is zero and the plain formula
    would yield 0/0 = NaN for every element. In that case the range is
    treated as 1, so the result is all zeros instead of NaN.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    # Constant input: avoid 0/0. Only applies when min/max reduce to a scalar.
    if np.ndim(span) == 0 and span == 0:
        span = 1.0
    return (data - lowest) / span

# User inputs. NOTE: the per-bin arrays below are positional -- the first
# element is matched with the lowest bin number, the second with the next
# bin, and so on.

input_file = 'Bins.tsv' # Table that is read with pd.read_csv
output_file = 'p_Bins.tsv' # Table that is written with DataFrame.to_csv

delim = '\t' # Column delimiter of the input file

aa_column  = 'aa_mutation' # Mutation column name
n_aa_column = 'n_aa_substitutions' # Number of mutations column name
bin_column = 'bin' # Bin number column name
bc_column = 'barcode' # Barcode string column name

expr_bins = [1,2,3,4] # Bins with expression data
va_ref_bin = [5] # Reference bin for viscosity agent
va_bins = [6,8,10,12,14] # Positive bins for viscosity agent experiment
va_cons = [.0,.05,.1,.15,.2] # Viscosity agent concentrations; va_cons[i] belongs to va_bins[i]

# Name given to the wild type: an out-of-range variant. The trailing space is
# part of the value. Cells of the input table that hold exactly one space are
# replaced by this string.
wild_type = 'Z1000Z '

# Median expression in expression experiment (one value per expression bin),
# min-max normalised to [0, 1] and shifted by 1 so the weights lie in [1, 2].
expr_scs  = min_max_norm(np.array([489,2902,8999,17745])) + 1

# One entry per bin (14 values each), ordered from the lowest bin number upwards.
count_cells = np.array([16709793,3754589,2834337,2249482,2000000,187000,2000000,230000,1951798,340426,2018790,
                        506977,1582778,969420]) # number of total measured cells per bin
total_reads = np.array([32964833,30377459,25372847,29016725,9809899,16868280,24891324,22768756,14914290,10776556,
                        15074880,16724952,17855672,23342164]) # total number of reads per bin

# Measured cells per sequencing read for every bin; multiplied with the read
# counts ('size') to obtain the 'c_size' column.
amp_factors = count_cells / total_reads

In [None]:
# Load the raw table of read counts.
df = pd.read_csv(input_file, delimiter=delim)

# Cells that hold exactly one space are replaced by the wild-type name.
# DataFrame.replace returns a new frame, so `df` itself is left untouched.
cf = df.replace(to_replace=r'^ $', value=wild_type, regex=True)

# amp_factors is positional: entry 0 belongs to the lowest bin found in the data.
min_bin = cf[bin_column].min()
amp_index = (cf[bin_column] - min_bin).astype(int).to_numpy()

# c_size: read count scaled by the cells-per-read factor of the row's bin.
# Computed for the whole column at once instead of row by row.
cf['c_size'] = cf['size'].astype(float) * amp_factors[amp_index]

# Keep the unscaled read count under the name 'i_size'.
cf = cf.rename(columns={'size':'i_size'})


# Per-mutation confidence statistics, computed once per experiment.
# Iteration -1 uses the expression bins; iterations 0..len(va_bins)-1 each
# combine the viscosity-agent reference bin with one positive bin.
# The per-mutation summaries of all iterations are merged into `antm`.
for bi in np.arange(-1,len(va_bins)):
    if bi == -1:
        tf = cf.query(f"{bin_column} in @expr_bins")
    else:
        curr_bin = np.atleast_1d(va_bins[bi])
        tf = cf.query(f"{bin_column} in @va_ref_bin or {bin_column} in @curr_bin")

    # n_barcodes: number of distinct barcodes seen for each mutation.
    tm = tf[[aa_column,bc_column]].drop_duplicates().groupby(
        [aa_column], as_index=False
    ).size().rename(columns={'size':'n_barcodes'})
    tf = tf.merge(tm, on=[aa_column], how='outer')

    # cm_size: c_size summed over all rows of a mutation within one bin.
    tm = tf[[aa_column,bin_column,'c_size']].groupby([aa_column,bin_column], as_index=False).sum()
    tf = tf.merge(tm, on=[aa_column,bin_column], how='outer')

    tf = tf.rename(columns={'c_size_x':'c_size','c_size_y':'cm_size'})

    # cmt_size: cm_size summed over the bins of a mutation.
    # The bin column is part of the de-duplication key so that two bins which
    # happen to share the same cm_size are both counted; de-duplicating on
    # (mutation, cm_size) alone would silently drop one of them.
    tm = tf[[aa_column,bin_column,'cm_size']].drop_duplicates().drop(
        columns=[bin_column]
    ).groupby([aa_column], as_index=False).sum()
    tf = tf.merge(tm, on=[aa_column], how='outer')

    tf = tf.rename(columns={'cm_size_x':'cm_size','cm_size_y':'cmt_size'})

    # cb_size: c_size summed over all rows of a barcode.
    tm = tf[[bc_column,'c_size']].groupby([bc_column], as_index=False).sum()
    tf = tf.merge(tm, on=[bc_column], how='outer')

    tf = tf.rename(columns={'c_size_x':'c_size','c_size_y':'cb_size'})

    #Explained variation

    # b_score: share of a barcode's c_size that falls into the row's bin.
    rf = tf.assign(b_score=lambda x: x['c_size'] / x['cb_size']).copy()

    # c_size-weighted mean of a group. `rf` is looked up when the function is
    # called, i.e. it always refers to the frame that is being aggregated.
    wm = lambda x: np.average(x, weights=rf.loc[x.index,'c_size'])

    # bb_score: weighted mean b_score of a mutation within one bin.
    tm = rf[[aa_column,bin_column,'b_score']].groupby(
        [aa_column, bin_column], as_index=False
    ).agg(bb_score=('b_score',wm))
    rf = rf.merge(tm, on=[aa_column, bin_column], how='outer')

    rf = rf.assign(diff=lambda x: x['b_score'] - x['bb_score'])

    # Weighted means per mutation of the residual and of b_score itself.
    tm = rf[[aa_column,'diff','b_score']].groupby([aa_column]).agg(diff_avg=('diff',wm),true_avg=('b_score',wm))
    rf = rf.merge(tm, on=[aa_column], how='outer')

    # Squared deviations: residual (nom) and total (denom).
    rf = rf.assign(nom=lambda x: (x['b_score'] - x['bb_score'] - x['diff_avg'])**2)
    rf = rf.assign(denom=lambda x: (x['b_score'] - x['true_avg'])**2)

    tm = rf[[aa_column,'nom','denom']].groupby([aa_column]).agg(nom_avg=('nom',wm),denom_avg=('denom',wm))
    rf = rf.merge(tm, on=[aa_column], how='outer')

    # evs = 1 - nom_avg / denom_avg, defined as 1 when denom_avg is zero.
    # Evaluated column-wise; rows with denom_avg == 0 take the constant branch.
    rf['evs'] = np.where(rf['denom_avg'] == 0, 1, 1 - rf['nom_avg'] / rf['denom_avg'])

    # conf combines a saturating barcode-count term (n_barcodes clipped to
    # [0, 32]) with the total scaled count and the explained variation.
    rf = rf.assign(
        conf=lambda x: (1-(1/1.5**np.clip(x['n_barcodes'],0,32))) * x['cmt_size'] * x['evs']
    ).reset_index(drop=True)

    # One summary row per mutation, highest confidence first.
    an = rf[[aa_column,n_aa_column,'n_barcodes','cmt_size','conf','evs']].drop_duplicates()\
    .sort_values(by=['conf','cmt_size','n_barcodes','evs'], ascending=False).reset_index(drop=True)

    if bi == -1:
        # Expression experiment: columns carry the 'exp_' prefix.
        antm = an.rename(
            columns={'n_barcodes':'exp_n_barcodes', 'cmt_size':'exp_cmt_size', 'conf':'exp_conf', 'evs':'exp_evs'}
        ).copy()
    else:
        # Viscosity-agent experiment: columns are prefixed with the
        # concentration formatted to two decimals (e.g. '0.05_conf').
        an = an.rename(columns={'n_barcodes':f'{va_cons[bi]:.2f}_n_barcodes',
                                'cmt_size':f'{va_cons[bi]:.2f}_cmt_size',
                                'conf':f'{va_cons[bi]:.2f}_conf',
                                'evs':f'{va_cons[bi]:.2f}_evs'}).copy()

        antm = antm.merge(an, on=[aa_column, n_aa_column], how='outer')

# Collapse barcodes: summed counts per (mutation, substitution count, bin).
# numeric_only=True keeps the string columns (e.g. the barcode) out of the
# sum; without it newer pandas versions concatenate the strings of every group.
m_cf = cf.groupby([aa_column,n_aa_column,bin_column], as_index=False).sum(numeric_only=True)

ff = m_cf.query(f"{bin_column} in @expr_bins").copy()

# Weight of every expression bin. expr_scs is positional (first entry belongs
# to the lowest bin), so the bins are paired with it in ascending order rather
# than assuming that bin numbers start at 1.
bin_weights = dict(zip(sorted(expr_bins), expr_scs))
ff['weight'] = ff[bin_column].map(bin_weights)

# Weighted share of the mutation's total c_size that falls into each bin ...
total_c_size = ff.groupby(aa_column)['c_size'].transform('sum')
ff['b_score'] = ff['weight'] * ff['c_size'] / total_c_size

# ... summed over the bins to give one expression score per mutation.
ff['expr_score'] = ff.groupby(aa_column)['b_score'].transform('sum')

# Reference score of the wild type (NaN when the wild type is absent).
wt_expr = ff.loc[ff[aa_column] == wild_type, 'expr_score'].mean()

# expr: log2 fold change against the wild type; a score of exactly 0 is
# reported as NaN instead of -inf.
ff['expr'] = np.log2((ff['expr_score'] / wt_expr).where(ff['expr_score'] != 0))

ef = ff[[aa_column,'expr']].drop_duplicates()

# Attach the expression value to the per-mutation confidence table.
full = antm.merge(ef, on=[aa_column], how='outer')

# Enrichment of every mutation at each viscosity-agent concentration:
# log2 of (positive-bin count / reference-bin count), relative to the same
# ratio of the wild type. One '<concentration>_enrich' column is added to
# `full` per positive bin.
for bi, va_bin in enumerate(va_bins):
    curr_bin = np.atleast_1d(va_bin)
    enrich_col = f'{va_cons[bi]:.2f}_enrich'

    # Rows are ordered by mutation and then by bin, so a reference-bin row is
    # directly followed by the positive-bin row(s) of the same mutation.
    ff = m_cf.query(
        f"{bin_column} in @va_ref_bin or {bin_column} in @curr_bin"
    ).copy().sort_values(by=[aa_column,bin_column])

    # Carry the most recent reference row (its mutation and raw count) down
    # to the rows that follow it.
    is_ref = ff[bin_column].isin(va_ref_bin)
    ref_size = ff['i_size'].where(is_ref).ffill()
    ref_mut = ff[aa_column].where(is_ref).ffill()

    # A positive-bin row gets a ratio only when the preceding reference row
    # belongs to the same mutation. The column is always created (NaN where
    # unpaired), so the steps below also work when nothing could be paired.
    paired = ~is_ref & (ff[bin_column] == va_bin) & (ref_mut == ff[aa_column])
    ff['ratio'] = (ff['i_size'] / ref_size).where(paired)

    # Wild-type ratio used as the reference. NOTE(review): this is 0 when the
    # wild type has no paired row, which turns every enrichment into +/-inf.
    wt_enri = ff.loc[ff[aa_column] == wild_type, 'ratio'].sum()
    ff[enrich_col] = np.log2(ff['ratio'] / wt_enri)

    # Keep only mutations with a defined enrichment and attach them to `full`.
    fftm = ff[[aa_column, enrich_col]].dropna()
    full = full.merge(fftm, on=[aa_column], how='outer')
    
def _split_synonymous(mutations, placeholder):
    """Classify one mutation string and strip its synonymous substitutions.

    Parameters
    ----------
    mutations : str
        Whitespace-separated substitutions. The runs of capital letters / '*'
        found in the string are read as consecutive (original, new) pairs.
    placeholder : str
        Name used when no substitution is left after stripping.

    Returns
    -------
    tuple
        ``(tag, stripped, n_removed)`` where ``tag`` is 2 if every pair is
        synonymous, else 1 if a '*' residue occurs, else 0; ``stripped`` is
        the remaining substitutions, each followed by one space; and
        ``n_removed`` is the amount to subtract from the substitution count
        (the number of synonymous pairs, reduced by one when the placeholder
        had to be inserted).

    Raises
    ------
    ValueError
        If the residues cannot be grouped into pairs.
    """
    residues = re.findall(r'[A-Z*]+', mutations)
    if len(residues) % 2:
        raise ValueError(f"cannot pair the residues of mutation string {mutations!r}")

    # 1 for a substitution whose two residues are identical. 'Z' is the
    # residue of the wild-type placeholder and never counts as synonymous.
    syn = [
        int(before == after and before != 'Z')
        for before, after in zip(residues[0::2], residues[1::2])
    ]

    if sum(syn) == len(syn):
        tag = 2
    elif '*' in residues:
        tag = 1
    else:
        tag = 0

    kept = [tok for pos, tok in enumerate(mutations.split()) if syn[pos] == 0]
    n_removed = sum(syn)
    if not kept:
        # Nothing but synonymous substitutions: fall back to the placeholder,
        # which offsets the removed count by one.
        kept = [placeholder]
        n_removed -= 1

    return tag, "".join(tok + " " for tok in kept), n_removed


af = full.copy()

# The placeholder is the configured wild-type name without its trailing
# space; the space is added back when the stripped string is assembled.
parsed = [_split_synonymous(mutations, wild_type.strip()) for mutations in af[aa_column]]

# Stored as floats, like the columns produced by cell-wise assignment, so the
# values written to the output file keep their format.
af['tag'] = np.array([tag for tag, _, _ in parsed], dtype=float)
af['aa_mutation_syn'] = [stripped for _, stripped, _ in parsed]
af['n_aa_substitutions_syn'] = (
    af[n_aa_column].astype(float) - np.array([n_removed for _, _, n_removed in parsed], dtype=float)
)

af.sort_values(by=[aa_column]).reset_index(drop=True).to_csv(output_file,index=False)
af

In [None]:
import seaborn as sns

# Histogram of the enrichment at one concentration, coloured by tag.
peg = '00'  # decimals of the concentration; columns are prefixed with '0.<peg>'
conc_prefix = f'0.{peg}'

# Give the columns of the chosen concentration generic names so the filter
# below does not depend on the concentration.
generic_names = {
    f'{conc_prefix}_evs': 'evs',
    f'{conc_prefix}_cmt_size': 'cmt_size',
    f'{conc_prefix}_conf': 'conf',
}
enrich_subset = af.rename(columns=generic_names).query(
    " evs != 1.0 and n_aa_substitutions_syn <= 1 and conf > 10"
)

g = sns.histplot(
    data=enrich_subset,
    x=f'{conc_prefix}_enrich',
    hue='tag',
    palette='tab10',
)
g.set(xlim=(-10,2))

In [None]:
import seaborn as sns

# Histogram of the expression values, coloured by tag, restricted to rows
# that pass the filter below.
expr_subset = af.query(" exp_evs != 1.0 and exp_cmt_size > 1 and n_aa_substitutions_syn <= 1")
sns.histplot(
    data=expr_subset,
    x='expr',
    hue='tag',
    palette='tab10',
)