In [7]:
# script that computes the BRET quantifications with an updated approach considering only the BRET50 and maxBRET

import db_utils
import pandas as pd
import numpy as np
import scipy.stats

# Database connection shared by every query in this notebook
# (db_utils is presumably a project-local helper module -- confirm).
connect = db_utils.get_connection()

# Output directory: the final result table is written to this directory in the last cell.
# NOTE(review): absolute, user-specific path; adjust before running on another machine.
path = '/Users/luck/IMB/projects/NDD_candidate_selection/BRET_titration_reanalysis/'

In [None]:
# Build a table with one row per mutant (Mut) pair and experiment that helps to decide whether the
# mutation perturbed binding or not.

# Wild-type (WT) pairs as gene-symbol tuples. A mutant pair is looked up in this list further down via
# its (NL_symbol, mCit_symbol) tuple, so the first symbol belongs to the NL plasmid and the second to
# the mCit plasmid.
WT_pairs = [('REPS1','NUMB'),('REPS1','TRAPPC2L'),('CTBP1','CTBP2'),('CTBP1','DMRTB1'),('CTBP1','IKZF1'),('CTBP1','TGIF1'),('PPP3CA','FAM167A'),('PPP3CA','PPP3R2'),\
           ('WWOX','CPSF6'),('WWOX','CSNK2B'),('WWOX','DAZAP2'),('WWOX','HOXA1'),('WWOX','LITAF'),('WWOX','SNRPC'),('WWOX','IKZF1'),('IQCB1','CALM1'),\
           ('IQCB1','CALML3'),('IQCB1','LDOC1'),('IQCB1','MNS1'),('IQCB1','PCMT1'),('IQCB1','SPG21'),('SPOP','MYD88'),('SPOP','RXRB')]

# Plasmid-ID tuples (NL_plasmid_id, mCit_plasmid_id) of the WT pairs. The list is position-matched to
# WT_pairs: entry k here belongs to entry k of WT_pairs, so both lists must keep the same length and order.
WT_pair_ids = [('KL_510','KL_730'),('KL_510','KL_737'),('KL_519','KL_741'),('KL_519','KL_745'),('KL_519','KL_748'),('KL_519','KL_758'),('KL_520','KL_699'),('KL_520','KL_711'),\
               ('KL_524','KL_702'),('KL_524','KL_644'),('KL_524','KL_708'),('KL_524','KL_719'),('KL_524','KL_724'),('KL_524','KL_117'),('KL_524','KL_748'),('KL_525','KL_760'),\
               ('KL_525','KL_740'),('KL_525','KL_744'),('KL_525','KL_747'),('KL_525','KL_750'),('KL_525','KL_757'),('KL_533','KL_700'),('KL_533','KL_705')]

# Load dalmira_db.titrations_filtering together with the gene symbols of both plasmids, then split
# the rows into WT pairs (WT_df) and all remaining, i.e. mutant, pairs (mut_df).
query = """select a.*, c.gene_symbol NL_symbol, d.gene_symbol mCit_symbol 
            from dalmira_db.titrations_filtering a, LUCK_DB.Luck_lab_plasmids c, LUCK_DB.Luck_lab_plasmids d  
            where a.NL_plasmid_id=c.plasmid_id and a.mCit_plasmid_id=d.plasmid_id"""
mut_df = pd.read_sql(query,connect)

# Mark every row whose (NL_plasmid_id, mCit_plasmid_id) combination is one of the WT pairs.
# A single mask replaces growing WT_df with pd.concat inside the loop (which started from an empty,
# column-only frame and therefore lost the column dtypes) and dropping rows from mut_df in place.
is_wt_row = pd.Series(False, index=mut_df.index)
for nl_id, mcit_id in WT_pair_ids:
    is_wt_row |= (mut_df['NL_plasmid_id']==nl_id) & (mut_df['mCit_plasmid_id']==mcit_id)

# Both frames keep the original row labels; the later cells only filter and count rows in them.
WT_df = mut_df.loc[is_wt_row].copy()
mut_df = mut_df.loc[~is_wt_row].copy()

# Get a dataframe with the type (description) of each mutation, keyed by plasmid_id.
# The subquery builds the mutant plasmid name as <template_plasmid_name>_<mutation> from
# dalmira_db.mutant_description; it is matched against plasmid_name in LUCK_DB.Luck_lab_plasmids
# to obtain the plasmid_id. Resulting columns: plasmid_id, mut_name, description.
query = """select a.plasmid_id, b.mut_name, b.description 
            from LUCK_DB.Luck_lab_plasmids a, 
            (select concat(template_plasmid_name,'_',mutation) mut_name, description from dalmira_db.mutant_description) as b 
            where a.plasmid_name=b.mut_name"""
mut_desc_df = pd.read_sql(query,connect)


In [9]:
# One row per distinct combination of project_id and mutant plasmid pair; the replicate rows of
# mut_df collapse into a single row each.
final_df = mut_df[['project_id','NL_plasmid_id','mCit_plasmid_id','NL_plasmid','mCit_plasmid','NL_symbol','mCit_symbol']].drop_duplicates(ignore_index=True)

# mut_type is filled with description strings in the last cell. Create it as an object column
# (still all NaN) so that writing a string into it is not an incompatible-dtype assignment on a
# float64 column, which recent pandas versions warn about and newer ones reject.
final_df['mut_type'] = pd.Series(np.nan, index=final_df.index, dtype=object)

# Columns filled per row by the loop below: the plasmid IDs of the matching WT pair and the number
# of replicates flagged include==1 / use_fit==1 for the mutant (MUT) and the WT pair.
final_df['NL_plasmid_id_WT'] = ''
final_df['mCit_plasmid_id_WT'] = ''
final_df['num_repl_include_MUT'] = 0
final_df['num_use_fit_MUT'] = 0
final_df['num_repl_include_WT'] = 0
final_df['num_use_fit_WT'] = 0
def _count_flagged(table, project_id, nl_plasmid_id, mcit_plasmid_id, flag):
    """Count the rows of `table` for one plasmid pair in one project whose `flag` column equals 1.

    table           -- dataframe with the columns project_id, NL_plasmid_id, mCit_plasmid_id and `flag`
    project_id      -- value to match in the project_id column
    nl_plasmid_id   -- value to match in the NL_plasmid_id column
    mcit_plasmid_id -- value to match in the mCit_plasmid_id column
    flag            -- name of the 0/1 column to test ('include' or 'use_fit')

    Returns the number of matching rows as an int.
    """
    selected = (table['project_id']==project_id) & (table['NL_plasmid_id']==nl_plasmid_id) & \
               (table['mCit_plasmid_id']==mcit_plasmid_id) & (table[flag]==1)
    return table.loc[selected].shape[0]


# WT_pairs and WT_pair_ids are position-matched lists; a length mismatch would silently pair
# symbols with the wrong plasmid IDs, so fail early.
if len(WT_pairs) != len(WT_pair_ids):
    raise ValueError('WT_pairs and WT_pair_ids must have the same length (%d vs %d)' % (len(WT_pairs), len(WT_pair_ids)))
wt_ids_by_symbols = dict(zip(WT_pairs, WT_pair_ids))

for i,row in final_df.iterrows():
    # replicate counts of the mutant pair in this project
    final_df.at[i,'num_repl_include_MUT'] = _count_flagged(mut_df, row['project_id'], row['NL_plasmid_id'], row['mCit_plasmid_id'], 'include')
    final_df.at[i,'num_use_fit_MUT'] = _count_flagged(mut_df, row['project_id'], row['NL_plasmid_id'], row['mCit_plasmid_id'], 'use_fit')

    # The WT pair is identified through the gene symbols of the mutant pair.
    WT_pair = (row['NL_symbol'],row['mCit_symbol'])
    if WT_pair not in wt_ids_by_symbols:
        # same exception type as the former list.index() lookup, but naming the offending pair
        raise ValueError('no WT pair defined for symbols %r (project %r, plasmids %r / %r)' %
                         (WT_pair, row['project_id'], row['NL_plasmid_id'], row['mCit_plasmid_id']))
    WT_pair_id = wt_ids_by_symbols[WT_pair]
    final_df.at[i,'NL_plasmid_id_WT'] = WT_pair_id[0]
    final_df.at[i,'mCit_plasmid_id_WT'] = WT_pair_id[1]

    # replicate counts of the corresponding WT pair in the same project
    final_df.at[i,'num_repl_include_WT'] = _count_flagged(WT_df, row['project_id'], WT_pair_id[0], WT_pair_id[1], 'include')
    final_df.at[i,'num_use_fit_WT'] = _count_flagged(WT_df, row['project_id'], WT_pair_id[0], WT_pair_id[1], 'use_fit')


In [10]:
# T-test on BRET50 where applicable: create the result columns first. They are initialised with
# NaN and keep that value for every row the test loop does not process.
for bret50_col in ('pval_bret50','tstat_bret50','avg_bret50_MUT','std_bret50_MUT','avg_bret50_WT','std_bret50_WT'):
    final_df[bret50_col] = np.nan

# The query text does not depend on the row, so it is built once instead of in every iteration.
# %%s survives the table-name formatting as %s, the placeholder filled in by cursor.execute().
BRET50_TABLE = 'luthy_data.titration_fit'
BRET50_QUERY = """select bret50 from %s where project_id=%%s and NL_plasmid_id=%%s and mCit_plasmid_id=%%s""" % (BRET50_TABLE)


def _fetch_bret50s(project_id, nl_plasmid_id, mcit_plasmid_id):
    """Return the bret50 values stored in BRET50_TABLE for one plasmid pair in one project.

    The three arguments are passed as query parameters for project_id, NL_plasmid_id and
    mCit_plasmid_id. Returns a list with the first column of every fetched row.
    """
    with connect.cursor() as cursor:
        cursor.execute(BRET50_QUERY,(project_id,nl_plasmid_id,mcit_plasmid_id))
        return [r[0] for r in cursor.fetchall()]


for i,row in final_df.iterrows():

    # only test pairs with at least 3 replicates flagged use_fit for both WT and MUT
    if not (row['num_use_fit_WT'] >= 3 and row['num_use_fit_MUT'] >= 3):
        continue

    # NOTE(review): the query has no use_fit filter, whereas the gate above counts use_fit==1
    # replicates from titrations_filtering -- confirm that titration_fit only holds usable fits.
    pair_bret50s = _fetch_bret50s(row['project_id'],row['NL_plasmid_id'],row['mCit_plasmid_id'])
    WT_pair_bret50s = _fetch_bret50s(row['project_id'],row['NL_plasmid_id_WT'],row['mCit_plasmid_id_WT'])

    # argument order is WT first, MUT second: a negative statistic means the WT mean is the lower one
    test_result = scipy.stats.ttest_ind(WT_pair_bret50s,pair_bret50s)
    final_df.at[i,'pval_bret50'] = test_result.pvalue
    final_df.at[i,'tstat_bret50'] = test_result.statistic
    # np.std with its default ddof=0, i.e. the population standard deviation
    final_df.at[i,'avg_bret50_MUT'] = np.mean(pair_bret50s)
    final_df.at[i,'std_bret50_MUT'] = np.std(pair_bret50s)
    final_df.at[i,'avg_bret50_WT'] = np.mean(WT_pair_bret50s)
    final_df.at[i,'std_bret50_WT'] = np.std(WT_pair_bret50s)
        

In [None]:
# cBRET comparison at high expression, per WT/MUT pair and experiment:
#  - expr_ratio is fluo/lumi; its highest value over the WT and MUT measurements defines the range
#  - a window of width max/bin_num is slid down from the maximum in steps of max/sliding_steps
#  - the first (highest) window holding at least 3 measurements from WT and from MUT is used to
#    compare the cBRET values of WT vs MUT with a t-test, and the results are stored in final_df
# Note: the loop below slides the window all the way down to zero; the variant that stopped above
# the lowest bin is not the active loop condition.

# result columns: measurement counts start at 0, everything else at NaN
for count_col in ('num_data_expr_bin_WT','num_data_expr_bin_MUT'):
    final_df[count_col] = 0
for value_col in ('lower_expr_bound','upper_expr_bound','num_bin',
                  'avg_cBRET_WT','std_cBRET_WT','avg_cBRET_MUT','std_cBRET_MUT',
                  'pval_cBRET','tstat_cBRET'):
    final_df[value_col] = np.nan

# all titration measurements; filtered per pair and project inside the loop
query = """select * from luthy_data.titration_values"""
tit_val_df = pd.read_sql(query,connect)

# window width = max_expr_ratio/bin_num, window shift = max_expr_ratio/sliding_steps
bin_num = 4
sliding_steps = 20

for i,row in final_df.iterrows():

    # the comparison needs at least 3 included replicates for the WT and for the MUT pair
    if not (row['num_repl_include_WT'] >= 3 and row['num_repl_include_MUT'] >= 3):
        continue

    # all measurements of this project that belong to the mutant pair or to its WT pair
    same_project = tit_val_df['project_id']==row['project_id']
    is_mut_pair = (tit_val_df['NL_plasmid_id']==row['NL_plasmid_id']) & (tit_val_df['mCit_plasmid_id']==row['mCit_plasmid_id'])
    is_wt_pair = (tit_val_df['NL_plasmid_id']==row['NL_plasmid_id_WT']) & (tit_val_df['mCit_plasmid_id']==row['mCit_plasmid_id_WT'])

    # .copy(): expr_ratio is added to an independent frame, not to a slice of tit_val_df
    sub_df = tit_val_df.loc[same_project & (is_mut_pair | is_wt_pair)].copy()
    sub_df['expr_ratio'] = sub_df['fluo']/sub_df['lumi']
    max_expr_ratio = np.max(sub_df['expr_ratio'])

    # Without a positive, finite maximum no window can be built. For NaN (no data), inf (lumi == 0)
    # or a negative maximum no window ever matched, so skipping leaves the row untouched as before;
    # for a maximum of exactly 0 the step size is 0 and the while loop below would never terminate.
    if not (0 < max_expr_ratio < float('inf')):
        continue

    # windows (lower_bound, upper_bound), ordered from the highest expression ratio downwards
    bin_width = max_expr_ratio/bin_num
    stepsize = max_expr_ratio/sliding_steps
    bins = []
    lower_bound = max_expr_ratio - bin_width
    upper_bound = max_expr_ratio
    while lower_bound >= 0:
        bins.append((lower_bound,upper_bound))
        lower_bound = lower_bound - stepsize
        upper_bound = upper_bound - stepsize
    # one final window whose lower bound is below zero, so the lowest ratios are covered as well
    bins.append((lower_bound,upper_bound))

    # split once into MUT and WT measurements; only the window bounds change inside the loop
    mut_values = sub_df.loc[(sub_df['NL_plasmid_id']==row['NL_plasmid_id']) & (sub_df['mCit_plasmid_id']==row['mCit_plasmid_id'])]
    wt_values = sub_df.loc[(sub_df['NL_plasmid_id']==row['NL_plasmid_id_WT']) & (sub_df['mCit_plasmid_id']==row['mCit_plasmid_id_WT'])]

    for sel_bin in bins:
        # window is open at the lower and closed at the upper bound
        sub_df_MUT = mut_values.loc[(mut_values['expr_ratio']>sel_bin[0]) & (mut_values['expr_ratio']<=sel_bin[1])]
        sub_df_WT = wt_values.loc[(wt_values['expr_ratio']>sel_bin[0]) & (wt_values['expr_ratio']<=sel_bin[1])]

        if sub_df_MUT.shape[0]>=3 and sub_df_WT.shape[0]>=3:
            final_df.at[i,'upper_expr_bound'] = sel_bin[1]
            final_df.at[i,'lower_expr_bound'] = sel_bin[0]
            final_df.at[i,'num_data_expr_bin_MUT'] = sub_df_MUT.shape[0]
            final_df.at[i,'num_data_expr_bin_WT'] = sub_df_WT.shape[0]
            # np.std with its default ddof=0, i.e. the population standard deviation
            final_df.at[i,'avg_cBRET_MUT'] = np.mean(sub_df_MUT['cBRET'])
            final_df.at[i,'std_cBRET_MUT'] = np.std(sub_df_MUT['cBRET'])
            final_df.at[i,'avg_cBRET_WT'] = np.mean(sub_df_WT['cBRET'])
            final_df.at[i,'std_cBRET_WT'] = np.std(sub_df_WT['cBRET'])

            # argument order is MUT first, WT second (the BRET50 test uses WT first):
            # a negative statistic here means the MUT mean is the lower one
            test_result = scipy.stats.ttest_ind(sub_df_MUT['cBRET'],sub_df_WT['cBRET'])
            final_df.at[i,'pval_cBRET'] = test_result.pvalue
            final_df.at[i,'tstat_cBRET'] = test_result.statistic

            # the highest window with enough data wins; lower windows are not evaluated
            break


In [None]:
# Add the mutation type information to the final_df dataframe and write the table to disk.

# mut_type receives description strings; cast the (all-NaN) column to object first so the
# assignments below do not write strings into a float64 column. NaN entries are kept and are
# still written as empty fields by to_csv.
final_df['mut_type'] = final_df['mut_type'].astype(object)

for i,row in final_df.iterrows():
    # descriptions of the mutant plasmid on either side of the pair; the first match is used
    descriptions = mut_desc_df.loc[(mut_desc_df['plasmid_id']==row['NL_plasmid_id']) | (mut_desc_df['plasmid_id']==row['mCit_plasmid_id']),'description']
    if not descriptions.empty:
        final_df.at[i,'mut_type'] = descriptions.iloc[0]

# tab-separated export without the dataframe index
final_df.to_csv(path + 'titration_cat_scoring_v6.tsv',sep='\t',index=False)