In [114]:
# This notebook filters the expression data and adds promoter information

In [8]:
import pandas as pd
import scipy.stats
import numpy as np
import statsmodels.stats.multitest as STM
import math
import copy
import statistics as ST

## 0. Functions

In [9]:
def all_present_total_new(input_df,cutoff_DNA,cutoff_RNA,count_type):
    """Keep the rows whose DNA and RNA counts reach the cutoffs in all three samples.

    Parameters
    ----------
    input_df : DataFrame holding the per-sample count columns named below.
    cutoff_DNA, cutoff_RNA : minimum value (inclusive, ``>=``) required in every
        DNA / RNA column of the selected kind.
    count_type : 'absolute' thresholds the ``*_absolute_count_S1..S3`` columns;
        any other value thresholds the ``*_relative_count_S1..S3`` columns.

    Returns
    -------
    The subset of ``input_df`` (original index and columns) passing all six checks.
    """
    columns_by_kind = {
        'absolute': (('DNA_absolute_count_S1', 'DNA_absolute_count_S2', 'DNA_absolute_count_S3'),
                     ('RNA_absolute_count_S1', 'RNA_absolute_count_S2', 'RNA_absolute_count_S3')),
        'relative': (('DNA_relative_count_S1', 'DNA_relative_count_S2', 'DNA_relative_count_S3'),
                     ('RNA_relative_count_S1', 'RNA_relative_count_S2', 'RNA_relative_count_S3')),
    }
    # Anything that is not exactly 'absolute' falls back to the relative counts.
    dna_columns, rna_columns = columns_by_kind['absolute' if count_type == 'absolute' else 'relative']

    checks = [input_df[name] >= cutoff_DNA for name in dna_columns]
    checks += [input_df[name] >= cutoff_RNA for name in rna_columns]

    keep_rows = checks[0]
    for check in checks[1:]:
        keep_rows = keep_rows & check
    return input_df[keep_rows]

In [10]:
def add_sample_specific_relative_expression(input_data_frame):
    """Add per-sample normalized relative expression plus its mean, SD and CV.

    For each sample S1..S3 the expression is
    ``N_RNA_relative_count_Sx / N_DNA_relative_count_Sx``.  Six columns are
    written, in this order: ``S1_normalized_relative``, ``S2_normalized_relative``,
    ``S3_normalized_relative``, ``Mean_normalized_relative``,
    ``SD_normalized_relative`` (sample standard deviation, n-1 denominator) and
    ``CV_normalized_relative`` (SD / mean).

    The frame passed in is modified in place and is also returned.
    """
    sample_columns = []
    for sample in ('S1', 'S2', 'S3'):
        column = sample + '_normalized_relative'
        input_data_frame[column] = (input_data_frame['N_RNA_relative_count_' + sample]
                                    / input_data_frame['N_DNA_relative_count_' + sample])
        sample_columns.append(column)

    # Plain addition (not .mean) so a missing sample value makes the mean missing too.
    input_data_frame['Mean_normalized_relative'] = (input_data_frame['S1_normalized_relative']
                                                    + input_data_frame['S2_normalized_relative']
                                                    + input_data_frame['S3_normalized_relative'])/3
    # Vectorized across the three sample columns instead of a Python-level call per row.
    # ddof=1 is the sample standard deviation; skipna=False keeps a row missing when any
    # of its three values is missing rather than computing the SD from the remaining two.
    input_data_frame['SD_normalized_relative'] = input_data_frame[sample_columns].std(axis=1, ddof=1, skipna=False)
    input_data_frame['CV_normalized_relative'] = (input_data_frame['SD_normalized_relative']
                                                  / input_data_frame['Mean_normalized_relative'])
    return(input_data_frame)

In [11]:
def Expression_Preprocessing(temp):
    """Add every RNA/DNA expression ratio used downstream to ``temp`` and return it.

    Columns are written onto the frame that is passed in, in this order: the two
    pooled-read ratios, the per-sample normalized relative block produced by
    ``add_sample_specific_relative_expression``, the pooled normalized absolute
    ratio, and then the per-sample normalized-absolute, absolute and relative ratios.
    """
    # Pooled ("Total") reads: raw relative counts first, then the normalized relative counts.
    temp['expression'] = temp['RNA_relative_count_Total'] / temp['DNA_relative_count_Total']
    temp['expression_normalized'] = temp['N_RNA_relative_count_Total'] / temp['N_DNA_relative_count_Total']

    temp = add_sample_specific_relative_expression(temp)

    # (output column, RNA numerator column, DNA denominator column)
    remaining_ratios = (
        ('expression_normalized_absolute', 'N_RNA_absolute_count_Total', 'N_DNA_absolute_count_Total'),
        ('S1_normalized_absolute', 'N_RNA_absolute_count_S1', 'N_DNA_absolute_count_S1'),
        ('S2_normalized_absolute', 'N_RNA_absolute_count_S2', 'N_DNA_absolute_count_S2'),
        ('S3_normalized_absolute', 'N_RNA_absolute_count_S3', 'N_DNA_absolute_count_S3'),
        ('S1_absolute', 'RNA_absolute_count_S1', 'DNA_absolute_count_S1'),
        ('S2_absolute', 'RNA_absolute_count_S2', 'DNA_absolute_count_S2'),
        ('S3_absolute', 'RNA_absolute_count_S3', 'DNA_absolute_count_S3'),
        ('S1_relative', 'RNA_relative_count_S1', 'DNA_relative_count_S1'),
        ('S2_relative', 'RNA_relative_count_S2', 'DNA_relative_count_S2'),
        ('S3_relative', 'RNA_relative_count_S3', 'DNA_relative_count_S3'),
    )
    for ratio_column, rna_column, dna_column in remaining_ratios:
        temp[ratio_column] = temp[rna_column] / temp[dna_column]
    return temp


In [33]:
def _barcodes_differing_from_one(fold_matrix, input_fdr, barcodes):
    """Return ``(worse, better)``: barcodes whose fold values differ significantly from 1.

    ``fold_matrix`` has one column per barcode (rows are the control entries).
    Each column gets a one-sample t-test against 1 and the p-values are corrected
    with Benjamini-Hochberg at level ``input_fdr``.  Rejected barcodes with a
    negative t statistic are returned in ``worse``, those with a positive one in
    ``better``; both are lists of labels taken from ``barcodes``.
    """
    if len(barcodes) == 0:
        # No barcode survived the count filter, so there is nothing to test.
        return [], []
    t_test = scipy.stats.ttest_1samp(fold_matrix, 1)
    # Element [0] of multipletests' result is the boolean "reject" mask.
    rejected = STM.multipletests(t_test.pvalue, alpha=input_fdr, method='fdr_bh')[0]
    is_worse = rejected & (t_test.statistic < 0)
    is_better = rejected & (t_test.statistic > 0)
    return list(barcodes[is_worse]), list(barcodes[is_better])


def summarize_expression_RD_median_new(input_df, cutoff_list_DNA,cutoff_list_RNA,percentile_list,input_fdr,
                                        pc_df, nc_df,conversion_factor,trait_of_interest,count_type):
    """Count barcodes expressed significantly above/below the controls for each cutoff setting.

    For every (DNA cutoff, RNA cutoff) pair the barcodes are filtered with
    ``all_present_total_new`` and their ``trait_of_interest`` values are turned
    into fold values against every negative-control entry and, after multiplying
    by ``conversion_factor``, against every positive-control entry.  For every
    value in ``percentile_list`` the positive-control folds are divided by that
    value before testing, i.e. the barcode is compared against that fraction of
    the positive control.  Note that, despite "median" in the name, significance
    is assessed with a one-sample t-test of the fold values against 1, followed
    by Benjamini-Hochberg correction at level ``input_fdr``.

    Parameters
    ----------
    input_df : barcode table; its index labels are what the ``*_list`` columns contain.
    cutoff_list_DNA, cutoff_list_RNA : iterables of cutoffs handed to the filter.
    percentile_list : iterable of positive-control fractions (list or array).
    input_fdr : FDR level for the multiple-testing correction.
    pc_df, nc_df : positive / negative control tables containing ``trait_of_interest``.
    conversion_factor : multiplier applied to the barcode values in the
        positive-control comparison only.
    trait_of_interest : name of the expression column to compare.
    count_type : forwarded to ``all_present_total_new``.

    Returns
    -------
    DataFrame with one row per (DNA cutoff, RNA cutoff, percentile), ordered with
    the DNA cutoff varying slowest and the percentile fastest.  The ratio columns
    are NaN when no barcode passes the filter.
    """
    output_columns = ['DNA_cut_off', 'RNA_cut_off', 'Total_barcode', 'Percentile_of_PC',
                      'barcode_list',
                      'barcode_better_than_pc', 'better_than_pc_ratio', 'barcode_better_than_pc_list',
                      'barcode_better_than_nc', 'better_than_nc_ratio', 'barcode_better_than_nc_list',
                      'barcode_worse_than_pc', 'worse_than_pc_ratio', 'barcode_worse_than_pc_list',
                      'barcode_worse_than_nc', 'worse_than_nc_ratio', 'barcode_worse_than_nc_list']

    # Materialize once so an ndarray or any other iterable behaves like a list.
    percentiles = list(percentile_list)

    # Control values as column vectors: broadcasting them against a row vector of
    # barcode values gives a (number of controls) x (number of barcodes) matrix.
    pc_values = np.asarray(pc_df[trait_of_interest].tolist())[:, np.newaxis]
    nc_values = np.asarray(nc_df[trait_of_interest].tolist())[:, np.newaxis]

    rows = []
    for dna_cutoff in cutoff_list_DNA:
        for rna_cutoff in cutoff_list_RNA:
            test_df = all_present_total_new(input_df, dna_cutoff, rna_cutoff, count_type)
            barcodes = test_df.index
            total = test_df.shape[0]
            # Ratios are undefined when nothing passes the filter; dividing by NaN yields NaN.
            denominator = total if total else float('nan')

            trait_values = np.asarray(test_df[trait_of_interest].tolist())[np.newaxis, :]
            pc_fold_matrix = trait_values * conversion_factor / pc_values
            nc_fold_matrix = trait_values / nc_values

            # The negative-control comparison does not depend on the percentile,
            # so it is computed once and repeated on every percentile row.
            worse_nc, better_nc = _barcodes_differing_from_one(nc_fold_matrix, input_fdr, barcodes)

            for percentile in percentiles:
                # fold / percentile compared with 1 is the same as fold compared with percentile.
                worse_pc, better_pc = _barcodes_differing_from_one(pc_fold_matrix / percentile,
                                                                   input_fdr, barcodes)
                rows.append({
                    'DNA_cut_off': dna_cutoff,
                    'RNA_cut_off': rna_cutoff,
                    'Total_barcode': total,
                    'Percentile_of_PC': percentile,
                    'barcode_list': list(barcodes),
                    'barcode_better_than_pc': len(better_pc),
                    'better_than_pc_ratio': len(better_pc) / denominator,
                    'barcode_better_than_pc_list': better_pc,
                    'barcode_better_than_nc': len(better_nc),
                    'better_than_nc_ratio': len(better_nc) / denominator,
                    'barcode_better_than_nc_list': better_nc,
                    'barcode_worse_than_pc': len(worse_pc),
                    'worse_than_pc_ratio': len(worse_pc) / denominator,
                    'barcode_worse_than_pc_list': worse_pc,
                    'barcode_worse_than_nc': len(worse_nc),
                    'worse_than_nc_ratio': len(worse_nc) / denominator,
                    'barcode_worse_than_nc_list': worse_nc,
                })
    # One record per output row keeps every column aligned by construction.
    return pd.DataFrame(rows, columns=output_columns)

-----

## 1. Input and Output

### 1.1 Input address

In [12]:
# Base directory prefixed to the input and output file names used in this notebook.
dr1='Data/'
# Barcode-to-promoter linkage table (per the original note, it contains no conflicts).
Total_linkage_resolved_address=dr1+'Linkage_barcode_promoter_combined_Summary'
# YPD expression info
# NOTE(review): the YPD inputs carry a _V2 suffix while the SCD inputs do not -- confirm both are the intended versions.
YPD_expression_address=dr1+'YPD_final_sum_up_data_V2.csv'
YPD_pc_address=dr1+'YPD_final_positive_control_data_V2.csv'
YPD_nc_address=dr1+'YPD_final_negative_control_data_V2.csv'
# SCD expression info
SCD_expression_address=dr1+'SCD_final_sum_up_data.csv'
SCD_pc_address=dr1+'SCD_final_positive_control_data.csv'
SCD_nc_address=dr1+'SCD_final_negative_control_data.csv'

-----

## 2 Read data

### 2.1 YPD 

#### 2.1.1 Control expression

In [13]:
# YPD positive- and negative-control tables; the first CSV column is used as the row index.
YPD_expression_pc=pd.read_csv(YPD_pc_address,index_col = 0)
YPD_expression_nc=pd.read_csv(YPD_nc_address,index_col = 0)

#### 2.1.2 Random promoter expression

In [14]:
YPD_expression = pd.read_csv(YPD_expression_address,index_col = 0)

In [15]:
YPD_expression.head()

Unnamed: 0,representative_sequence,barcode_cluster.x,DNA_absolute_count_S1,DNA_relative_count_S1,RNA_absolute_count_S1,RNA_relative_count_S1,N_DNA_absolute_count_S1,N_RNA_absolute_count_S1,N_DNA_relative_count_S1,N_RNA_relative_count_S1,...,N_DNA_relative_count_S3,N_RNA_relative_count_S3,DNA_absolute_count_Total,DNA_relative_count_Total,RNA_absolute_count_Total,RNA_relative_count_Total,N_DNA_absolute_count_Total,N_RNA_absolute_count_Total,N_DNA_relative_count_Total,N_RNA_relative_count_Total
1,AAAAAAAAAAGAGGATAGAG,4810769.0,5,2,0,0,1.178564e-07,0.0,7.400148e-08,0.0,...,0.0,0.0,5,2,0,0,1.178564e-07,0.0,7.400148e-08,0.0
2,AAAAAAAAAAGGCGGGGACT,0.0,0,0,0,0,0.0,0.0,0.0,0.0,...,2.546729e-08,0.0,1,1,0,0,1.579671e-08,0.0,2.546729e-08,0.0
3,AAAAAAAAAGATCTCCCGCG,0.0,0,0,0,0,0.0,0.0,0.0,0.0,...,2.546729e-08,0.0,1,1,0,0,1.579671e-08,0.0,2.546729e-08,0.0
4,AAAAAAAAAGCATCGTTAAT,0.0,0,0,0,0,0.0,0.0,0.0,0.0,...,2.546729e-08,0.0,1,1,0,0,1.579671e-08,0.0,2.546729e-08,0.0
5,AAAAAAAAAGGTTGGGAGCC,0.0,0,0,0,0,0.0,0.0,0.0,0.0,...,1.018692e-07,0.0,8,4,0,0,1.263737e-07,0.0,1.018692e-07,0.0


### 2.2 SCD 

#### 2.2.1 Control expression

In [16]:
# SCD positive- and negative-control tables; the first CSV column is used as the row index.
SCD_expression_pc=pd.read_csv(SCD_pc_address,index_col = 0)
SCD_expression_nc=pd.read_csv(SCD_nc_address,index_col = 0)

#### 2.2.2 Random promoter expression

In [17]:
SCD_expression = pd.read_csv(SCD_expression_address,index_col = 0)

In [18]:
SCD_expression.head()

Unnamed: 0,representative_sequence,barcode_cluster.x,DNA_absolute_count_S1,DNA_relative_count_S1,RNA_absolute_count_S1,RNA_relative_count_S1,N_DNA_absolute_count_S1,N_RNA_absolute_count_S1,N_DNA_relative_count_S1,N_RNA_relative_count_S1,...,N_DNA_relative_count_S3,N_RNA_relative_count_S3,DNA_absolute_count_Total,DNA_relative_count_Total,RNA_absolute_count_Total,RNA_relative_count_Total,N_DNA_absolute_count_Total,N_RNA_absolute_count_Total,N_DNA_relative_count_Total,N_RNA_relative_count_Total
1,AAAAAAAAAAGGAAATCGAG,0.0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,1,0,0,1.438264e-08,0.0,2.026942e-08,0.0
2,AAAAAAAAACGAGTCACGGC,0.0,0,0,0,0,0.0,0.0,0.0,0.0,...,2.264358e-08,0.0,2,1,0,0,3.152629e-08,0.0,2.264358e-08,0.0
3,AAAAAAAAAGATCTTAGTGG,5090135.0,4,4,0,0,4.524322e-08,0.0,6.772024e-08,0.0,...,0.0,0.0,4,4,0,0,4.524322e-08,0.0,6.772024e-08,0.0
4,AAAAAAAAAGGATAGAAGGG,0.0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,7,5,0,0,1.006785e-07,0.0,1.013471e-07,0.0
5,AAAAAAAAAGTCAAGGGCTA,0.0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,2,1,0,0,2.876528e-08,0.0,2.026942e-08,0.0


----

## 3 Filter and preprocessing data

### 3.1 YPD

In [19]:
# Keep the barcodes with an absolute DNA count >= 1 in every replicate; the RNA cutoff of 0
# places no lower bound on non-negative RNA counts.  The result is copied so the columns
# added later are written to an independent frame rather than to a slice of YPD_expression.
YPD_expression_f = all_present_total_new(YPD_expression, 1, 0, 'absolute').copy(deep=True)

In [20]:
# Add the expression-ratio columns to the filtered barcodes and to both control tables.
# Expression_Preprocessing writes the columns onto the frame it receives and returns that same frame.
YPD_expression_f = Expression_Preprocessing(YPD_expression_f)
YPD_expression_pc = Expression_Preprocessing(YPD_expression_pc)
YPD_expression_nc = Expression_Preprocessing(YPD_expression_nc)

### 3.2 SCD

In [21]:
# Keep the barcodes with an absolute DNA count >= 1 in every replicate; the RNA cutoff of 0
# places no lower bound on non-negative RNA counts.  The result is copied so the columns
# added later are written to an independent frame rather than to a slice of SCD_expression.
SCD_expression_f = all_present_total_new(SCD_expression, 1, 0, 'absolute').copy(deep=True)

In [22]:
# Add the expression-ratio columns to the filtered barcodes and to both control tables.
# Expression_Preprocessing writes the columns onto the frame it receives and returns that same frame.
SCD_expression_f = Expression_Preprocessing(SCD_expression_f)
SCD_expression_pc = Expression_Preprocessing(SCD_expression_pc)
SCD_expression_nc = Expression_Preprocessing(SCD_expression_nc)

### 3.3 Output

In [23]:
# YPD Output
# index=False: the row index is not written, only the columns.
YPD_expression_f.to_csv(dr1+"YPD_RD_expression_total.csv", index=False)
YPD_expression_pc.to_csv(dr1 + "YPD_PC_expression.csv", index = False)
YPD_expression_nc.to_csv(dr1 + "YPD_NC_expression.csv", index = False)

In [24]:
# SCD Output
# index=False: the row index is not written, only the columns.
SCD_expression_f.to_csv(dr1+"SCD_RD_expression_total.csv", index=False)
SCD_expression_pc.to_csv(dr1 + "SCD_PC_expression.csv", index = False)
SCD_expression_nc.to_csv(dr1 + "SCD_NC_expression.csv", index = False)

-----

## 4 Add promoter information

In [25]:
# Read the barcode-promoter linkage table: the first line of the file is skipped, no header row
# is parsed, and only the first six columns are kept.
df = pd.read_csv(Total_linkage_resolved_address,skiprows=1,header=None, usecols=range(0,6))
# Name the six columns explicitly; 'representative_sequence' is the key used for the merges below.
df.columns = ['P_Cluster_ID', 'B_Clustser_ID','P_Clustser_seq','representative_sequence','Total_count','Seq_ID']

In [26]:
df.head()

Unnamed: 0,P_Cluster_ID,B_Clustser_ID,P_Clustser_seq,representative_sequence,Total_count,Seq_ID
0,28833,1,ATTACATGTGGAGTTTGCGGAGTGATTGTATAACGGGGAGTCGCGT...,GCTAGCGACTGGCGTCCACA,1,14881599
1,3053,2,TTCTGGCCCACGTATCGAATTTAACCGTAACTATGACTGGATATTT...,TGCATGAGTTGACTGAATGC,1,14881597
2,61262,3,TAAGTGTGGAGCTTTAATGGAGTGGTCGCAGATACTTTTTAGTCCA...,ATCCACGCGTACCGATTGGC,2,14881592:11223841
3,69034,4,GTCAATTCTATTTCTTGATAATCGCGATAATAGGATCGCTAGGTTC...,CCATCGCGGACGGCTCGGCG,1,14881589
4,7892,5,GGATGGATTGTTTTTGTCTCTATCGGTTTTCAAGTGTTCTTGCCGG...,CTCTCGCGGCACTCTTAGGC,1,14881575


### 4.1 YPD

In [27]:
# Add promoter information to the YPD table.
# Inner join on the barcode sequence: only barcodes present in both the linkage table and the
# filtered YPD table are kept.
YPD_promoter_df = pd.merge(df, YPD_expression_f, on = 'representative_sequence', how = 'inner')

### 4.2 SCD

In [28]:
# Add promoter information to the SCD table.
# Inner join on the barcode sequence: only barcodes present in both the linkage table and the
# filtered SCD table are kept.
SCD_promoter_df = pd.merge(df, SCD_expression_f, on = 'representative_sequence', how = 'inner')

### 4.3 Output

In [29]:
YPD_promoter_df.to_csv(dr1+'YPD_promoter_output.csv', index=False)

In [30]:
SCD_promoter_df.to_csv(dr1+'SCD_promoter_output.csv', index=False)

----

## 5 Random promoter expression comparing to different control

### 5.1 YPD

#### 5.1.1 Using different DNA cutoff

In [151]:
YPD_expression_f.head()

Unnamed: 0,representative_sequence,barcode_cluster.x,DNA_absolute_count_S1,DNA_relative_count_S1,RNA_absolute_count_S1,RNA_relative_count_S1,N_DNA_absolute_count_S1,N_RNA_absolute_count_S1,N_DNA_relative_count_S1,N_RNA_relative_count_S1,...,expression_normalized_absolute,S1_normalized_absolute,S2_normalized_absolute,S3_normalized_absolute,S1_absolute,S2_absolute,S3_absolute,S1_relative,S2_relative,S3_relative
10,AAAAAAAAGAGCCCACTGGC,2506774.0,268,200,254,11,6e-06,5.614577e-06,7e-06,1e-05,...,0.409121,0.88879,0.356824,0.112673,0.947761,0.160784,0.025,0.055,0.104712,0.018182
13,AAAAAAAAGCCTGAATAGAA,2290079.0,139,100,236,5,3e-06,5.216694e-06,4e-06,5e-06,...,0.759092,1.592199,0.079974,0.658317,1.697842,0.036036,0.146067,0.05,0.037037,0.066667
93,AAAAAAAGTGCTCAGCGAGT,2581991.0,256399,54967,2558235,40470,0.006044,0.05654885,0.002034,0.037781,...,6.564751,9.356734,5.658183,4.754514,9.977555,2.549568,1.054932,0.73626,1.069734,0.788911
207,AAAAAACCTGGAGACGCGGT,2297132.0,501,358,3,3,1.2e-05,6.63139e-08,1.3e-05,3e-06,...,0.033018,0.005615,0.085059,0.022507,0.005988,0.038328,0.004994,0.00838,0.034043,0.007576
369,AAAAAAGGCAAGCATAGACC,2749051.0,412,287,900,36,1e-05,1.989417e-05,1.1e-05,3.4e-05,...,1.300874,2.048545,1.607902,0.606792,2.184466,0.724518,0.134635,0.125436,0.280142,0.068389


In [152]:
# Index the filtered YPD table by barcode sequence so the summaries below report barcode
# sequences in their *_list columns.  The guard makes the cell safe to re-run: once the column
# has become the index, calling set_index on it again would raise a KeyError.
if 'representative_sequence' in YPD_expression_f.columns:
    YPD_expression_f = YPD_expression_f.set_index('representative_sequence')

In [153]:
# Total normalized relative expression
# Scans eight DNA cutoffs (RNA cutoff fixed at 0) and the positive-control fractions 0.1 .. 1.0
# at an FDR of 0.05, filtering on the absolute counts.
# NOTE(review): 1.906 is the conversion_factor applied in the positive-control comparison; its
# origin is not documented in this notebook -- confirm.
YPD_RD_summary= summarize_expression_RD_median_new(YPD_expression_f,
                                                      [1,5,10,20,50,100,200,400],[0],
                                                      list(np.linspace(1,10,10)/10),0.05,
                                                      YPD_expression_pc,YPD_expression_nc,1.906,'expression_normalized','absolute')

In [154]:
# Per-replicate summaries: same cutoffs, positive-control fractions, FDR and controls as the
# pooled summary, but scored on each replicate's own normalized relative expression.
YPD_RD_summary_S1, YPD_RD_summary_S2, YPD_RD_summary_S3 = [
    summarize_expression_RD_median_new(
        YPD_expression_f,
        [1,5,10,20,50,100,200,400], [0],
        list(np.linspace(1,10,10)/10), 0.05,
        YPD_expression_pc, YPD_expression_nc, 1.906,
        temp_trait, 'absolute')
    for temp_trait in ('S1_normalized_relative', 'S2_normalized_relative', 'S3_normalized_relative')
]

#### 5.1.2 Comparing to different percentile of controls

In [155]:
# 40 evenly spaced values from 0.05 to 2.0 (linspace 0.5 .. 20, divided by 10).  Passed below as
# percentile_list, i.e. the fractions of the positive control each barcode is compared against.
test_range = list(np.linspace(0.5,20,40)/10)

In [156]:

# Finer scan of the positive-control fraction (test_range) at a single DNA cutoff of 100,
# for the pooled expression and for each replicate's normalized relative expression.
(YPD_RD_summary_extra,
 YPD_RD_summary_S1_extra,
 YPD_RD_summary_S2_extra,
 YPD_RD_summary_S3_extra) = [
    summarize_expression_RD_median_new(
        YPD_expression_f,
        [100], [0],
        test_range, 0.05,
        YPD_expression_pc, YPD_expression_nc, 1.906,
        temp_trait, 'absolute')
    for temp_trait in ('expression_normalized', 'S1_normalized_relative',
                       'S2_normalized_relative', 'S3_normalized_relative')
]

### 5.2 SCD

#### 5.2.1 Using different DNA cutoff

In [157]:
# Index the filtered SCD table by barcode sequence so the summaries below report barcode
# sequences in their *_list columns.  The guard makes the cell safe to re-run: once the column
# has become the index, calling set_index on it again would raise a KeyError.
if 'representative_sequence' in SCD_expression_f.columns:
    SCD_expression_f = SCD_expression_f.set_index('representative_sequence')

In [158]:
# Total normalized relative expression
# Scans eight DNA cutoffs (RNA cutoff fixed at 0) and the positive-control fractions 0.1 .. 1.0
# at an FDR of 0.05, filtering on the absolute counts.
# NOTE(review): 0.782 is the conversion_factor applied in the positive-control comparison; its
# origin is not documented in this notebook -- confirm.
SCD_RD_summary= summarize_expression_RD_median_new(SCD_expression_f,
                                                      [1,5,10,20,50,100,200,400],[0],
                                                      list(np.linspace(1,10,10)/10),0.05,
                                                      SCD_expression_pc,SCD_expression_nc,0.782,'expression_normalized','absolute')

In [159]:
# Per-replicate summaries: same cutoffs, positive-control fractions, FDR and controls as the
# pooled summary, but scored on each replicate's own normalized relative expression.
SCD_RD_summary_S1, SCD_RD_summary_S2, SCD_RD_summary_S3 = [
    summarize_expression_RD_median_new(
        SCD_expression_f,
        [1,5,10,20,50,100,200,400], [0],
        list(np.linspace(1,10,10)/10), 0.05,
        SCD_expression_pc, SCD_expression_nc, 0.782,
        temp_trait, 'absolute')
    for temp_trait in ('S1_normalized_relative', 'S2_normalized_relative', 'S3_normalized_relative')
]

#### 5.2.2 Comparing to different percentile of controls

In [160]:
# 40 evenly spaced values from 0.05 to 2.0 (linspace 0.5 .. 20, divided by 10).  Passed below as
# percentile_list, i.e. the fractions of the positive control each barcode is compared against.
test_range = list(np.linspace(0.5,20,40)/10)

In [161]:

# Finer scan of the positive-control fraction (test_range) at a single DNA cutoff of 100,
# for the pooled expression and for each replicate's normalized relative expression.
(SCD_RD_summary_extra,
 SCD_RD_summary_S1_extra,
 SCD_RD_summary_S2_extra,
 SCD_RD_summary_S3_extra) = [
    summarize_expression_RD_median_new(
        SCD_expression_f,
        [100], [0],
        test_range, 0.05,
        SCD_expression_pc, SCD_expression_nc, 0.782,
        temp_trait, 'absolute')
    for temp_trait in ('expression_normalized', 'S1_normalized_relative',
                       'S2_normalized_relative', 'S3_normalized_relative')
]

### 5.3 Output

In [162]:
# Pooled-expression summary across DNA cutoffs.
# NOTE(review): unlike the S1-S3 outputs below, the barcode-list columns are not dropped here,
# so each CSV cell in those columns holds a whole list -- confirm this is intentional.
YPD_RD_summary.to_csv(
    dr1 + 'YPD_RD_summary_median_total_dna_cutoff_extra.csv',index = False)

In [43]:
# S1 summary across DNA cutoffs; the five barcode-list columns are dropped before writing.
YPD_RD_summary_S1.drop(columns =['barcode_list','barcode_better_than_pc_list','barcode_better_than_nc_list'
                                 ,'barcode_worse_than_pc_list','barcode_worse_than_nc_list']).to_csv(
    dr1 + 'YPD_RD_summary_median_S1_dna_cutoff_extra.csv',index = False)

In [44]:
# S2 summary across DNA cutoffs; the five barcode-list columns are dropped before writing.
YPD_RD_summary_S2.drop(columns =['barcode_list','barcode_better_than_pc_list','barcode_better_than_nc_list'
                                 ,'barcode_worse_than_pc_list','barcode_worse_than_nc_list']).to_csv(
    dr1 + 'YPD_RD_summary_median_S2_dna_cutoff_extra.csv',index = False)

In [45]:
# S3 summary across DNA cutoffs; the five barcode-list columns are dropped before writing.
YPD_RD_summary_S3.drop(columns =['barcode_list','barcode_better_than_pc_list','barcode_better_than_nc_list'
                                 ,'barcode_worse_than_pc_list','barcode_worse_than_nc_list']).to_csv(
    dr1 + 'YPD_RD_summary_median_S3_dna_cutoff_extra.csv',index = False)

In [163]:
YPD_RD_summary_extra.to_csv(dr1 + 'YPD_RD_summary_median_total_extra.csv',index = False)

In [47]:
# Write the three per-replicate YPD percentile-scan summaries without their barcode-list columns.
temp_list_columns = ['barcode_list','barcode_better_than_pc_list','barcode_better_than_nc_list',
                     'barcode_worse_than_pc_list','barcode_worse_than_nc_list']
for temp_summary, temp_file_name in ((YPD_RD_summary_S1_extra, 'YPD_RD_summary_median_S1_extra.csv'),
                                     (YPD_RD_summary_S2_extra, 'YPD_RD_summary_median_S2_extra.csv'),
                                     (YPD_RD_summary_S3_extra, 'YPD_RD_summary_median_S3_extra.csv')):
    temp_summary.drop(columns = temp_list_columns).to_csv(dr1 + temp_file_name, index = False)

In [164]:
# Pooled-expression summary across DNA cutoffs.
# NOTE(review): unlike the S1-S3 outputs below, the barcode-list columns are not dropped here,
# so each CSV cell in those columns holds a whole list -- confirm this is intentional.
SCD_RD_summary.to_csv(
    dr1 + 'SCD_RD_summary_median_total_dna_cutoff_extra.csv',index = False)

In [130]:
# S1 summary across DNA cutoffs; the five barcode-list columns are dropped before writing.
SCD_RD_summary_S1.drop(columns =['barcode_list','barcode_better_than_pc_list','barcode_better_than_nc_list'
                                 ,'barcode_worse_than_pc_list','barcode_worse_than_nc_list']).to_csv(
    dr1 + 'SCD_RD_summary_median_S1_dna_cutoff_extra.csv',index = False)

In [131]:
# S2 summary across DNA cutoffs; the five barcode-list columns are dropped before writing.
SCD_RD_summary_S2.drop(columns =['barcode_list','barcode_better_than_pc_list','barcode_better_than_nc_list'
                                 ,'barcode_worse_than_pc_list','barcode_worse_than_nc_list']).to_csv(
    dr1 + 'SCD_RD_summary_median_S2_dna_cutoff_extra.csv',index = False)

In [132]:
# S3 summary across DNA cutoffs; the five barcode-list columns are dropped before writing.
SCD_RD_summary_S3.drop(columns =['barcode_list','barcode_better_than_pc_list','barcode_better_than_nc_list'
                                 ,'barcode_worse_than_pc_list','barcode_worse_than_nc_list']).to_csv(
    dr1 + 'SCD_RD_summary_median_S3_dna_cutoff_extra.csv',index = False)

In [165]:
SCD_RD_summary_extra.to_csv(dr1 + 'SCD_RD_summary_median_total_extra.csv',index = False)

In [133]:
# Write the three per-replicate SCD percentile-scan summaries without their barcode-list columns.
temp_list_columns = ['barcode_list','barcode_better_than_pc_list','barcode_better_than_nc_list',
                     'barcode_worse_than_pc_list','barcode_worse_than_nc_list']
for temp_summary, temp_file_name in ((SCD_RD_summary_S1_extra, 'SCD_RD_summary_median_S1_extra.csv'),
                                     (SCD_RD_summary_S2_extra, 'SCD_RD_summary_median_S2_extra.csv'),
                                     (SCD_RD_summary_S3_extra, 'SCD_RD_summary_median_S3_extra.csv')):
    temp_summary.drop(columns = temp_list_columns).to_csv(dr1 + temp_file_name, index = False)

## 6 Shared promoters across method

### 6.1 YPD

In [59]:
temp_input_0 = YPD_RD_summary
temp_input_1 = YPD_RD_summary_S1
temp_input_2 = YPD_RD_summary_S2
temp_input_3 = YPD_RD_summary_S3

In [79]:
temp_out_ad1 = dr1 + 'ACRF_random_promoter_summary_YPD.csv'
temp_out_ad2 = dr1 + 'ACRF_random_promoter_shared_more_than_pc_YPD.csv'
temp_out_ad3 = dr1 + 'ACRF_random_promoter_shared_more_than_nc_YPD.csv'

In [61]:
# Stack the four summaries into one long table, tagging every row with the method that
# produced it.  Each summary is deep-copied first so the 'Method' column is not added to the
# original summary frames.
temp_s_df = None
for temp_method, temp_source in (('Total_expression', temp_input_0),
                                 ('S1', temp_input_1),
                                 ('S2', temp_input_2),
                                 ('S3', temp_input_3)):
    temp = copy.deepcopy(temp_source)
    temp['Method'] = [temp_method]*temp.shape[0]
    if temp_s_df is None:
        temp_s_df = temp
    else:
        temp_s_df = pd.concat([temp_s_df.reset_index(drop = True),temp.reset_index(drop = True)])

In [62]:
temp_s_df = temp_s_df.reset_index(drop = True)

In [63]:
temp_traits2= ['Method','DNA_cut_off','Total_barcode',
               'better_than_pc_ratio','better_than_nc_ratio',
              'worse_than_pc_ratio','worse_than_nc_ratio',
               'barcode_better_than_pc','barcode_better_than_nc',
              'barcode_worse_than_pc','barcode_worse_than_nc']

In [64]:
# Select, for every method, the row at positive-control fraction 1.0, DNA cutoff 100 and
# RNA cutoff 0, then display only the count and ratio columns listed in temp_traits2.
temp_percentile_cut = [1.0]
temp_cut_DNA = [100]
temp_cut_RNA = [0]
temp_YPD_df = temp_s_df
temp_o = temp_YPD_df[(temp_YPD_df['Percentile_of_PC'].isin(temp_percentile_cut))&(temp_YPD_df['RNA_cut_off'].isin(temp_cut_RNA))&(temp_YPD_df['DNA_cut_off'].isin(temp_cut_DNA))]
temp_o[temp_traits2]

Unnamed: 0,Method,DNA_cut_off,Total_barcode,better_than_pc_ratio,better_than_nc_ratio,worse_than_pc_ratio,worse_than_nc_ratio,barcode_better_than_pc,barcode_better_than_nc,barcode_worse_than_pc,barcode_worse_than_nc
59,Total_expression,100,49169,0.000244,0.632065,0.999349,0.25329,12,31078,49137,12454
139,S1,100,49169,0.000488,0.490878,0.997905,0.430739,24,24136,49066,21179
219,S2,100,49169,0.001912,0.621082,0.996197,0.313734,94,30538,48982,15426
299,S3,100,49169,0.000346,0.510403,0.999471,0.359088,17,25096,49143,17656


* <font size="5" color = 'Red'> The per-replicate (S1, S2, S3) results are similar to the total expression results</font>

#### 6.1.1 Shared fraction for PC

In [73]:
# compare to positive control

In [74]:
# Setting to compare: positive-control fraction 1.0, DNA cutoff 100, RNA cutoff 0.
temp_percentile_cut = 1.0
temp_cut_DNA = 100
temp_cut_RNA = 0
# Column holding the barcodes called significantly better than the positive control.
temp_i = 'barcode_better_than_pc_list'

# Pick the row for that setting from the Total, S1, S2 and S3 summaries (in this order).
temp_rows = []
for temp_YPD_df in (temp_input_0, temp_input_1, temp_input_2, temp_input_3):
    temp_is_setting = ((temp_YPD_df['Percentile_of_PC']==temp_percentile_cut)
                       &(temp_YPD_df['RNA_cut_off']==temp_cut_RNA)
                       &(temp_YPD_df['DNA_cut_off']==temp_cut_DNA))
    temp_rows.append(temp_YPD_df[temp_is_setting])
t0, t1, t2, t3 = temp_rows

# De-duplicated barcode lists: Total method first, then each replicate.
temp_total_bc, temp_S1_bc, temp_S2_bc, temp_S3_bc = [
    list(set(temp_row[temp_i].values[0])) for temp_row in temp_rows
]

# Barcodes each replicate shares with the Total method.
temp_S1_bc_o, temp_S2_bc_o, temp_S3_bc_o = [
    set(temp_total_bc)&set(temp_bc) for temp_bc in (temp_S1_bc, temp_S2_bc, temp_S3_bc)
]

# The original cell also carried commented-out print() diagnostics of these set sizes and
# overlap fractions; the same quantities are tabulated in temp_shared_fraction_df_pc.

In [75]:
# One row per single-sample method (S1-S3). The size of the total-method list and
# the three-way overlap are identical for every row, so they are repeated 3 times.
temp_method_bc_lists = [temp_S1_bc, temp_S2_bc, temp_S3_bc]
temp_method_overlap_sets = [temp_S1_bc_o, temp_S2_bc_o, temp_S3_bc_o]
temp_n_shared_by_all = len(set(temp_S1_bc) & set(temp_S2_bc) & set(temp_S3_bc))

temp_shared_fraction_df_pc = pd.DataFrame({
    'Method': ['S1', 'S2', 'S3'],  # the method used to generate the summary
    'Number_of_total_barcode': [len(temp_total_bc)] * 3,
    'Number_of_barcode': [len(temp_bc) for temp_bc in temp_method_bc_lists],
    'Number_of_barcode_overlap_with_Total_method': [len(temp_bc_o) for temp_bc_o in temp_method_overlap_sets],
    'Number_of_overlap_among_IndividualMethod': [temp_n_shared_by_all] * 3,
})


In [76]:
# Count columns the derived fractions are built from
temp_n_method = temp_shared_fraction_df_pc['Number_of_barcode']
temp_n_overlap_total = temp_shared_fraction_df_pc['Number_of_barcode_overlap_with_Total_method']
temp_n_overlap_methods = temp_shared_fraction_df_pc['Number_of_overlap_among_IndividualMethod']
# The total-method count is the same in every row, so the first value is used as a scalar
temp_n_total = temp_shared_fraction_df_pc['Number_of_total_barcode'].values[0]

# Share of each method's barcodes that the total method also reports
temp_shared_fraction_df_pc['Fraction_of_overlap_with_TotalMethod'] = temp_n_overlap_total / temp_n_method
# Share of the total method's barcodes that each method also reports
temp_shared_fraction_df_pc['Fraction_TotalMethod_overlap_with'] = temp_n_overlap_total / temp_n_total
# Share of each method's barcodes reported by all three single-sample methods
temp_shared_fraction_df_pc['Fraction_of_overlap_among_IndividualMethod'] = temp_n_overlap_methods / temp_n_method
# Barcodes reported by the method but not by the total method
temp_shared_fraction_df_pc['Number_of_barcode_unique'] = temp_n_method - temp_n_overlap_total

##### Output

In [77]:
# Write the positive-control overlap summary to temp_out_ad2, without the row index
temp_shared_fraction_df_pc.to_csv(path_or_buf=temp_out_ad2, index=False)

#### 6.1.2 Shared fraction for NC

In [195]:
# compare to negative control

In [80]:
# Cut-offs that identify which summary row is compared across methods
temp_percentile_cut = 1.0
temp_cut_DNA = 100
temp_cut_RNA = 0
# Column holding the barcode list to compare (negative-control comparison, per its name)
temp_i = 'barcode_better_than_nc_list'

# From each summary table (total method first, then S1-S3) keep the rows matching the cut-offs
temp_selected_rows = []
for temp_YPD_df in (temp_input_0, temp_input_1, temp_input_2, temp_input_3):
    temp_row_mask = (
        (temp_YPD_df['Percentile_of_PC'] == temp_percentile_cut)
        & (temp_YPD_df['RNA_cut_off'] == temp_cut_RNA)
        & (temp_YPD_df['DNA_cut_off'] == temp_cut_DNA)
    )
    temp_selected_rows.append(temp_YPD_df[temp_row_mask])
t0, t1, t2, t3 = temp_selected_rows

# De-duplicated barcode list taken from the first matching row of each table
# (position 0 is the total method, positions 1-3 are S1-S3)
temp_total_bc, temp_S1_bc, temp_S2_bc, temp_S3_bc = [
    list(set(temp_rows[temp_i].values[0])) for temp_rows in temp_selected_rows
]

# Barcodes that each single-sample method shares with the total method
temp_total_bc_set = set(temp_total_bc)
temp_S1_bc_o = temp_total_bc_set & set(temp_S1_bc)
temp_S2_bc_o = temp_total_bc_set & set(temp_S2_bc)
temp_S3_bc_o = temp_total_bc_set & set(temp_S3_bc)

# Optional diagnostics (kept disabled): relative to t3['Total_barcode'].values[0],
# the union of the S1-S3 lists gives the "at most" fraction and their three-way
# intersection gives the "at least" fraction.

In [81]:
# One row per single-sample method (S1-S3). The size of the total-method list and
# the three-way overlap are identical for every row, so they are repeated 3 times.
temp_method_bc_lists = [temp_S1_bc, temp_S2_bc, temp_S3_bc]
temp_method_overlap_sets = [temp_S1_bc_o, temp_S2_bc_o, temp_S3_bc_o]
temp_n_shared_by_all = len(set(temp_S1_bc) & set(temp_S2_bc) & set(temp_S3_bc))

temp_shared_fraction_df_nc = pd.DataFrame({
    'Method': ['S1', 'S2', 'S3'],  # the method used to generate the summary
    'Number_of_total_barcode': [len(temp_total_bc)] * 3,
    'Number_of_barcode': [len(temp_bc) for temp_bc in temp_method_bc_lists],
    'Number_of_barcode_overlap_with_Total_method': [len(temp_bc_o) for temp_bc_o in temp_method_overlap_sets],
    'Number_of_overlap_among_IndividualMethod': [temp_n_shared_by_all] * 3,
})


In [82]:
# Derived columns for the negative-control overlap table.
# Share of each method's barcodes that the total method also reports
temp_shared_fraction_df_nc['Fraction_of_overlap_with_TotalMethod'] = temp_shared_fraction_df_nc['Number_of_barcode_overlap_with_Total_method']/temp_shared_fraction_df_nc['Number_of_barcode']
# Share of the total method's barcodes that each method also reports.
# This ratio used to be computed and then discarded (no assignment), so the column
# was missing from the NC table; it is now stored under the same name the PC table uses.
temp_shared_fraction_df_nc['Fraction_TotalMethod_overlap_with'] = temp_shared_fraction_df_nc['Number_of_barcode_overlap_with_Total_method']/temp_shared_fraction_df_nc['Number_of_total_barcode'].values[0]
# Share of each method's barcodes reported by all three single-sample methods
temp_shared_fraction_df_nc['Fraction_of_overlap_among_IndividualMethod'] = temp_shared_fraction_df_nc['Number_of_overlap_among_IndividualMethod']/temp_shared_fraction_df_nc['Number_of_barcode']
# Barcodes reported by the method but not by the total method
temp_shared_fraction_df_nc['Number_of_barcode_unique'] = temp_shared_fraction_df_nc['Number_of_barcode'] - temp_shared_fraction_df_nc['Number_of_barcode_overlap_with_Total_method']

##### Output

In [83]:
# Write the negative-control overlap summary to temp_out_ad3, without the row index
temp_shared_fraction_df_nc.to_csv(path_or_buf=temp_out_ad3, index=False)

### 6.2 SCD

In [134]:
# Point the generic input names at the SCD summary tables:
# index 0 is the overall summary, 1-3 are the S1-S3 summaries.
temp_input_0, temp_input_1, temp_input_2, temp_input_3 = (
    SCD_RD_summary,
    SCD_RD_summary_S1,
    SCD_RD_summary_S2,
    SCD_RD_summary_S3,
)

In [135]:
# Output locations for the SCD results, all placed under the dr1 prefix
temp_out_ad1, temp_out_ad2, temp_out_ad3 = [
    dr1 + temp_file_name
    for temp_file_name in (
        'ACRF_random_promoter_summary_SCD.csv',
        'ACRF_random_promoter_shared_more_than_pc_SCD.csv',
        'ACRF_random_promoter_shared_more_than_nc_SCD.csv',
    )
]

In [136]:
# Stack the four summary tables into one frame, tagging every row with the method
# it came from. Deep copies keep the 'Method' column out of the source tables.
temp_s_df = copy.deepcopy(temp_input_0)
temp_s_df['Method'] = ['Total_expression'] * temp_s_df.shape[0]

for temp_label, temp_source in (('S1', temp_input_1),
                                ('S2', temp_input_2),
                                ('S3', temp_input_3)):
    temp = copy.deepcopy(temp_source)
    temp['Method'] = [temp_label] * temp.shape[0]
    # Both pieces get a fresh 0..n-1 index before being appended; the concatenated
    # frame itself is not re-indexed here.
    temp_s_df = pd.concat([temp_s_df.reset_index(drop=True),
                           temp.reset_index(drop=True)])

In [137]:
# Give the stacked frame a single continuous 0..n-1 index (old index discarded)
temp_s_df = temp_s_df.reset_index(drop=True)

In [138]:
# Columns shown in the method-comparison table, in display order
temp_traits2 = [
    # identification
    'Method',
    'DNA_cut_off',
    'Total_barcode',
    # ratios relative to the positive (pc) and negative (nc) controls
    'better_than_pc_ratio',
    'better_than_nc_ratio',
    'worse_than_pc_ratio',
    'worse_than_nc_ratio',
    # corresponding barcode columns
    'barcode_better_than_pc',
    'barcode_better_than_nc',
    'barcode_worse_than_pc',
    'barcode_worse_than_nc',
]

In [139]:
# Accepted values for each cut-off column (lists, because rows are matched with isin)
temp_percentile_cut = [1.0]
temp_cut_DNA = [100]
temp_cut_RNA = [0]
temp_SCD_df = temp_s_df

# Keep the rows whose three cut-off columns all fall within the accepted values
temp_keep_mask = (
    temp_SCD_df['Percentile_of_PC'].isin(temp_percentile_cut)
    & temp_SCD_df['RNA_cut_off'].isin(temp_cut_RNA)
    & temp_SCD_df['DNA_cut_off'].isin(temp_cut_DNA)
)
temp_o = temp_SCD_df[temp_keep_mask]
# Display only the comparison columns
temp_o[temp_traits2]

Unnamed: 0,Method,DNA_cut_off,Total_barcode,better_than_pc_ratio,better_than_nc_ratio,worse_than_pc_ratio,worse_than_nc_ratio,barcode_better_than_pc,barcode_better_than_nc,barcode_worse_than_pc,barcode_worse_than_nc
59,Total_expression,100,146291,0.000294,0.41345,0.999556,0.43017,43,60484,146226,62930
139,S1,100,146291,0.000465,0.337246,0.999125,0.581546,68,49336,146163,85075
219,S2,100,146291,0.000561,0.330512,0.998899,0.608445,82,48351,146130,89010
299,S3,100,146291,0.000492,0.355087,0.999077,0.600386,72,51946,146156,87831


* <font size="5" color = 'Red'> The per-sample results (S1, S2, S3) are similar to the total-expression results.</font>

#### 6.2.1 Shared fraction for PC

In [140]:
# compare to positive control

In [141]:
# Cut-offs that identify which summary row is compared across methods
temp_percentile_cut = 1.0
temp_cut_DNA = 100
temp_cut_RNA = 0
# Column holding the barcode list to compare (positive-control comparison, per its name)
temp_i = 'barcode_better_than_pc_list'

# From each summary table (total method first, then S1-S3) keep the rows matching the cut-offs
temp_selected_rows = []
for temp_SCD_df in (temp_input_0, temp_input_1, temp_input_2, temp_input_3):
    temp_row_mask = (
        (temp_SCD_df['Percentile_of_PC'] == temp_percentile_cut)
        & (temp_SCD_df['RNA_cut_off'] == temp_cut_RNA)
        & (temp_SCD_df['DNA_cut_off'] == temp_cut_DNA)
    )
    temp_selected_rows.append(temp_SCD_df[temp_row_mask])
t0, t1, t2, t3 = temp_selected_rows

# De-duplicated barcode list taken from the first matching row of each table
# (position 0 is the total method, positions 1-3 are S1-S3)
temp_total_bc, temp_S1_bc, temp_S2_bc, temp_S3_bc = [
    list(set(temp_rows[temp_i].values[0])) for temp_rows in temp_selected_rows
]

# Barcodes that each single-sample method shares with the total method
temp_total_bc_set = set(temp_total_bc)
temp_S1_bc_o = temp_total_bc_set & set(temp_S1_bc)
temp_S2_bc_o = temp_total_bc_set & set(temp_S2_bc)
temp_S3_bc_o = temp_total_bc_set & set(temp_S3_bc)

# Optional diagnostics (kept disabled): relative to t3['Total_barcode'].values[0],
# the union of the S1-S3 lists gives the "at most" fraction and their three-way
# intersection gives the "at least" fraction.

In [142]:
# One row per single-sample method (S1-S3). The size of the total-method list and
# the three-way overlap are identical for every row, so they are repeated 3 times.
temp_method_bc_lists = [temp_S1_bc, temp_S2_bc, temp_S3_bc]
temp_method_overlap_sets = [temp_S1_bc_o, temp_S2_bc_o, temp_S3_bc_o]
temp_n_shared_by_all = len(set(temp_S1_bc) & set(temp_S2_bc) & set(temp_S3_bc))

temp_shared_fraction_df_pc = pd.DataFrame({
    'Method': ['S1', 'S2', 'S3'],  # the method used to generate the summary
    'Number_of_total_barcode': [len(temp_total_bc)] * 3,
    'Number_of_barcode': [len(temp_bc) for temp_bc in temp_method_bc_lists],
    'Number_of_barcode_overlap_with_Total_method': [len(temp_bc_o) for temp_bc_o in temp_method_overlap_sets],
    'Number_of_overlap_among_IndividualMethod': [temp_n_shared_by_all] * 3,
})


In [143]:
# Count columns the derived fractions are built from
temp_n_method = temp_shared_fraction_df_pc['Number_of_barcode']
temp_n_overlap_total = temp_shared_fraction_df_pc['Number_of_barcode_overlap_with_Total_method']
temp_n_overlap_methods = temp_shared_fraction_df_pc['Number_of_overlap_among_IndividualMethod']
# The total-method count is the same in every row, so the first value is used as a scalar
temp_n_total = temp_shared_fraction_df_pc['Number_of_total_barcode'].values[0]

# Share of each method's barcodes that the total method also reports
temp_shared_fraction_df_pc['Fraction_of_overlap_with_TotalMethod'] = temp_n_overlap_total / temp_n_method
# Share of the total method's barcodes that each method also reports
temp_shared_fraction_df_pc['Fraction_TotalMethod_overlap_with'] = temp_n_overlap_total / temp_n_total
# Share of each method's barcodes reported by all three single-sample methods
temp_shared_fraction_df_pc['Fraction_of_overlap_among_IndividualMethod'] = temp_n_overlap_methods / temp_n_method
# Barcodes reported by the method but not by the total method
temp_shared_fraction_df_pc['Number_of_barcode_unique'] = temp_n_method - temp_n_overlap_total

##### Output

In [144]:
# Write the positive-control overlap summary to temp_out_ad2, without the row index
temp_shared_fraction_df_pc.to_csv(path_or_buf=temp_out_ad2, index=False)

#### 6.2.2 Shared fraction for NC

In [145]:
# compare to negative control

In [146]:
# Cut-offs that identify which summary row is compared across methods
temp_percentile_cut = 1.0
temp_cut_DNA = 100
temp_cut_RNA = 0
# Column holding the barcode list to compare (negative-control comparison, per its name)
temp_i = 'barcode_better_than_nc_list'

# From each summary table (total method first, then S1-S3) keep the rows matching the cut-offs
temp_selected_rows = []
for temp_SCD_df in (temp_input_0, temp_input_1, temp_input_2, temp_input_3):
    temp_row_mask = (
        (temp_SCD_df['Percentile_of_PC'] == temp_percentile_cut)
        & (temp_SCD_df['RNA_cut_off'] == temp_cut_RNA)
        & (temp_SCD_df['DNA_cut_off'] == temp_cut_DNA)
    )
    temp_selected_rows.append(temp_SCD_df[temp_row_mask])
t0, t1, t2, t3 = temp_selected_rows

# De-duplicated barcode list taken from the first matching row of each table
# (position 0 is the total method, positions 1-3 are S1-S3)
temp_total_bc, temp_S1_bc, temp_S2_bc, temp_S3_bc = [
    list(set(temp_rows[temp_i].values[0])) for temp_rows in temp_selected_rows
]

# Barcodes that each single-sample method shares with the total method
temp_total_bc_set = set(temp_total_bc)
temp_S1_bc_o = temp_total_bc_set & set(temp_S1_bc)
temp_S2_bc_o = temp_total_bc_set & set(temp_S2_bc)
temp_S3_bc_o = temp_total_bc_set & set(temp_S3_bc)

# Optional diagnostics (kept disabled): relative to t3['Total_barcode'].values[0],
# the union of the S1-S3 lists gives the "at most" fraction and their three-way
# intersection gives the "at least" fraction.

In [147]:
# One row per single-sample method (S1-S3). The size of the total-method list and
# the three-way overlap are identical for every row, so they are repeated 3 times.
temp_method_bc_lists = [temp_S1_bc, temp_S2_bc, temp_S3_bc]
temp_method_overlap_sets = [temp_S1_bc_o, temp_S2_bc_o, temp_S3_bc_o]
temp_n_shared_by_all = len(set(temp_S1_bc) & set(temp_S2_bc) & set(temp_S3_bc))

temp_shared_fraction_df_nc = pd.DataFrame({
    'Method': ['S1', 'S2', 'S3'],  # the method used to generate the summary
    'Number_of_total_barcode': [len(temp_total_bc)] * 3,
    'Number_of_barcode': [len(temp_bc) for temp_bc in temp_method_bc_lists],
    'Number_of_barcode_overlap_with_Total_method': [len(temp_bc_o) for temp_bc_o in temp_method_overlap_sets],
    'Number_of_overlap_among_IndividualMethod': [temp_n_shared_by_all] * 3,
})


In [148]:
# Derived columns for the negative-control overlap table.
# Share of each method's barcodes that the total method also reports
temp_shared_fraction_df_nc['Fraction_of_overlap_with_TotalMethod'] = temp_shared_fraction_df_nc['Number_of_barcode_overlap_with_Total_method']/temp_shared_fraction_df_nc['Number_of_barcode']
# Share of the total method's barcodes that each method also reports.
# This ratio used to be computed and then discarded (no assignment), so the column
# was missing from the NC table; it is now stored under the same name the PC table uses.
temp_shared_fraction_df_nc['Fraction_TotalMethod_overlap_with'] = temp_shared_fraction_df_nc['Number_of_barcode_overlap_with_Total_method']/temp_shared_fraction_df_nc['Number_of_total_barcode'].values[0]
# Share of each method's barcodes reported by all three single-sample methods
temp_shared_fraction_df_nc['Fraction_of_overlap_among_IndividualMethod'] = temp_shared_fraction_df_nc['Number_of_overlap_among_IndividualMethod']/temp_shared_fraction_df_nc['Number_of_barcode']
# Barcodes reported by the method but not by the total method
temp_shared_fraction_df_nc['Number_of_barcode_unique'] = temp_shared_fraction_df_nc['Number_of_barcode'] - temp_shared_fraction_df_nc['Number_of_barcode_overlap_with_Total_method']

##### Output

In [149]:
# Write the negative-control overlap summary to temp_out_ad3, without the row index
temp_shared_fraction_df_nc.to_csv(path_or_buf=temp_out_ad3, index=False)