In [1]:
# This notebook filters the barcode count data and adds promoter information for SCD

## 1 Functions and module

### 1.1 Modules

In [1]:
import pandas as pd
import numpy as np
import math
import copy
import statistics as ST

### 1.2 Functions

In [2]:
def Read_Random_Promoter_with_DNA_only(input_address):
    """Load a random-promoter table that has DNA reads only.

    Parameters
    ----------
    input_address : str or file-like
        Anything ``pandas.read_csv`` accepts; the file is read with the
        default comma separator and all of its columns are kept.

    Returns
    -------
    pandas.DataFrame
        The CSV content plus the columns ``RNA_absolute_count`` and
        ``RNA_relative_count``, both filled with 0.
    """
    dna_only = pd.read_csv(input_address)
    # No RNA reads exist for these rows, so both RNA count columns are set to 0.
    for rna_column in ('RNA_absolute_count', 'RNA_relative_count'):
        dna_only[rna_column] = 0
    return dna_only

In [3]:
def Read_Random_Promoter_with_Both(input_address):
    """Load a random-promoter table that has both DNA and RNA reads.

    Parameters
    ----------
    input_address : str or file-like
        Anything ``pandas.read_csv`` accepts (comma-separated).

    Returns
    -------
    pandas.DataFrame
        Only the barcode identifiers and the four DNA/RNA count columns
        listed below; any other column in the file is ignored.
    """
    wanted_columns = [
        'barcode_cluster', 'representative_sequence',
        'DNA_absolute_count', 'DNA_relative_count',
        'RNA_absolute_count', 'RNA_relative_count',
    ]
    return pd.read_csv(input_address, usecols=wanted_columns)

In [4]:
def Calculate_Total_Count(input_address, input_p):
    """Return the sum of one column of a semicolon-separated table.

    Parameters
    ----------
    input_address : str or file-like
        Source passed to ``pandas.read_csv`` with ``sep=';'``.
    input_p : str
        Name of the column to add up.

    Returns
    -------
    scalar
        ``sum()`` of the selected column.
    """
    count_table = pd.read_csv(input_address, sep=';')
    selected_counts = count_table[input_p]
    return selected_counts.sum()

In [5]:
def all_present_total_new(input_df,cutoff_DNA,cutoff_RNA,count_type):
    """Keep the rows whose counts reach the cutoffs in all three samples.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain ``{DNA,RNA}_{absolute,relative}_count_{S1,S2,S3}``
        columns for the chosen count type.
    cutoff_DNA, cutoff_RNA : number
        Inclusive lower bounds (``>=``) applied to every DNA / RNA column.
    count_type : str
        ``'absolute'`` selects the absolute-count columns; any other value
        selects the relative-count columns.

    Returns
    -------
    pandas.DataFrame
        The subset of ``input_df`` passing all six comparisons.
    """
    # Anything that is not exactly 'absolute' falls back to relative counts.
    kind = 'absolute' if count_type == 'absolute' else 'relative'

    keep = None
    for molecule, cutoff in (('DNA', cutoff_DNA), ('RNA', cutoff_RNA)):
        for sample in ('S1', 'S2', 'S3'):
            column = '{}_{}_count_{}'.format(molecule, kind, sample)
            passed = input_df[column] >= cutoff
            keep = passed if keep is None else keep & passed
    return input_df[keep]

In [6]:
def add_sample_specific_relative_expression(input_df):
    """Add per-sample normalized relative expression and its summary statistics.

    For each sample S1, S2 and S3 the expression is
    ``N_RNA_relative_count_<S> / N_DNA_relative_count_<S>``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain ``N_RNA_relative_count_S{1,2,3}`` and
        ``N_DNA_relative_count_S{1,2,3}``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``input_df`` with these columns appended, in this order:
        ``S1_normalized_relative``, ``S2_normalized_relative``,
        ``S3_normalized_relative``, ``Mean_normalized_relative``,
        ``SD_normalized_relative`` (sample standard deviation, n-1
        denominator, across the three samples) and
        ``CV_normalized_relative`` (SD / mean).
    """
    input_data_frame = input_df.copy()

    ratio_columns = []
    for sample in ('S1', 'S2', 'S3'):
        ratio_name = sample + '_normalized_relative'
        input_data_frame[ratio_name] = (
            input_data_frame['N_RNA_relative_count_' + sample]
            / input_data_frame['N_DNA_relative_count_' + sample]
        )
        ratio_columns.append(ratio_name)

    input_data_frame['Mean_normalized_relative'] = (
        input_data_frame['S1_normalized_relative']
        + input_data_frame['S2_normalized_relative']
        + input_data_frame['S3_normalized_relative']
    ) / 3

    # Standard deviation of the expression between the three samples.
    # Computed column-wise by pandas instead of calling statistics.stdev once
    # per row: same n-1 estimator (ddof=1), no Python-level loop, and it also
    # works on a frame with zero rows. skipna=False keeps a missing ratio
    # (e.g. 0/0) as NaN in the result instead of silently dropping it.
    input_data_frame['SD_normalized_relative'] = (
        input_data_frame[ratio_columns].std(axis=1, ddof=1, skipna=False)
    )
    input_data_frame['CV_normalized_relative'] = (
        input_data_frame['SD_normalized_relative']
        / input_data_frame['Mean_normalized_relative']
    )
    return input_data_frame

In [7]:
def Expression_Preprocessing(input_df):
    """Append every RNA/DNA expression ratio used downstream.

    Ratios are computed from the combined ("Total") reads and from each
    individual sample (S1, S2, S3), for raw and normalized ("N_") counts.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must hold the ``[N_]{RNA,DNA}_{absolute,relative}_count_{S1,S2,S3}``
        columns and the ``_Total`` columns read below. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``input_df`` with the expression columns appended; the
        per-sample normalized relative columns and their mean/SD/CV come from
        ``add_sample_specific_relative_expression``.
    """
    def rna_over_dna(frame, prefix, kind, scope):
        # e.g. prefix='N_', kind='relative', scope='S1'
        #   -> N_RNA_relative_count_S1 / N_DNA_relative_count_S1
        rna = frame['{}RNA_{}_count_{}'.format(prefix, kind, scope)]
        dna = frame['{}DNA_{}_count_{}'.format(prefix, kind, scope)]
        return rna / dna

    result = input_df.copy()

    # Expression based on the combined reads: raw, then normalized.
    result['expression'] = rna_over_dna(result, '', 'relative', 'Total')
    result['expression_normalized'] = rna_over_dna(result, 'N_', 'relative', 'Total')

    # Per-sample normalized relative expression plus mean / SD / CV.
    result = add_sample_specific_relative_expression(result)

    result['expression_normalized_absolute'] = rna_over_dna(result, 'N_', 'absolute', 'Total')

    # Remaining per-sample ratios; the order fixes the output column order.
    per_sample_layouts = (
        ('normalized_absolute', 'N_', 'absolute'),
        ('absolute', '', 'absolute'),
        ('relative', '', 'relative'),
    )
    for label, prefix, kind in per_sample_layouts:
        for sample in ('S1', 'S2', 'S3'):
            result['{}_{}'.format(sample, label)] = rna_over_dna(result, prefix, kind, sample)
    return result


-----

## 2 Input and output address

### 2.1 Input address

In [8]:
# Directory prefix shared by every input and output path of this notebook.
dr1 = 'Data/'

# NOTE(review): the original comment labelled these inputs "YPD", while the
# notebook header says the data are SCD - confirm which condition these are.

# Barcodes with DNA counts only, one table per sample.
D_S1_address, D_S2_address, D_S3_address = (
    '{}N6_Q10_total_S{}_d.csv'.format(dr1, sample) for sample in (1, 2, 3)
)
# Barcodes with both DNA and RNA counts, one table per sample.
B_S1_address, B_S2_address, B_S3_address = (
    '{}N6_Q10_total_S{}_b.csv'.format(dr1, sample) for sample in (1, 2, 3)
)

# Tables used to compute the total DNA count of each sample.
Total_DNA_S1_address, Total_DNA_S2_address, Total_DNA_S3_address = (
    '{}N6_Q10_total_DNA_S{}.csv'.format(dr1, sample) for sample in (1, 2, 3)
)
# Tables used to compute the total RNA count of each sample.
Total_RNA_S1_address, Total_RNA_S2_address, Total_RNA_S3_address = (
    '{}N6_Q10_total_RNA_S{}.csv'.format(dr1, sample) for sample in (1, 2, 3)
)

In [9]:
# Linkage information between barcode and promoter (per the original note:
# no conflicts in this table).
Total_linkage_resolved_address = '{}{}'.format(dr1, 'Linkage_barcode_promoter_combined_Summary')

In [10]:
# Destination files, all written under dr1.
total_output_address = ''.join((dr1, 'SCD_RD_expression_total.csv'))   # filtered random-promoter barcodes
PC_output_address = ''.join((dr1, 'SCD_PC_expression.csv'))             # positive controls
NC_output_address = ''.join((dr1, 'SCD_NC_expression.csv'))             # negative controls
promoter_output_address = ''.join((dr1, 'SCD_promoter_output.csv'))     # expression joined with promoter linkage

-----

## 3 Preprocessing data

### 3.1 Read Raw data

In [11]:
# DNA-only barcode tables for samples 1-3 (RNA columns are zero-filled by the reader).
D_S1_df, D_S2_df, D_S3_df = [
    Read_Random_Promoter_with_DNA_only(address)
    for address in (D_S1_address, D_S2_address, D_S3_address)
]

In [12]:
# Barcode tables that carry both DNA and RNA counts, samples 1-3.
B_S1_df, B_S2_df, B_S3_df = [
    Read_Random_Promoter_with_Both(address)
    for address in (B_S1_address, B_S2_address, B_S3_address)
]

In [13]:
# Sample 1: stack the DNA-only rows on top of the DNA+RNA rows and renumber
# the rows 0..n-1.
S1_df = pd.concat([D_S1_df, B_S1_df], ignore_index=True)

In [14]:
# Sample 2: stack the DNA-only rows on top of the DNA+RNA rows and renumber
# the rows 0..n-1.
S2_df = pd.concat([D_S2_df, B_S2_df], ignore_index=True)

In [15]:
# Sample 3: stack the DNA-only rows on top of the DNA+RNA rows and renumber
# the rows 0..n-1.
S3_df = pd.concat([D_S3_df, B_S3_df], ignore_index=True)

In [16]:
# total reltaive count for DNA or RNA in each sample
# Total absolute count of DNA and of RNA in each sample
# (sum of the 'absolute_count' column of the per-sample total tables).
TC_DNA_S1, TC_DNA_S2, TC_DNA_S3 = [
    Calculate_Total_Count(address, 'absolute_count')
    for address in (Total_DNA_S1_address, Total_DNA_S2_address, Total_DNA_S3_address)
]
TC_RNA_S1, TC_RNA_S2, TC_RNA_S3 = [
    Calculate_Total_Count(address, 'absolute_count')
    for address in (Total_RNA_S1_address, Total_RNA_S2_address, Total_RNA_S3_address)
]

In [17]:
# total reltaive count for DNA or RNA in each sample
# Total relative count of DNA and of RNA in each sample
# (sum of the 'relative_count' column of the per-sample total tables).
TC_DNA_S1_r, TC_DNA_S2_r, TC_DNA_S3_r = [
    Calculate_Total_Count(address, 'relative_count')
    for address in (Total_DNA_S1_address, Total_DNA_S2_address, Total_DNA_S3_address)
]
TC_RNA_S1_r, TC_RNA_S2_r, TC_RNA_S3_r = [
    Calculate_Total_Count(address, 'relative_count')
    for address in (Total_RNA_S1_address, Total_RNA_S2_address, Total_RNA_S3_address)
]

### 3.2 Raw data preprocessing and combine

In [18]:
# Normalize the absolute counts: divide every barcode's count by the total
# absolute count of the same molecule (DNA or RNA) in the same sample.
# The new columns are added to the sample frames in place.
for _frame, _dna_total, _rna_total in (
        (S1_df, TC_DNA_S1, TC_RNA_S1),
        (S2_df, TC_DNA_S2, TC_RNA_S2),
        (S3_df, TC_DNA_S3, TC_RNA_S3),
):
    _frame["N_DNA_absolute_count"] = _frame["DNA_absolute_count"] / _dna_total
    _frame["N_RNA_absolute_count"] = _frame["RNA_absolute_count"] / _rna_total

In [19]:
# Normalize the relative counts: divide every barcode's count by the total
# relative count of the same molecule (DNA or RNA) in the same sample.
# The new columns are added to the sample frames in place.
for _frame, _dna_total, _rna_total in (
        (S1_df, TC_DNA_S1_r, TC_RNA_S1_r),
        (S2_df, TC_DNA_S2_r, TC_RNA_S2_r),
        (S3_df, TC_DNA_S3_r, TC_RNA_S3_r),
):
    _frame["N_DNA_relative_count"] = _frame["DNA_relative_count"] / _dna_total
    _frame["N_RNA_relative_count"] = _frame["RNA_relative_count"] / _rna_total

In [20]:
# add sample suffix
# Tag every count column with its sample name; the two barcode identifier
# columns stay untouched because they are the merge keys.
# Caution: running this cell twice appends the suffix twice.
S1_df, S2_df, S3_df = [
    frame.rename(
        columns=lambda name, tag=tag: (
            name if name in ('barcode_cluster', 'representative_sequence') else name + tag
        )
    )
    for frame, tag in ((S1_df, '_S1'), (S2_df, '_S2'), (S3_df, '_S3'))
]

In [21]:
# merge three sample
# Merge the three samples on the barcode identifiers. The outer join keeps
# barcodes that are absent from some samples; their missing values become 0.
Raw_df = (
    S1_df
    .merge(S2_df, on=['barcode_cluster', 'representative_sequence'], how='outer')
    .merge(S3_df, on=['barcode_cluster', 'representative_sequence'], how='outer')
    .fillna(0)
)

In [22]:
# Combined reads: add the three samples together for each raw count column.
for _stem in ('DNA_absolute_count', 'DNA_relative_count',
              'RNA_absolute_count', 'RNA_relative_count'):
    Raw_df[_stem + '_Total'] = (
        Raw_df[_stem + '_S1'] + Raw_df[_stem + '_S2'] + Raw_df[_stem + '_S3']
    )

In [23]:
# Combined reads: add the three samples together for each normalized count column.
for _stem in ('N_DNA_absolute_count', 'N_DNA_relative_count',
              'N_RNA_absolute_count', 'N_RNA_relative_count'):
    Raw_df[_stem + '_Total'] = (
        Raw_df[_stem + '_S1'] + Raw_df[_stem + '_S2'] + Raw_df[_stem + '_S3']
    )

In [24]:
# Preview the first five rows of the merged table.
Raw_df.head(n=5)

Unnamed: 0,barcode_cluster,representative_sequence,DNA_absolute_count_S1,DNA_relative_count_S1,RNA_absolute_count_S1,RNA_relative_count_S1,N_DNA_absolute_count_S1,N_RNA_absolute_count_S1,N_DNA_relative_count_S1,N_RNA_relative_count_S1,...,N_DNA_relative_count_S3,N_RNA_relative_count_S3,DNA_absolute_count_Total,DNA_relative_count_Total,RNA_absolute_count_Total,RNA_relative_count_Total,N_DNA_absolute_count_Total,N_DNA_relative_count_Total,N_RNA_absolute_count_Total,N_RNA_relative_count_Total
0,1,TACGTGCAAGCTATTTAGAG,1.0,1.0,0.0,0.0,1.13108e-08,0.0,1.693006e-08,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.13108e-08,1.693006e-08,0.0,0.0
1,4,ACGCCGATTGAGAGTTCTGG,1.0,1.0,0.0,0.0,1.13108e-08,0.0,1.693006e-08,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.13108e-08,1.693006e-08,0.0,0.0
2,8,GCTATAGTGATGACGGCTAG,1.0,1.0,0.0,0.0,1.13108e-08,0.0,1.693006e-08,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.13108e-08,1.693006e-08,0.0,0.0
3,9,TTCTCTTCACTATGTGGTCA,1.0,1.0,0.0,0.0,1.13108e-08,0.0,1.693006e-08,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.13108e-08,1.693006e-08,0.0,0.0
4,10,TTGGGAGGTGGGCTTTGGCG,1.0,1.0,0.0,0.0,1.13108e-08,0.0,1.693006e-08,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.13108e-08,1.693006e-08,0.0,0.0


### 3.3 Identify negative and positive control

In [25]:
# Earlier numeric control lists, kept commented out as in the original:
# n_control = [1769576,2281214,2698104,4741707,2328337,1871693,3375381,2102647] 
# p_control = [2140910,3304444,3556204,3944581,2442367,3375101,1981999]

# Representative sequences of the negative controls.
nc_list = [
    'TATAGTGGGCATTAGTAGAC',
    'CTCGATCGTCTAGGGCGAGG',
    'GACTGGAAAGCGGTGCGGTG',
    'AGTGTAGGTGGGGGTATCCA',
    'ATGAGCGAGGCGTTGTCGAA',
    'TGATTTAATTACCGCGTAGT',
    'AAGCGCATGGAGTCATCCGT',
    'AGGTCCGTCGGCGATGTCAG',
]
# Representative sequences of the positive controls.
pc_list = [
    'GTCTAACGCTAGCTTGGTAG',
    'CAGTAGAGGTGTACGTGTGT',
    'CGGGGACCGAGGCGAGATGC',
    'TTATTGAACTGTAGCGACTT',
    'GGTTGTGTTACAGTTGGGCT',
    'CTCGTGGTTCGATGGGTAGT',
    'TGTAGGTTGCAGGGAGTCAA',
    'GGTCACGGATAAATGAAGGA',
]

In [26]:
# Rows of the merged table whose barcode is a negative / positive control.
Final_nc = Raw_df.loc[Raw_df['representative_sequence'].isin(nc_list)]
Final_pc = Raw_df.loc[Raw_df['representative_sequence'].isin(pc_list)]

In [27]:
# Everything that is not a control barcode.
Final_df = Raw_df.loc[~Raw_df['representative_sequence'].isin(nc_list + pc_list)]
# Require at least 1 absolute DNA count in all three replicates; the RNA
# cutoff of 0 places no lower bound on RNA reads. Stored as an independent
# deep copy of the filtered rows.
Final_df_filtered = all_present_total_new(Final_df, 1, 0, 'absolute').copy(deep=True)

### 3.3 Calculate expression 

In [31]:
# Expression ratios for the filtered barcodes, the positive controls and the
# negative controls (in that order).
Expression_df, Expression_pc_df, Expression_nc_df = [
    Expression_Preprocessing(counts)
    for counts in (Final_df_filtered, Final_pc, Final_nc)
]

### 3.4 Add promoter information

In [32]:
#First I read the linkage information
df = pd.read_csv(Total_linkage_resolved_address,skiprows=1,header=None, usecols=range(0,6))
df.columns = ['P_Cluster_ID', 'B_Clustser_ID','P_Clustser_seq','representative_sequence','Total_count','Seq_ID'] # rename the columns

In [33]:
# Attach promoter information to each barcode; the inner join keeps only
# barcodes present in both the linkage table and the expression table.
Promoter_expression_df = df.merge(Expression_df, on='representative_sequence', how='inner')

### 3.5 Output

In [34]:
# YPD Output
Expression_df.to_csv(total_output_address, index=False)
Expression_pc_df.to_csv(PC_output_address, index = False)
Expression_nc_df.to_csv(NC_output_address, index = False)

In [35]:
# Write the promoter-annotated expression table without the row index.
Promoter_expression_df.to_csv(
    promoter_output_address,
    index=False,
)