## 1. Modules and Functions

### 1.1 Modules

In [1]:
import ast
import copy
import glob
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import statsmodels.stats.multitest as STM
from Bio import motifs
from Bio.Seq import Seq
from matplotlib import gridspec
# Figure defaults for the whole notebook, set in a single call.
plt.rcParams.update({
    "font.family": "Arial",
    # Type 42 = embed TrueType fonts in PDF output (matplotlib's default is Type 3)
    'pdf.fonttype': 42,
})

### 1.2 Functions

In [2]:
def Add_Sequence_Info(input_df):
    """Return a four-column copy of ``input_df`` annotated with sequence features.

    The columns ``representative_sequence``, ``P_Cluster_seq``,
    ``expression_normalized`` and ``Mean_normalized_relative`` are kept, and
    four feature columns are appended:

    * ``GC_content``    -- fraction of G/C characters (either case) in ``P_Cluster_seq``.
    * ``TATA_box``      -- ``find_tata_box`` count for ``P_Cluster_seq``.
    * ``GC_content_PB`` -- same fraction, computed on
      ``representative_sequence`` + fixed linker + ``P_Cluster_seq``.
    * ``TATA_box_PB``   -- ``find_tata_box`` count on that same concatenation.

    ``input_df`` itself is not modified. A zero-length sequence raises
    ``ZeroDivisionError`` because the GC fraction divides by the length.
    """
    def gc_fraction(sequence):
        # share of characters that are G or C, upper or lower case
        return len(re.findall("[GgCc]", sequence))/len(sequence)

    def tata_hits(sequence):
        return find_tata_box(str(sequence))

    kept_columns = ['representative_sequence','P_Cluster_seq', 'expression_normalized', 'Mean_normalized_relative']
    annotated = input_df[kept_columns].copy()

    # Features of the promoter on its own
    promoter = annotated['P_Cluster_seq']
    annotated['GC_content'] = promoter.apply(gc_fraction)
    annotated['TATA_box'] = promoter.apply(tata_hits)

    # Features of the whole construct: representative_sequence (presumably the
    # barcode -- TODO confirm), the constant linker, then the promoter
    construct = annotated['representative_sequence'] + 'TGGCTGAACCTAGTTTTGCCC' + promoter
    annotated['GC_content_PB'] = construct.apply(gc_fraction)
    annotated['TATA_box_PB'] = construct.apply(tata_hits)
    return annotated

In [3]:
# The eight 8-mers obtained by expanding TATA[AT]A[AT][AG]; used as the TATA-box definition.
instances = [
    "TATAAAAA", "TATAAAAG", "TATAAATA", "TATAAATG",
    "TATATAAA", "TATATAAG", "TATATATA", "TATATATG",
]


def find_tata_box(input):
    """Return how many TATA-box 8-mers from ``instances`` occur in ``input``.

    Each variant is counted with ``str.count``, so matching is case-sensitive
    and occurrences of the *same* variant that overlap are counted once.
    Counts of the individual variants are added together.
    """
    return sum(input.count(variant) for variant in instances)

In [4]:
def generate_background_ratio(input_string):
    """Return a strand-symmetric nucleotide background for ``input_string``.

    Upper-case A/T/G/C are counted; any other character is ignored. Because
    both strands are considered, A and T share one frequency (half of the
    A+T fraction) and G and C share the other, so the four values sum to 1.

    Returns a dict with keys ``'A'``, ``'T'``, ``'C'``, ``'G'`` (in that order).
    Raises ``ZeroDivisionError`` when the string holds no A/T/G/C at all.
    """
    counts = {base: input_string.count(base) for base in 'ATGC'}
    total = counts['A'] + counts['T'] + counts['G'] + counts['C']
    at_share = (counts['A'] + counts['T'])/2/total
    gc_share = (counts['G'] + counts['C'])/2/total
    return dict(A=at_share, T=at_share, C=gc_share, G=gc_share)

In [20]:
def _motif_hits(pssm, sequence, cutoff):
    """Return a 1-D boolean array: True where the PSSM score is strictly above ``cutoff``."""
    # calculate() may hand back a bare scalar when the sequence is exactly one
    # window long; atleast_1d keeps that case iterable.
    return np.atleast_1d(pssm.calculate(sequence)) > cutoff


def _hit_string(hits):
    """Encode a boolean hit array as a '0'/'1' string, one character per scored window."""
    return ''.join('1' if hit else '0' for hit in hits)


def find_motif_new(input_short_df, input_pssm_dic, input_cutoff_dic):
    """Scan every promoter with every motif, on both strands, and tabulate the hits.

    Parameters
    ----------
    input_short_df : pandas.DataFrame
        Promoter table. Only the ``P_Cluster_seq`` column and the index are used.
    input_pssm_dic : dict
        Motif name -> scoring matrix exposing ``calculate(sequence)`` and
        ``reverse_complement()`` (e.g. a Biopython PSSM).
    input_cutoff_dic : dict
        Motif name -> score cutoff. A window is a hit when its score is
        strictly greater than the cutoff.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``input_short_df``, with the columns in this order:

        * ``<motif>``            -- hits on both strands (forward + reverse),
        * ``<motif>_f``          -- hits on the forward strand,
        * ``<motif>_r``          -- hits found with the reverse-complement matrix,
        * ``<motif>_f_position`` / ``<motif>_r_position`` -- a '0'/'1' string
          with one character per scored window ('1' = hit). The string is
          stored as text; it is not converted to a number.
        * ``Total_motif_count``, ``Total_motif_count_f``, ``Total_motif_count_r``
          -- row sums over all motifs.

    Raises
    ------
    KeyError
        If a motif in ``input_pssm_dic`` has no entry in ``input_cutoff_dic``.

    Notes
    -----
    An empty ``input_short_df`` yields an empty frame with the full set of
    columns instead of failing.
    """
    motif_names = list(input_pssm_dic.keys())
    # The reverse-complement matrix and the cutoff do not depend on the
    # promoter, so resolve them once rather than once per sequence.
    scanners = [(input_pssm_dic[name],
                 input_pssm_dic[name].reverse_complement(),
                 input_cutoff_dic[name]) for name in motif_names]

    count_f_rows, count_r_rows = [], []
    position_f_rows, position_r_rows = [], []
    for temp_seq in input_short_df['P_Cluster_seq'].tolist():
        row_f, row_r, row_f_position, row_r_position = [], [], [], []
        for pssm_f, pssm_r, temp_cutoff in scanners:
            hits_f = _motif_hits(pssm_f, temp_seq, temp_cutoff)
            hits_r = _motif_hits(pssm_r, temp_seq, temp_cutoff)
            row_f.append(int(hits_f.sum()))
            row_r.append(int(hits_r.sum()))
            row_f_position.append(_hit_string(hits_f))
            row_r_position.append(_hit_string(hits_r))
        count_f_rows.append(row_f)
        count_r_rows.append(row_r)
        position_f_rows.append(row_f_position)
        position_r_rows.append(row_r_position)

    # Force (n_promoters, n_motifs) so that zero promoters still gives 2-D arrays.
    shape = (len(count_f_rows), len(motif_names))
    total_array_f = np.array(count_f_rows, dtype=int).reshape(shape)
    total_array_r = np.array(count_r_rows, dtype=int).reshape(shape)
    # Hits over both strands are simply forward hits plus reverse hits.
    total_array = total_array_f + total_array_r
    position_f = np.array(position_f_rows, dtype=str).reshape(shape)
    position_r = np.array(position_r_rows, dtype=str).reshape(shape)

    count_columns = (motif_names
                     + [x+'_f' for x in motif_names]
                     + [x+'_r' for x in motif_names])
    position_columns = ([x+'_f_position' for x in motif_names]
                        + [x+'_r_position' for x in motif_names])
    count_df = pd.DataFrame(np.hstack([total_array, total_array_f, total_array_r]),
                            columns=count_columns, index=input_short_df.index)
    position_df = pd.DataFrame(np.hstack([position_f, position_r]),
                               columns=position_columns, index=input_short_df.index)
    output_df = pd.concat([count_df, position_df], axis=1)
    output_df['Total_motif_count'] = np.sum(total_array, axis=1)
    output_df['Total_motif_count_f'] = np.sum(total_array_f, axis=1)
    output_df['Total_motif_count_r'] = np.sum(total_array_r, axis=1)
    return output_df

---

## 1. Input and Output

### 1.1 Input

In [1]:
# Root folder (relative to the notebook) that every path below is built from
dr1 = 'Data/'
# Per-condition promoter tables: barcodes linked to a promoter and having at least one DNA read in each replicate
YPD_promoter_address = dr1 + 'YPD_promoter_output.csv'
SCD_promoter_address = dr1 + 'SCD_promoter_output.csv'

# Motif position weight matrices
# NOTE(review): this path is not used anywhere in the visible cells -- confirm it is still needed
motif_pssm_dic_address = dr1 + 'motif_pssm_dic'

# Score cutoff for each motif, curated from ScerTF.
# Despite the "_df" suffix this is a file path (a string); it is passed to pd.read_csv later.
motif_cutoff_df = dr1+'recommended.cutoffs.curated'

In [23]:
# Paths to the per-condition summary tables for RD compared to control
# (built from dr1, so the path cell above must have been run first)
YPD_summary_address = dr1 + 'YPD_RD_summary_median_total_extra.csv'
SCD_summary_address = dr1 + 'SCD_RD_summary_median_total_extra.csv'

## 2 Add GC content and TATA box information

In [7]:
# Load the complete promoter tables, using the paths declared in the input section.
# NOTE(review): no index_col is given, so the frames get a default integer index,
# while later cells intersect barcode strings with the index -- confirm whether
# index_col=0 is intended here.
YPD_promoter_df = pd.read_csv(YPD_promoter_address)
SCD_promoter_df = pd.read_csv(SCD_promoter_address)

In [8]:
# Correct the misspelled header 'P_Clustser_seq' so downstream code can rely on 'P_Cluster_seq'.
YPD_promoter_df = YPD_promoter_df.rename(columns={
    'P_Clustser_seq': 'P_Cluster_seq'})
SCD_promoter_df = SCD_promoter_df.rename(columns={
    'P_Clustser_seq': 'P_Cluster_seq'})

In [9]:
# Trimmed YPD table with the GC-content and TATA-box columns added by Add_Sequence_Info
YPD_short = Add_Sequence_Info(YPD_promoter_df)

In [10]:
# Trimmed SCD table with the GC-content and TATA-box columns added by Add_Sequence_Info
SCD_short = Add_Sequence_Info(SCD_promoter_df)

### Output

In [11]:
# SCD_short.to_csv(dr1+'SCD_short_output.csv')
# YPD_short.to_csv(dr1+'YPD_short_output.csv')

## 4 Generate Position Weight Matrices

### 4.1 Read cutoff data

In [12]:
# Read the recommended PWM score cutoff for each motif.
# The file is tab-separated; column names are supplied here through `names=`.
# (`motif_cutoff_df` is the file path defined in the input section, not a DataFrame.)
PCM_cutoff_df = pd.read_csv(motif_cutoff_df,
                            sep='\t',
                            names=['Gene_name', 'Paper',
                                   'Alternative_name',
                                   'ChIP-chip', 'Recommended_PWM_Cutoff',
                                   'Fisher_value'])

In [13]:
# Lookup table: gene (motif) name -> recommended PWM cutoff.
# If a name occurs more than once, the last row wins.
PCM_cutoff_df_dic = dict(zip(PCM_cutoff_df['Gene_name'],
                             PCM_cutoff_df['Recommended_PWM_Cutoff']))

### 4.2 PSSM

In [14]:
# Collect the path of every count-matrix file in Data/PCM_new.
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# motif order (and therefore the column order of the output tables) reproducible.
PCM_new_addres = sorted(glob.glob(dr1+'PCM_new/*'))

In [15]:
motif_dic = {} # a dictionary: key is the TF name and value is the motif object
for x in PCM_new_addres:
    # os.path.basename handles both '/' and '\\' separators on Windows, so no
    # platform-specific split on the folder name is needed.
    # The TF name is taken as the second dot-separated field of the file name
    # (presumably files are named <id>.<TF name>.<ext> -- TODO confirm).
    temp_x = os.path.basename(x).split('.')[1]
    with open(x,'r') as handler:
        temp_motif = motifs.read(handler, "pfm")
    motif_dic[temp_x] = temp_motif

#### Generate background distribution

In [16]:
# To calculate the A/T/G/C content, concatenate all SCD promoter sequences into one long string
SCD_total_promoter_string = ''.join(SCD_short['P_Cluster_seq'])

In [17]:
# Background nucleotide frequencies as a dict with keys 'A', 'T', 'C', 'G'
# (strand-symmetric: A == T and C == G, as computed by generate_background_ratio)
SCD_background = generate_background_ratio(SCD_total_promoter_string)

* **<font color=blue>Nucleotide content is very similar across the three promoter libraries, so only the SCD library is used for the background</font>**

#### Convert pfm to pssm (Position-Specific Scoring Matrices)

In [18]:
# Pseudocounts proportional to the background: twice each frequency.
# The four background values sum to 1, so the pseudocounts sum to 2.
pseudocounts = {base: frequency*2 for base, frequency in SCD_background.items()}

In [19]:
# TF name -> position-specific scoring matrix: counts are normalized with the
# pseudocounts, then converted to log-odds against the SCD background.
motif_pssm_dic = {
    tf_name: motif.counts.normalize(pseudocounts).log_odds(SCD_background)
    for tf_name, motif in motif_dic.items()
}

## 5 Find TF binding in our promoters

### 5.1 Filter promoters based on summary data

In [None]:
# Read the per-condition summary tables from the CSV paths defined in the input section
SCD_summary = pd.read_csv(SCD_summary_address)
YPD_summary = pd.read_csv(YPD_summary_address)

In [None]:
# Keep only the summary rows computed with one specific combination of settings:
# percentile of PC, RNA cutoff and DNA cutoff must all equal the values below.
temp_cut_control = 1.0
temp_cut_RNA = 0
temp_cut_DNA = 100
temp_df_1 = SCD_summary.loc[
    SCD_summary['Percentile_of_PC'].eq(temp_cut_control)
    & SCD_summary['RNA_cut_off'].eq(temp_cut_RNA)
    & SCD_summary['DNA_cut_off'].eq(temp_cut_DNA)]
temp_df_2 = YPD_summary.loc[
    YPD_summary['Percentile_of_PC'].eq(temp_cut_control)
    & YPD_summary['RNA_cut_off'].eq(temp_cut_RNA)
    & YPD_summary['DNA_cut_off'].eq(temp_cut_DNA)]

### 5.2 Search motif match

In [None]:
# Motif hit counts and hit-position strings for every SCD promoter (all motifs, both strands)
SCD_motif_df = find_motif_new(SCD_short, motif_pssm_dic, PCM_cutoff_df_dic)

In [None]:
# Motif hit counts and hit-position strings for every YPD promoter (all motifs, both strands)
YPD_motif_df = find_motif_new(YPD_short, motif_pssm_dic, PCM_cutoff_df_dic)

In [40]:
# Repeat the motif search on the subset of SCD promoters whose barcode is listed
# in 'barcode_better_than_nc_list' (barcodes that beat the negative control).
temp_input1 = temp_df_1
temp_input2 = SCD_short
# The column stores a Python list literal as text; only the first filtered summary row is used.
temp_better_than_nc = set(ast.literal_eval(temp_input1['barcode_better_than_nc_list'].values[0]))
# Boolean mask instead of .loc[set(...)]: pandas >= 2.0 rejects set indexers, and
# the mask keeps the rows in their original, reproducible order.
temp_filtered = temp_input2[temp_input2.index.isin(temp_better_than_nc)]
SCD_motif_df_sub  = find_motif_new(temp_filtered, motif_pssm_dic, PCM_cutoff_df_dic)



In [41]:
# Repeat the motif search on the subset of YPD promoters whose barcode is listed
# in 'barcode_better_than_nc_list' (barcodes that beat the negative control).
temp_input1 = temp_df_2
temp_input2 = YPD_short
# The column stores a Python list literal as text; only the first filtered summary row is used.
temp_better_than_nc = set(ast.literal_eval(temp_input1['barcode_better_than_nc_list'].values[0]))
# Boolean mask instead of .loc[set(...)]: pandas >= 2.0 rejects set indexers, and
# the mask keeps the rows in their original, reproducible order.
temp_filtered = temp_input2[temp_input2.index.isin(temp_better_than_nc)]
YPD_motif_df_sub  = find_motif_new(temp_filtered, motif_pssm_dic, PCM_cutoff_df_dic)



In [42]:
# # Output 
# SCD_motif_df.to_csv(dr1 + 'SCD_motif_df.csv', index=True)
# YPD_motif_df.to_csv(dr1 + 'YPD_motif_df.csv', index=True)

In [43]:
# SCD_motif_df = pd.read_csv(dr1 + 'SCD_motif_df.csv', index_col = 0)
# YPD_motif_df = pd.read_csv(dr1 + 'YPD_motif_df.csv', index_col = 0)

In [67]:
# Attach the motif columns to the sequence-feature tables.
# Rows are matched on the index; only index values present on both sides are kept.
SCD_promoter_df_final = SCD_short.merge(SCD_motif_df, left_index=True, right_index=True)
YPD_promoter_df_final = YPD_short.merge(YPD_motif_df, left_index=True, right_index=True)

In [68]:
# Same combination for the "better than negative control" subsets.
# The index-matched merge keeps only promoters present in the *_motif_df_sub tables.
SCD_sub_df_final = SCD_short.merge(SCD_motif_df_sub, left_index=True, right_index=True)
YPD_sub_df_final = YPD_short.merge(YPD_motif_df_sub, left_index=True, right_index=True)

In [50]:
# Output

588

In [71]:
# SCD_promoter_df_final.to_csv(dr1 + 'SCD_ScerTF_motif_df_final.csv', index=True)
# YPD_promoter_df_final.to_csv(dr1 + 'YPD_ScerTF_motif_df_final.csv', index=True)

In [72]:
# SCD_sub_df_final.to_csv(dr1 + 'SCD_sub_ScerTF_motif_df_final.csv', index=True)
# YPD_sub_df_final.to_csv(dr1 + 'YPD_sub_ScerTF_motif_df_final.csv', index=True)

In [None]:
# # read motif information
# SCD_promoter_df_final = pd.read_csv(dr1 + 'SCD_ScerTF_motif_df_final.csv', index_col =0)
# YPD_promoter_df_final = pd.read_csv(dr1 + 'YPD_ScerTF_motif_df_final.csv', index_col =0)

### 5.3 Summarize fold change for each gene by rank sum test

In [55]:
# Preview the first rows of the merged SCD table (sequence features plus motif columns)
SCD_promoter_df_final.head()

Unnamed: 0,P_Cluster_seq,expression_normalized,Mean_normalized_relative,GC_content,TATA_box,GC_content_PB,TATA_box_PB,ABF2,CAT8,CST6,...,TEA1_r_position,TOD6_r_position,YKL222C_r_position,YLL054C_r_position,YPR013C_r_position,YPR015C_r_position,YRM1_r_position,Total_motif_count,Total_motif_count_f,Total_motif_count_r
GTACACGACTGAGTTGCGGG,TCTTTTGATGTTTCAGAATGTTAGAACGGGCCAGGTCGTAGGATGT...,1.135523,1.31173,0.45,0,0.478261,0,0,0,1,...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,42,20,22
CTGCGTTTGGAAGGGTACAA,CAGTAGGGGCGCACGGCTGCAGTAAGATTCAAGTTTTATTGCTTAT...,0.880042,0.858114,0.358974,0,0.398734,0,0,0,0,...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,60,32,28
GGTACCTGGGCAGGCAAGAG,GTATGAGTTAATGGCAGTGAAACTGTAGGATGTTTGTGCCGCAATA...,0.0,0.0,0.441667,0,0.478261,0,0,0,1,...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,31,15,16
TATAATTGGTAAGTTAATCC,GGTTTTCGATGCATCTGTGTTTGTGTGAAGTTTGATCGAGCGAAAA...,0.553302,0.523018,0.416667,0,0.409938,0,0,0,1,...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,57,31,26
TAAAGCGTGGGTCAGGGGGT,CGGATTTCCGGCACGCAGGAATTGAAGATTCAGGTTATTGGGTTGT...,1.177114,1.533146,0.441667,0,0.47205,0,0,2,1,...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,53,25,28


In [56]:
# Columns to analyse: every name in the cutoff table, first the both-strand
# columns, then the forward ('_f') and reverse ('_r') ones.
# NOTE(review): find_fold is not defined in the cells shown here -- make sure its
# definition runs before this cell. The names come from the cutoff table, whereas
# the motif columns were created from motif_pssm_dic; confirm both cover the same TFs.
temp_name = [tf_name + strand_suffix
             for strand_suffix in ('', '_f', '_r')
             for tf_name in PCM_cutoff_df_dic]
SCD_fold_df = find_fold(SCD_promoter_df_final, temp_name)
YPD_fold_df = find_fold(YPD_promoter_df_final, temp_name)

  z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)


In [57]:
# Same find_fold analysis, restricted to the "better than negative control" subsets
SCD_sub_fold_df = find_fold(SCD_sub_df_final,temp_name)
YPD_sub_fold_df = find_fold(YPD_sub_df_final,temp_name)

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [57]:
# # read promoter complete information
# YPD_promoter_df = pd.read_csv(dr1 + 'YPD_promoter_output.csv', index_col =0)
# SCD_promoter_df = pd.read_csv(dr1 + 'SCD_promoter_output.csv', index_col =0)

In [73]:
# Save the find_fold results for the full promoter sets (existing files are overwritten)
SCD_fold_df.to_csv(dr1 + 'SCD_ScerTF_fold_df.csv',index =True)
YPD_fold_df.to_csv(dr1 + 'YPD_ScerTF_fold_df.csv',index =True)

In [74]:
# Save the find_fold results for the "better than negative control" subsets (existing files are overwritten)
SCD_sub_fold_df.to_csv(dr1 + 'SCD_ScerTF_sub_fold_df.csv',index =True)
YPD_sub_fold_df.to_csv(dr1 + 'YPD_ScerTF_sub_fold_df.csv',index =True)

### 5.4 More stringent filter on DNA reads

In [18]:
# The above analysis include promoter that at least have one DNA count in each dataset
# next I am going to use a more stringent filter

In [26]:
def all_present_total_new(input_df,cutoff_DNA,cutoff_RNA):
    """Return the rows of ``input_df`` that pass the read-count cutoffs in all three samples.

    A row is kept when each of ``DNA_absolute_count_S1..S3`` is at least
    ``cutoff_DNA`` and each of ``RNA_absolute_count_S1..S3`` is at least
    ``cutoff_RNA``. Index and columns of the kept rows are unchanged.
    """
    dna_columns = ['DNA_absolute_count_S1', 'DNA_absolute_count_S2', 'DNA_absolute_count_S3']
    rna_columns = ['RNA_absolute_count_S1', 'RNA_absolute_count_S2', 'RNA_absolute_count_S3']
    enough_dna = (input_df[dna_columns] >= cutoff_DNA).all(axis=1)
    enough_rna = (input_df[rna_columns] >= cutoff_RNA).all(axis=1)
    return input_df[enough_dna & enough_rna]

In [32]:
# Columns to analyse: every name in the cutoff table -- both-strand, then '_f', then '_r'.
temp_name = [tf_name + strand_suffix
             for strand_suffix in ('', '_f', '_r')
             for tf_name in PCM_cutoff_df_dic]
temp_input1 = YPD_promoter_df        # carries the DNA / RNA read counts
temp_input2 = YPD_promoter_df_final  # carries the promoter and motif information
# Keep promoters with at least 100 DNA reads in every replicate (no RNA requirement).
index_of_interest = all_present_total_new(temp_input1, cutoff_DNA=100, cutoff_RNA=0).index
YPD_fold_df_D100 = find_fold(temp_input2.loc[index_of_interest], temp_name)

  z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)


In [35]:
# Save the YPD find_fold results for promoters with at least 100 DNA reads per replicate
YPD_fold_df_D100.to_csv(dr1 + 'YPD_ScerTF_fold_df_D100.csv',index =True)

In [33]:
# Columns to analyse: every name in the cutoff table -- both-strand, then '_f', then '_r'.
temp_name = [tf_name + strand_suffix
             for strand_suffix in ('', '_f', '_r')
             for tf_name in PCM_cutoff_df_dic]
temp_input1 = SCD_promoter_df        # carries the DNA / RNA read counts
temp_input2 = SCD_promoter_df_final  # carries the promoter and motif information
# Keep promoters with at least 100 DNA reads in every replicate (no RNA requirement).
index_of_interest = all_present_total_new(temp_input1, cutoff_DNA=100, cutoff_RNA=0).index
SCD_fold_df_D100 = find_fold(temp_input2.loc[index_of_interest], temp_name)

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [34]:
# Save the SCD find_fold results for promoters with at least 100 DNA reads per replicate
SCD_fold_df_D100.to_csv(dr1 + 'SCD_ScerTF_fold_df_D100.csv',index =True)