## From here, the script determines motif presence in the shuffled, sequence-annotated .bed files and writes a .tsv summarizing the results

### for TGCATG

In [2]:
import os
# Every path used below is resolved relative to the directory the notebook runs in.
main_dir = cwd = os.getcwd()
print(main_dir)

/tscc/lustre/ddn/scratch/q2liang/isSTAMP_publication_scripts/example_analysis_notebooks


In [3]:
# Folder (with trailing slash) that holds the per-sample shuffled cluster files.
shuffled_dir = '{}/shuffled_confident_clusters/'.format(main_dir)

In [4]:
# Base-pairing lookup used when a motif has to be searched on the opposite strand.
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

def reverse_complement(seq):
    """Return ``seq`` reversed, with every base swapped for its partner in ``complement``.

    Characters that have no entry in ``complement`` (for example ``N`` or
    lowercase letters) are passed through unchanged.
    """
    return "".join(map(lambda base: complement.get(base, base), reversed(seq)))

def reverse(seq):
    """Return ``seq`` read back to front, without complementing anything."""
    back_to_front = slice(None, None, -1)
    return seq[back_to_front]

def motif_present(sequence, motif, strand):
    """Report whether ``motif`` occurs in ``sequence`` for the given strand.

    For ``'+'`` the motif is searched as written; for ``'-'`` its reverse
    complement is searched instead. Any other ``strand`` value yields ``None``.
    """
    if strand == '+':
        needle = motif
    elif strand == '-':
        needle = reverse_complement(motif)
    else:
        # Unrecognized strand: no answer rather than a guess.
        return None
    return needle in sequence

def motif_distance_from_center(sequence, motif, strand):
    """Return the signed offset of the motif occurrence closest to the sequence center.

    ``sequence`` is searched for ``motif`` when ``strand`` is ``'+'`` and for
    ``reverse_complement(motif)`` when ``strand`` is ``'-'``.

    The center is ``ceil(len(sequence) / 2)``. An occurrence starting at index
    ``s`` scores ``s + ceil(len(motif) / 2) - center``. Two candidates are
    considered: the rightmost occurrence lying entirely before the center
    (score is zero or negative) and the leftmost occurrence that ends after
    the center or exactly at it. The candidate with the smaller absolute score
    wins; on a tie the second candidate is returned.

    Returns:
        The signed integer offset, or ``None`` when the motif does not occur.

    Raises:
        ValueError: if ``strand`` is neither ``'+'`` nor ``'-'``.
    """
    if strand == '+':
        search_term = motif
    elif strand == '-':
        search_term = reverse_complement(motif)
    else:
        raise ValueError("strand must be '+' or '-', got {!r}".format(strand))

    if search_term not in sequence:
        return None

    motif_length = len(motif)
    # Integer forms of ceil(len / 2); keeps the function free of module-level imports.
    midpoint = (len(sequence) + 1) // 2
    # An occurrence starting at index s has offset s + shift.
    shift = (motif_length + 1) // 2 - midpoint

    # Rightmost occurrence contained entirely in sequence[:midpoint].
    first_half_start = sequence.rfind(search_term, 0, midpoint)
    # Leftmost occurrence starting no earlier than one motif length before the
    # midpoint. The start is clamped at 0 so that a sequence shorter than two
    # motif lengths does not turn into a wrap-around negative index.
    second_half_start = sequence.find(search_term, max(0, midpoint - motif_length))

    # The motif is known to occur, so at least one of the two searches succeeded.
    # Missing candidates are skipped explicitly instead of using a numeric
    # sentinel that could be mistaken for a real distance on long sequences.
    if first_half_start == -1:
        return second_half_start + shift
    if second_half_start == -1:
        return first_half_start + shift

    first_half_offset = first_half_start + shift
    second_half_offset = second_half_start + shift
    if abs(second_half_offset) <= abs(first_half_offset):
        return second_half_offset
    return first_half_offset
    
# Motifs to score; the first entry is the one reported per row.
motifs = ['TGCATG']

def add_sequence_presences(r, window_size=70):
    """Score one row for the first motif in ``motifs``.

    Args:
        r: a row supporting ``r['extended_sequence(+)']``, ``r['sequence(+)']``
            and a ``strand`` attribute (the file passes rows via
            ``DataFrame.apply(..., axis=1)``).
        window_size: not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A ``(present, distance)`` tuple: ``present`` is the result of
        ``motif_present`` on ``r['sequence(+)']`` and ``distance`` the result of
        ``motif_distance_from_center`` on ``r['extended_sequence(+)']``.
    """
    # Use one motif for both measurements so they cannot drift apart when
    # ``motifs`` is edited.
    target_motif = motifs[0]
    strand = r.strand

    distance = motif_distance_from_center(r['extended_sequence(+)'], target_motif, strand)
    present = motif_present(r['sequence(+)'], target_motif, strand)
    return present, distance

def calculate_fractions(p_df):
    """Summarize motif columns of ``p_df`` as per-motif counts and fractions.

    For every entry of the module-level ``motifs`` list, the column of that name
    is summed to give the count, and the count is divided by the number of rows
    to give the fraction. (Presumably the columns hold booleans, so the sum is
    the number of rows containing the motif.)

    Returns:
        ``(fractions_dict, counts_dict)``, both keyed by motif.
    """
    total_rows = len(p_df)
    counts_dict = {motif: p_df[motif].sum() for motif in motifs}
    fractions_dict = {motif: hits / total_rows for motif, hits in counts_dict.items()}
    return fractions_dict, counts_dict

In [5]:
from glob import glob

# Collect every per-sample "*with_sequence.bed" file one folder below shuffled_dir,
# leaving out any path that mentions TDP43.
shuffled_peak_with_sequence_filepaths = [
    candidate
    for candidate in glob(shuffled_dir + '/*/*with_sequence.bed')
    if 'TDP43' not in candidate
]
len(shuffled_peak_with_sequence_filepaths)

40

In [6]:
import pandas as pd
import math

# Per-sample summaries keyed by sample ID, in processing order. The summary
# tables are built once after the loop instead of being re-joined per file.
fractions_by_sample = {}
counts_by_sample = {}

num_peak_per_sample_id = {}

for filepath in shuffled_peak_with_sequence_filepaths:
    # Take folder and sample ID from the path components themselves; splitting
    # the full path on the sample ID breaks when a parent directory name
    # happens to contain that ID.
    folder, filename = os.path.split(filepath)
    sample_id = filename.split('.with_sequence.bed')[0]
    if sample_id in num_peak_per_sample_id:
        # Two files mapping to one column would silently overwrite each other.
        raise ValueError('Duplicate sample ID {!r} (from {})'.format(sample_id, filepath))

    print('\t', folder, sample_id)

    output_filename = os.path.join(folder, '{}.motif_presence.bed'.format(sample_id))
    print('\t\t', output_filename)

    print('\t...Reading...')
    p_df = pd.read_csv(filepath, sep='\t')
    num_peaks = len(p_df)
    num_peak_per_sample_id[sample_id] = num_peaks

    # add_sequence_presences returns one (present, distance) pair per row;
    # zip(*...) turns the pairs into one sequence per output column.
    p_df[motifs[0]], p_df['TGCATG_dist_from_center'] = zip(*p_df.apply(add_sequence_presences, axis=1))

    print('\tOutputting file {}'.format(output_filename))
    p_df.to_csv(output_filename, sep='\t', index=False, header=True)

    fractions_by_sample[sample_id], counts_by_sample[sample_id] = calculate_fractions(p_df)
    print('\t', len(fractions_by_sample))

# Rows are motifs, columns are sample IDs.
all_fractions = pd.DataFrame(fractions_by_sample)
all_counts = pd.DataFrame(counts_by_sample)

all_fractions[sorted(all_fractions.columns)].to_csv(shuffled_dir + 'RBFOX_cleaned_motif_presence_in_shuffles.tsv', sep='\t')



	 /tscc/lustre/ddn/scratch/q2liang/isSTAMP_publication_scripts/example_analysis_notebooks/shuffled_confident_clusters/Enzyme_Only_PFA/ Enzyme_Only_PFA_shuffle2
		 /tscc/lustre/ddn/scratch/q2liang/isSTAMP_publication_scripts/example_analysis_notebooks/shuffled_confident_clusters/Enzyme_Only_PFA//Enzyme_Only_PFA_shuffle2.motif_presence.bed
	...Reading...
	Outputting file /tscc/lustre/ddn/scratch/q2liang/isSTAMP_publication_scripts/example_analysis_notebooks/shuffled_confident_clusters/Enzyme_Only_PFA//Enzyme_Only_PFA_shuffle2.motif_presence.bed
	 1
	 /tscc/lustre/ddn/scratch/q2liang/isSTAMP_publication_scripts/example_analysis_notebooks/shuffled_confident_clusters/Enzyme_Only_PFA/ Enzyme_Only_PFA_shuffle19
		 /tscc/lustre/ddn/scratch/q2liang/isSTAMP_publication_scripts/example_analysis_notebooks/shuffled_confident_clusters/Enzyme_Only_PFA//Enzyme_Only_PFA_shuffle19.motif_presence.bed
	...Reading...
	Outputting file /tscc/lustre/ddn/scratch/q2liang/isSTAMP_publication_scripts/example_anal

In [7]:
all_fractions.transpose()

Unnamed: 0,TGCATG
Enzyme_Only_PFA_shuffle2,0.014075
Enzyme_Only_PFA_shuffle19,0.017426
Enzyme_Only_PFA_shuffle1,0.014075
Enzyme_Only_PFA_shuffle4,0.017426
Enzyme_Only_PFA_shuffle14,0.013405
Enzyme_Only_PFA_shuffle17,0.014745
Enzyme_Only_PFA_shuffle8,0.012064
Enzyme_Only_PFA_shuffle18,0.014745
Enzyme_Only_PFA_shuffle11,0.014745
Enzyme_Only_PFA_shuffle9,0.013405
