In [1]:
# Imports libraries to be used in the code below
import pandas as pd
import numpy as np
import random
from Bio import SeqIO
from Bio import Entrez
import pybedtools

In [2]:
def generate_chr_bed(bed_file, chromosome, species_bed_name,
                     output_dir='/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/'):
    """Write the promoter windows of one chromosome to a tab-separated interval file.

    The rows of ``bed_file`` whose first column (label ``0``) equals
    ``chromosome`` are selected. The second column of those rows is treated as
    the transcription start site (TSS), because the input files only carry the
    TSS coordinate. Each row is rewritten as the window ``[TSS - 250, TSS + 100]``;
    the offsets follow the EPD database conventions described in the report.

    Parameters
    ----------
    bed_file : pandas.DataFrame
        Table read with ``header=None``; column 0 holds the chromosome label,
        the second column the TSS and the third column the interval end.
        It is not modified.
    chromosome : str
        Chromosome label to select, e.g. ``"chr1"``.
    species_bed_name : str
        File-name prefix; the output is ``<species_bed_name><chromosome>.tsv``.
    output_dir : str, optional
        Directory the file is written to. Defaults to the project directory
        used throughout this notebook.

    Returns
    -------
    None
        The result is written to disk only.

    Notes
    -----
    NOTE(review): the same -250/+100 offsets are applied to rows on both
    strands; confirm whether '-' strand promoters should be mirrored.
    """
    import os

    # Work on an explicit copy so the assignments below never touch (or warn
    # about touching) the caller's frame.
    bed_chr_file = bed_file[bed_file[0] == chromosome].copy()

    start_col = bed_chr_file.columns[1]
    end_col = bed_chr_file.columns[2]
    tss = bed_chr_file[start_col].copy()

    # Vectorised replacement of the former row-by-row loop. The start is
    # clipped at 0 because a TSS closer than 250 bp to the chromosome start
    # would otherwise produce a negative coordinate.
    bed_chr_file[start_col] = (tss - 250).clip(lower=0)
    bed_chr_file[end_col] = tss + 100

    file_name = os.path.join(output_dir, species_bed_name + chromosome + '.tsv')
    # header=False: without it pandas writes the integer column labels as a
    # first line, which an interval parser may read as a bogus record.
    bed_chr_file.to_csv(file_name, sep='\t', index=False, header=False)

    

In [3]:
def find_reference_seq(chr_id):
    """Fetch one nucleotide record as FASTA text and return its bare sequence.

    Parameters
    ----------
    chr_id : str
        Identifier passed as ``id`` to ``Entrez.efetch`` against the
        ``nucleotide`` database (e.g. an accession such as ``"NC_000001.11"``).

    Returns
    -------
    str
        All lines after the first (header) line of the returned text, joined
        into a single string with the line breaks removed.

    Raises
    ------
    Whatever ``Entrez.efetch`` or ``handle.read`` raises is propagated; the
    handle is closed in either case.
    """
    handle = Entrez.efetch(db="nucleotide", id=chr_id, rettype="fasta", retmode="text")
    try:
        fasta_text = handle.read()
    finally:
        # Release the handle even when the read fails part-way.
        handle.close()

    # splitlines() also strips "\r" from CRLF-terminated text, which a plain
    # split("\n") would leave embedded in the sequence.
    sequence_lines = fasta_text.splitlines()[1:]
    return "".join(sequence_lines)

In [4]:
def find_non_pro_seqs(ref_seqs, species_bed_name, seq_length=351,
                      bed_dir='/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/',
                      seed=None):
    """Sample fixed-length non-promoter windows from a list of reference sequences.

    For the i-th sequence in ``ref_seqs`` the whole span ``(0, len(seq))`` is
    labelled ``chr<i+1>`` and the intervals listed in
    ``<bed_dir>/<species_bed_name>chr<i+1>.tsv`` are subtracted from it with
    pybedtools. From every remaining interval that is at least ``seq_length``
    long, one window of exactly ``seq_length`` characters is drawn at a random
    offset; windows containing ``N`` (in either case) are discarded.

    Parameters
    ----------
    ref_seqs : list of str
        Reference sequences; list position determines the chromosome label.
    species_bed_name : str
        Prefix of the per-chromosome interval files to subtract.
    seq_length : int, optional
        Length of each sampled window (default 351).
    bed_dir : str, optional
        Directory holding the per-chromosome interval files. Defaults to the
        project directory used throughout this notebook.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` makes the sampling
        reproducible. When ``None`` the module-level ``random`` generator is
        used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Promoter ID"`` (always ``"Non-promoter"``) and
        ``"Sequence"`` (the sampled windows).
    """
    import os

    rng = random if seed is None else random.Random(seed)

    non_pro_seqs_adj = []
    for index, seq in enumerate(ref_seqs):
        chr_label = "chr" + str(index + 1)
        ref_bed = pybedtools.BedTool([(chr_label, 0, len(seq))])

        file_name = os.path.join(bed_dir, species_bed_name + chr_label + ".tsv")
        non_pro_bed = ref_bed.subtract(file_name)

        for interval in non_pro_bed:
            start, end = interval.start, interval.end
            span = end - start
            if span < seq_length:
                continue
            # Slice only the sampled window. Copying each whole interval first
            # would duplicate nearly the entire reference sequence in memory.
            offset = start + rng.randint(0, span - seq_length)
            window = seq[offset:offset + seq_length]
            # Reject ambiguous bases regardless of letter case.
            if "N" not in window.upper():
                non_pro_seqs_adj.append(window)

    non_pro_df = pd.DataFrame({"Promoter ID": "Non-promoter", "Sequence": non_pro_seqs_adj})

    return non_pro_df
        
        

In [5]:
# Directory holding the per-species promoter FASTA files read in this cell.
PROMOTER_FASTA_DIR = '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/'


def load_promoter_fasta(fasta_path):
    """Read a FASTA file into id/sequence lists and a two-column DataFrame.

    Parameters
    ----------
    fasta_path : str
        Path of the FASTA file, parsed with ``SeqIO.parse(..., 'fasta')``.

    Returns
    -------
    tuple
        ``(ids, seqs, frame)``: the record ids, the record sequences as plain
        strings, and a DataFrame with columns ``"Promoter ID"`` and
        ``"Sequence"`` built from those two lists.
    """
    promoter_ids = []
    promoter_seqs = []
    for record in SeqIO.parse(fasta_path, 'fasta'):
        promoter_ids.append(record.id)
        promoter_seqs.append(str(record.seq))

    promoter_df = pd.DataFrame({"Promoter ID": promoter_ids, "Sequence": promoter_seqs})
    return promoter_ids, promoter_seqs, promoter_df


# One call per species replaces three copies of the same loading loop.
ara_tha_ids, ara_tha_seqs, ara_tha_df = load_promoter_fasta(PROMOTER_FASTA_DIR + 'araTha1.txt')

mouse_mus_ids, mouse_mus_seqs, mouse_mus_df = load_promoter_fasta(PROMOTER_FASTA_DIR + 'mm10.txt')

human_ids, human_seqs, human_df = load_promoter_fasta(PROMOTER_FASTA_DIR + 'hg38.txt')

In [6]:
def write_chr_beds(bed_path, num_chr, bed_name):
    """Load a headerless tab-separated BED file and write one window file per chromosome.

    ``generate_chr_bed`` is called once for each label ``chr1`` .. ``chr<num_chr>``.

    Parameters
    ----------
    bed_path : str
        Path of the tab-separated file, read with ``header=None``.
    num_chr : int
        Number of numbered chromosomes to process.
    bed_name : str
        File-name prefix forwarded to ``generate_chr_bed``.

    Returns
    -------
    pandas.DataFrame
        The table that was read from ``bed_path``.
    """
    bed = pd.read_csv(bed_path, sep='\t', header=None)
    for chr_number in range(1, num_chr + 1):
        generate_chr_bed(bed, "chr" + str(chr_number), bed_name)
    return bed


# Arabidopsis thaliana
num_chr_ara = 5
ara_bed_name = "ara_tha_bed_"
ara_tha_bed = write_chr_beds(
    '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/arabidopsis_epdnew_KZQd3.bed',
    num_chr_ara, ara_bed_name)

# Mouse
num_chr_mouse = 19
mouse_bed_name = "mouse_mus_bed_"
mouse_mus_bed = write_chr_beds(
    '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/mouse_epdnew_HlytC.bed',
    num_chr_mouse, mouse_bed_name)

# Human
# NOTE(review): only chr1..chr20 are processed, matching the 20 accessions
# fetched for the human reference; confirm the remaining chromosomes are
# excluded on purpose.
num_chr_human = 20
human_bed_name = "human_bed_"
human_bed = write_chr_beds(
    '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/human_epdnew_Lyu0l.bed',
    num_chr_human, human_bed_name)


In [7]:
# Set the contact e-mail attribute on Bio.Entrez before the cells below call
# Entrez.efetch through find_reference_seq.
# NOTE(review): a personal address is hard-coded here — consider reading it from
# an environment variable before sharing the notebook.
Entrez.email = "b.hyunyi@gmail.com"

In [None]:
# Arabidopsis thaliana reference accessions.
# List order matters: find_non_pro_seqs labels the i-th downloaded sequence chr<i+1>.
ara_tha_ids = [
    "NC_003070.9",
    "NC_003071.7",
    "NC_003074.8",
    "NC_003075.7",
    "NC_003076.8",
]
ara_tha_ref = list(map(find_reference_seq, ara_tha_ids))

# Mouse (Mus musculus) reference accessions, in the same positional order.
mouse_mus_ids = [
    "NC_000067.7",
    "NC_000068.8",
    "NC_000069.7",
    "NC_000070.7",
    "NC_000071.7",
    "NC_000072.7",
    "NC_000073.7",
    "NC_000074.7",
    "NC_000075.7",
    "NC_000076.7",
    "NC_000077.7",
    "NC_000078.7",
    "NC_000079.7",
    "NC_000080.7",
    "NC_000081.7",
    "NC_000082.7",
    "NC_000083.7",
    "NC_000084.7",
    "NC_000085.7",
]
mouse_mus_ref = list(map(find_reference_seq, mouse_mus_ids))

In [8]:
# Human reference accessions.
# List order matters: find_non_pro_seqs labels the i-th downloaded sequence chr<i+1>.
human_ids = [
    "NC_000001.11",
    "NC_000002.12",
    "NC_000003.12",
    "NC_000004.12",
    "NC_000005.10",
    "NC_000006.12",
    "NC_000007.14",
    "NC_000008.11",
    "NC_000009.12",
    "NC_000010.11",
    "NC_000011.10",
    "NC_000012.12",
    "NC_000013.11",
    "NC_000014.9",
    "NC_000015.10",
    "NC_000016.10",
    "NC_000017.11",
    "NC_000018.10",
    "NC_000019.10",
    "NC_000020.11",
]
human_ref = list(map(find_reference_seq, human_ids))

In [None]:
# Arabidopsis: sample non-promoter windows, then stack them beneath the promoter rows.
non_pro_ara_df = find_non_pro_seqs(ara_tha_ref, "ara_tha_bed_")
ara_tha_df_final = pd.concat(
    objs=(ara_tha_df, non_pro_ara_df),
    ignore_index=True,
)

# Mouse: same two steps.
non_pro_mouse_df = find_non_pro_seqs(mouse_mus_ref, "mouse_mus_bed_")
mouse_mus_df_final = pd.concat(
    objs=(mouse_mus_df, non_pro_mouse_df),
    ignore_index=True,
)

In [9]:
# Human: sample non-promoter windows, then stack them beneath the promoter rows.
non_pro_human_df = find_non_pro_seqs(human_ref, "human_bed_")
human_df_final = pd.concat(
    objs=(human_df, non_pro_human_df),
    ignore_index=True,
)

chr1	959005	959355	NOC2L_1	1	-

chr1	959005	959355	NOC2L_1	1	-

chr2	46606	46956	FAM110C_1	1	-

chr2	46606	46956	FAM110C_1	1	-

chr3	196345	196695	CHL1_3	1	+

chr3	196345	196695	CHL1_3	1	+

chr4	53070	53420	ZNF595_1	1	+

chr4	53070	53420	ZNF595_1	1	+

chr5	91917	92267	PLEKHG4B_1	1	+

chr5	91917	92267	PLEKHG4B_1	1	+

chr6	292272	292622	DUSP22_2	1	+

chr6	292272	292622	DUSP22_2	1	+

chr7	192635	192985	FAM20C_1	1	+

chr7	192635	192985	FAM20C_1	1	+

chr8	231798	232148	ZNF596_2	1	+

chr8	231798	232148	ZNF596_2	1	+

chr9	178802	179152	CBWD1_1	1	-

chr9	178802	179152	CBWD1_1	1	-

chr10	134233	134583	ZMYND11_3	1	+

chr10	134233	134583	ZMYND11_3	1	+

chr11	192830	193180	SCGB1C1_1	1	+

chr11	192830	193180	SCGB1C1_1	1	+

chr12	210263	210613	SLC6A12_3	1	-

chr12	210263	210613	SLC6A12_3	1	-

chr13	19181531	19181881	TUBA3C_1	1	-

chr13	19181531	19181881	TUBA3C_1	1	-

chr14	20333028	20333378	CCNB1IP1_2	1	-

chr14	20333028	20333378	CCNB1IP1_2	1	-

chr15	20940187	20940537	RP11-294C11_1	1	+

chr15	20940

In [None]:
# Save the combined promoter / non-promoter tables, one CSV per species, without the row index.
for combined_df, csv_path in (
    (ara_tha_df_final,
     '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/ara_tha_final.csv'),
    (mouse_mus_df_final,
     '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/mouse_mus_final.csv'),
):
    combined_df.to_csv(csv_path, index=False)


In [12]:
# Save the combined human promoter / non-promoter table without the row index.
human_csv_path = '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/human_final.csv'
human_df_final.to_csv(path_or_buf=human_csv_path, index=False)


In [13]:
# Display the human promoter table built from hg38.txt
# (columns "Promoter ID" and "Sequence").
human_df

Unnamed: 0,Promoter ID,Sequence
0,FP007175,CGGACCTCTGGTGGACATGGCTGCTTCCCGCTTACCCCCAGCGACG...
1,FP016091,TACTGAGATAAATAGACCAAACGAAATTTGGGAGGTTAACTTAACT...
2,FP006261,GATTGCTTATCCCTTAGTACAGGGCTTAATAGATATTTGTTATTAA...
3,FP001986,CGGGTTCCGGTGGGAGCCCCAACTCTGGACCGCGATTCGCGAGCCT...
4,FP011779,GTTATTTTTGTTAATCTCTTACTGTGCCTAATTTATAAATTAAACT...
...,...,...
21066,FP015206,CCACAGGGCTCTCCACTGCCTCAGTCCCCGCCCTCCCTGCCGTCTC...
21067,FP011205,TCGGGCCCCAAAGAACCCTGGAGACCCTCAACCAGGACACAGGTGG...
21068,FP006214,GTCTTATATCCAAGGTGCTACGAAGCTCACTCTGAACCCTGCACCC...
21069,FP008231,GCGCCTAGTCCAGCCCTCCCGTCCCAGGGCCCCGCAGCACGCTGGG...


In [11]:
# Display the human non-promoter windows returned by find_non_pro_seqs.
# NOTE(review): this cell's execution count (11) is lower than the cells above it;
# re-run the notebook top to bottom to confirm the shown output is current.
non_pro_human_df

Unnamed: 0,Promoter ID,Sequence
0,Non-promoter,CATTAGATTTCACCAAGATGTCTTGCTTGTGGGAAAGACTTCCAAG...
1,Non-promoter,GCCGCGGGGCCCCGGGGCTCCCCGGAGGAGAGCAAGTTAGGGGGTC...
2,Non-promoter,ATCCCTTCCTGCAGCCAGGGGCTCACCCCGCCTTCCCCCCAGGAGC...
3,Non-promoter,CTGTCTGTTCACATTCCTGGGCCTGCTCAGGGGCAGCTAAGCTTGG...
4,Non-promoter,GTGGCTTAAGCACCACCTGCCTATTCTTTCCGTAGTGGGGATCAGG...
...,...,...
15849,Non-promoter,GCCAGCCAGGGAAGTGGGGGTGGCAGGGGTGGGGGGCACCCTGCTC...
15850,Non-promoter,GCTCCTGTCCTCCCCAGCACAAGCACTCCACCCGGATAGTTCCCAG...
15851,Non-promoter,CTTTCCTCCTCCTCCCCATCCCAACCCCTCATCCTCCCTGTCTCTG...
15852,Non-promoter,TCAGTTTTGTGCATCACAAACGCACATGCTCCCTCAGTTTTGGGCA...
