In [1]:
# Imports libraries to be used in the code below
import pandas as pd
import numpy as np
import random
from Bio import SeqIO
from Bio import Entrez
import pybedtools

In [2]:
# Parse the Arabidopsis thaliana promoter FASTA and keep, for every record,
# its identifier and its sequence converted to a plain string.
ara_tha_file = SeqIO.parse('/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/araTha1.txt','fasta')
ara_tha_records = list(ara_tha_file)
ara_tha_ids = [record.id for record in ara_tha_records]
ara_tha_seqs = [str(record.seq) for record in ara_tha_records]

# One row per FASTA record: identifier in "Promoter ID", bases in "Sequence".
ara_tha_df = pd.DataFrame({"Promoter ID":ara_tha_ids, "Sequence":ara_tha_seqs})

# Same procedure for the mouse promoter FASTA.
mouse_mus_file = SeqIO.parse('/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/mm10.txt','fasta')
mouse_mus_records = list(mouse_mus_file)
mouse_mus_ids = [record.id for record in mouse_mus_records]
mouse_mus_seqs = [str(record.seq) for record in mouse_mus_records]

mouse_mus_df = pd.DataFrame({"Promoter ID":mouse_mus_ids, "Sequence":mouse_mus_seqs})

In [3]:
def _promoter_windows(bed_df, chrom="chr1", upstream=250, downstream=100):
    """Return the rows of ``bed_df`` on ``chrom`` with widened coordinates.

    The input BED files only give the coordinate of the transcription start
    site (read here from column 1), so columns 1 and 2 are rewritten to
    ``start - upstream`` and ``start + downstream``. The offsets are based on
    the properties of the EPD database that will be explained in the report.

    Parameters
    ----------
    bed_df : pandas.DataFrame
        BED table read with ``header=None`` (integer column labels).
    chrom : str
        Value of column 0 to keep.
    upstream, downstream : int
        Number of bases subtracted from / added to the start coordinate.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows; ``bed_df`` is not modified.
    """
    # .copy() so the assignments below act on our own frame rather than on a
    # slice of bed_df (avoids pandas' SettingWithCopy chained-assignment issue).
    windows = bed_df[bed_df[0] == chrom].copy()
    tss = windows[1].copy()
    # BED coordinates cannot be negative; clamp windows that would start
    # before the beginning of the chromosome.
    windows[1] = (tss - upstream).clip(lower=0)
    windows[2] = tss + downstream
    # NOTE(review): the same offsets are applied whatever the strand column
    # says; confirm this is intended for '-' strand promoters.
    return windows


ara_tha_bed = pd.read_csv('/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/arabidopsis_epdnew_KZQd3.bed',
                         sep = '\t',header = None)
ara_tha_bed_chr1 = _promoter_windows(ara_tha_bed)

# header=False: the columns have integer labels, so pandas would otherwise
# write a first line "0 1 2 3 4 5" that is not part of the BED data.
ara_tha_bed_chr1.to_csv('/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/ara_tha_bed_chr1.tsv',
                        sep='\t', header=False, index=False)

mouse_mus_bed = pd.read_csv('/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/mouse_epdnew_HlytC.bed',
                         sep = '\t',header = None)
mouse_mus_bed_chr1 = _promoter_windows(mouse_mus_bed)

mouse_mus_bed_chr1.to_csv('/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/mouse_mus_bed_chr1.tsv',
                          sep='\t', header=False, index=False)

In [4]:
# Contact address that Biopython sends to NCBI with Entrez requests.
Entrez.email = "b.hyunyi@gmail.com"

# Nucleotide accession downloaded here; the result is used further down as
# the "chr1" reference sequence.
ara_tha_id = "NC_003070.9"

# Download the record as FASTA text and release the handle once it is read.
ara_handle = Entrez.efetch(db="nucleotide", id=ara_tha_id, rettype="fasta", retmode="text")
ara_tha_fasta = ara_handle.read()
ara_handle.close()

# Everything after the first line break is sequence data (the first line is
# the FASTA header); removing the remaining line breaks yields one string.
ara_tha_body = ara_tha_fasta.partition("\n")[2]
ara_tha_ref = ara_tha_body.replace("\n", "")


In [5]:
## extract non-promoter regions ###
# Dedicated, seeded generator so the sampled negatives are the same on every
# run (the module-level random functions were previously used unseeded).
non_pro_ara_rng = random.Random(42)
# Length of every sampled sequence; 350 = the 250 + 100 base window that was
# written to ara_tha_bed_chr1.tsv.
non_pro_ara_len = 350

# A single interval spanning the whole downloaded reference sequence ...
ara_tha_ref_bed = pybedtools.BedTool([("chr1", 0, len(ara_tha_ref))])

# ... minus every promoter window leaves the non-promoter intervals.
non_pro_ara_bed = ara_tha_ref_bed.subtract(
    '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/ara_tha_bed_chr1.tsv')

# Slice the reference string for each remaining interval.
non_pro_ara_seqs = [ara_tha_ref[interval.start:interval.end] for interval in non_pro_ara_bed]

# From each interval that is long enough, take one random window of
# non_pro_ara_len bases and keep it only if it contains no "N".
non_pro_ara_seqs_adj = []
for seq in non_pro_ara_seqs:
    if len(seq) < non_pro_ara_len:
        continue
    start_index = non_pro_ara_rng.randint(0, len(seq) - non_pro_ara_len)
    rand_seq = seq[start_index:start_index + non_pro_ara_len]
    if "N" not in rand_seq:
        non_pro_ara_seqs_adj.append(rand_seq)

# The constant "Non-promoter" string is broadcast to every sampled sequence.
non_pro_ara_df = pd.DataFrame({"Promoter ID":"Non-promoter", "Sequence":non_pro_ara_seqs_adj})

# Promoter rows first, then the sampled non-promoter rows, with a fresh index.
ara_tha_df_final = pd.concat([ara_tha_df, non_pro_ara_df], axis=0, ignore_index=True)

chr1	22884	23234	AT1G01040_1	1	+

chr1	22884	23234	AT1G01040_1	1	+



In [6]:
# Write the combined Arabidopsis table (promoter rows followed by the sampled
# "Non-promoter" rows) to CSV, leaving out the DataFrame index.
ara_tha_final_csv = '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/ara_tha_final.csv'
ara_tha_df_final.to_csv(ara_tha_final_csv, index=False)



In [7]:
# Mouse chromosome 1 reference. The promoter FASTA (mm10.txt) and the promoter
# coordinates subtracted from this sequence are in the mm10 / GRCm38 coordinate
# system, so the reference must be the GRCm38 record: NC_000067.6.
# NC_000067.7 (used previously) is the GRCm39 assembly, where the same
# positions point at different bases.
# NOTE(review): confirm the EPD BED export is indeed mm10-based.
mouse_mus_id = "NC_000067.6"

# Uses the Entrez.email configured earlier in the notebook.
mouse_handle = Entrez.efetch(db="nucleotide", id=mouse_mus_id, rettype="fasta", retmode="text")

mouse_mus_ref = mouse_handle.read()
mouse_handle.close()

# Drop the first line (the FASTA header) ...
mouse_mus_ref = mouse_mus_ref.split("\n")[1:]

# ... and join the remaining lines into one continuous sequence string.
mouse_mus_ref = "".join(mouse_mus_ref)

In [8]:
## extract non-promoter regions ###
# Dedicated, seeded generator so the sampled negatives are the same on every
# run (the module-level random functions were previously used unseeded).
non_pro_mouse_rng = random.Random(42)
# Length of every sampled sequence; 350 = the 250 + 100 base window that was
# written to mouse_mus_bed_chr1.tsv.
non_pro_mouse_len = 350

# A single interval spanning the whole downloaded reference sequence ...
mouse_mus_ref_bed = pybedtools.BedTool([("chr1", 0, len(mouse_mus_ref))])

# ... minus every promoter window leaves the non-promoter intervals.
non_pro_mouse_bed = mouse_mus_ref_bed.subtract(
    '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/mouse_mus_bed_chr1.tsv')

# Slice the reference string for each remaining interval.
non_pro_mouse_seqs = [mouse_mus_ref[interval.start:interval.end] for interval in non_pro_mouse_bed]

# From each interval that is long enough, take one random window of
# non_pro_mouse_len bases and keep it only if it contains no "N".
non_pro_mouse_seqs_adj = []
for seq in non_pro_mouse_seqs:
    if len(seq) < non_pro_mouse_len:
        continue
    start_index = non_pro_mouse_rng.randint(0, len(seq) - non_pro_mouse_len)
    rand_seq = seq[start_index:start_index + non_pro_mouse_len]
    if "N" not in rand_seq:
        non_pro_mouse_seqs_adj.append(rand_seq)

# The constant "Non-promoter" string is broadcast to every sampled sequence.
non_pro_mouse_df = pd.DataFrame({"Promoter ID":"Non-promoter", "Sequence":non_pro_mouse_seqs_adj})

# Promoter rows first, then the sampled non-promoter rows, with a fresh index.
mouse_mus_df_final = pd.concat([mouse_mus_df, non_pro_mouse_df], axis=0, ignore_index=True)

chr1	4360014	4360364	Rp1_2	1	-

chr1	4360014	4360364	Rp1_2	1	-



In [9]:
# Write the combined mouse table (promoter rows followed by the sampled
# "Non-promoter" rows) to CSV, leaving out the DataFrame index.
mouse_mus_final_csv = '/Users/b.hyunyi/Desktop/Machine Learning Modules/Final_project/mouse_mus_final.csv'
mouse_mus_df_final.to_csv(mouse_mus_final_csv, index=False)
