In [1]:
from pyliftover import LiftOver
import csv
import os

# Module-level coordinate converter used by create_data_dictionary_with_conversion.
# As constructed below it lifts positions from hg38 to hg19.
# NOTE(review): confirm this is the intended direction for the input files;
# for the opposite direction (hg19 -> hg38) use:
# lo = LiftOver('hg19','hg38')
lo = LiftOver('hg38','hg19')

In [4]:
def create_data_dictionary(file):
    """
    Convert a .bed file into a dictionary of merged active regions per chromosome.

    Parameters:
        file (str): path to a tab-separated .bed file; each row is
                    chrom_number, start, end (extra columns are ignored)

    Returns:
        chroms_dict (dict): each key is a chromosome (str) -> 'chr1', 'chr2', ...., 'chrX'
                            each value is a flat, sorted list of points -> even and odd indexed
                            points are start and end of an active region. Overlapping or touching
                            regions are merged into one. A chromosome without any region in the
                            file maps to an empty list.

    Raises:
        ValueError: if the start or end column of a relevant row is not an integer.
    """
    #Create empty dictionary of length 23: {chrom_number: [empty list]}
    chroms_names = ['chr'+str(i) for i in range(1,23)] + ['chrX']
    chroms_dict = {chrom_name:[] for chrom_name in chroms_names}

    #read the .bed file line by line (no need to hold the whole file in memory)
    with open(file, 'r') as f:
        for line in f:
            split_line = line.rstrip('\r\n').split('\t') #chrom_number, start, end

            #skip blank or malformed rows instead of crashing on a missing column
            if len(split_line) < 3:
                continue

            chrom_name = split_line[0]

            if chrom_name in chroms_dict: #only add relevant chromosomes (skip Y, headers, ...)
                chroms_dict[chrom_name].append((int(split_line[1]), int(split_line[2])))

    #all regions are stored: now flatten them into sorted, non-overlapping points
    for chrom_name, regions in chroms_dict.items():
        points = []

        for start, end in sorted(regions):
            if not points or start > points[-1]:
                #region starts after the previous one ended: open a new region
                points.append(start)
                points.append(end)
            elif end > points[-1]:
                #overlap: extend the previous region, but never shrink it
                #(a region fully contained in the previous one changes nothing)
                points[-1] = end

        #replacing the value of an existing key is safe while iterating
        chroms_dict[chrom_name] = points

    return chroms_dict

def create_data_dictionary_with_conversion(file, convert = False):
    """
    Convert a .bed file into a dictionary of merged active regions per chromosome,
    optionally lifting the coordinates to another genome assembly first.

    Parameters:
        file (str): path to a tab-separated .bed file; each row is
                    chrom_number, start, end (extra columns are ignored)
        convert (bool): whether or not to convert the coordinates with the module-level
                        LiftOver object `lo`. The direction of the conversion is whatever
                        `lo` was constructed with.

    Returns:
        chroms_dict (dict): each key is a chromosome (str) -> 'chr1', 'chr2', ...., 'chrX'
                            each value is a flat, sorted list of points -> even and odd indexed
                            points are start and end of an active region. Overlapping or touching
                            regions are merged into one. A chromosome without any region maps
                            to an empty list.

    Raises:
        ValueError: if the start or end column of a relevant row is not an integer.
    """
    #Create empty dictionary of length 23: {chrom_number: [empty list]}
    chroms_names = ['chr'+str(i) for i in range(1,23)] + ['chrX']
    chroms_dict = {chrom_name:[] for chrom_name in chroms_names}

    #read the .bed file line by line
    with open(file, 'r') as f:
        for line in f:
            split_line = line.rstrip('\r\n').split('\t') #chrom_number, start, end

            #skip blank or malformed rows instead of crashing on a missing column
            if len(split_line) < 3:
                continue

            chrom_name = split_line[0]

            if chrom_name not in chroms_dict: #only add relevant chromosomes (skip Y)
                continue

            start,end = int(split_line[1]), int(split_line[2])

            if convert: #lift both end points to the target assembly first
                converted_start = lo.convert_coordinate(chrom_name, start)
                converted_end = lo.convert_coordinate(chrom_name, end)

                #pyliftover returns None for an unknown chromosome and [] when the
                #position has no counterpart: drop the region in both cases
                if not converted_start or not converted_end:
                    continue

                #each hit is (chromosome, position, strand, score); use the best hit
                start_chrom, start = converted_start[0][0], converted_start[0][1]
                end_chrom, end = converted_end[0][0], converted_end[0][1]

                #a region whose end points land on different chromosomes, or on a
                #chromosome that is not tracked, has no meaningful converted interval
                if start_chrom != end_chrom or start_chrom not in chroms_dict:
                    continue
                chrom_name = start_chrom

                #a strand flip can reverse the order of the end points
                if start > end:
                    start,end = end,start

            chroms_dict[chrom_name].append((start,end))

    #all regions are stored: now flatten them into sorted, non-overlapping points
    for chrom_name, regions in chroms_dict.items():
        points = []

        for start, end in sorted(regions):
            if not points or start > points[-1]:
                #region starts after the previous one ended: open a new region
                points.append(start)
                points.append(end)
            elif end > points[-1]:
                #overlap: extend the previous region, but never shrink it
                points[-1] = end

        #replacing the value of an existing key is safe while iterating
        chroms_dict[chrom_name] = points

    return chroms_dict

Clear the CSV files

In [5]:
# Make sure both output csv files exist and are empty before data is appended to them.
# Opening a file in 'w' mode creates it if needed and discards any previous content.
for csv_name in ('region_data.csv', 'meta_data.csv'):
    with open(csv_name, 'w', encoding='UTF8', newline='') as empty_file:
        pass


Loop through each .bed file in the data folder and store its active regions in the CSV file.

In [None]:
all_cell_types = [] #names of the cell types
sizes = [] #amount of active regions per cell type

#NOTE: machine-specific absolute path; point this at the local folder holding the .bed files
folder = "C:\\Users\\20192822\\Documents\\School\\Jaar B3\\Kwart 4\\BEP\\Code\\cistrome_data"

#all .bed filenames in the folder; sorted because os.listdir returns them in arbitrary
#order and the row order in the csv files must be reproducible between runs
data_file_names = sorted(
    f for f in os.listdir(folder)
    if f.lower().endswith('.bed') and os.path.isfile(os.path.join(folder, f))
)

#cell type = filename without its extension (splitext keeps dots inside the name itself)
cell_types = [os.path.splitext(file_name)[0] for file_name in data_file_names]
all_cell_types += cell_types #store cell types

#open the csv once in write mode: re-running this cell overwrites instead of duplicating rows
with open('region_data.csv','w', encoding='UTF8', newline='') as c:
    writer = csv.writer(c)

    #loop through each .bed file
    for file_index, data_file in enumerate(data_file_names):
        path = os.path.join(folder, data_file) #get correct path

        chroms_dict = create_data_dictionary(path) #get data dictionary

        data = list(chroms_dict.values()) #one list of points per chromosome, in dictionary order
        size = sum(len(l) for l in data)//2 #two points (start, end) per active region
        sizes.append(size)

        #write the per-chromosome lists as consecutive csv lines
        writer.writerows(data)

        print('File {} out of {} stored in csv'.format(
                file_index+1, len(data_file_names)))

#store the cell type names, their sizes and the chromosome names in the metadata csv
with open('meta_data.csv','w', encoding='UTF8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(all_cell_types)
    writer.writerow(sizes)
    writer.writerow(['chr'+str(i) for i in range(1,23)] + ['chrX'])
    