## Setup 

In [234]:
# Conda environment to use: annot

# References for sources used to write this script:
#   - Gene annotation methods: https://medium.com/intothegenomics/annotate-genes-and-genomic-coordinates-using-python-9259efa6ffc2
#   - Gencode annotation file format: https://www.gencodegenes.org/pages/data_format.html

# figure out how to interpret the chromosome labels
# https://genome.ucsc.edu/cgi-bin/hgTracks?chromInfoPage=

# Import packages
import pandas as pd
import numpy as np
import pysam

# Assign directories and files to use for annotation references
#annotation_file = "/Users/meghansleeper/Python/deconvolution_projects/dmr_workflow/supplementary/annotation_beds/gencode.v44.annotation.all.sorted.formatted.bed.gz"
annotation_file_genes = "/Users/meghansleeper/Python/deconvolution_projects/dmr_workflow/supplementary/annotation_beds/gencode.v44.annotation.genes.sorted.formatted.bed.gz"
annotation_file_regulatory = "/Users/meghansleeper/Python/deconvolution_projects/dmr_workflow/supplementary/annotation_beds/ensembl.GRch38.p14.regulatory.sorted.formatted.bed.gz"

# DMR regions to annotate (tab-delimited bed formatted file)
DMR_file = "/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers.tsv"

# Output file names
output_file_per_gene = "/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated_per_gene.tsv"
output_file = "/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated.tsv"
output_file_per_reg = "/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated_per_reg.tsv"


In [198]:
# Reference for genes only: 
# Read reference annotation file with tabix
gencode_v44_genes = pysam.TabixFile(annotation_file_genes)
# read in the annotation file as a pandas dataframe
annot_df_gene = pd.read_csv(annotation_file_genes, sep='\t', header=None, names=['chr', 'start', 'end', 
                                                                                'attribute', 'feature', 'strand', 
                                                                                'source', 'gene_name', 'gene_id', 
                                                                                'gene_type', 'gene_level'])
# Reference for regulatory features from ensembl regulatory build:
# Read reference annotation file with tabix
regulatory_build = pysam.TabixFile(annotation_file_regulatory)
annot_df_regulatory = pd.read_csv(annotation_file_regulatory, 
                                  sep='\t', 
                                  header=0, 
                                  low_memory=False, 
                                  names=['chr', 'start', 'end',
                                         'feature_type', 'feature_type_description'])

# # Reference for all features: 
# # Read reference annotation file with tabix
# gencode_v44 = pysam.TabixFile(annotation_file)
# # read in the annotation file as a pandas dataframe
# annot_df = pd.read_csv(annotation_file, sep='\t', header=None, names=['chr', 'start', 'end', 
#                                                                       'attribute', 'feature', 'strand', 
#                                                                       'source', 'gene_name', 'gene_id', 
#                                                                       'gene_type', 'gene_level', 'exon_id', 
#                                                                       'exon_number', 'transcript_name',
#                                                                       'transcript_type'])


In [199]:
### Read the DMR sample file in to a pandas dataframe
df = pd.read_table(DMR_file, low_memory=False, header=0)

## Annotation file inspection

In [200]:
annot_df_regulatory.head()

Unnamed: 0,chr,start,end,feature_type,feature_type_description
0,chr1,12802,16450,Enhancer,Predicted enhancer region
1,chr1,13401,13600,CTCF Binding Site,CTCF binding site
2,chr1,14001,14200,CTCF Binding Site,CTCF binding site
3,chr1,15401,16600,CTCF Binding Site,CTCF binding site
4,chr1,25601,26600,CTCF Binding Site,CTCF binding site


In [201]:
# drop unwanted columns from annotation dfs
# genes only:
annot_df_gene = annot_df_gene.drop(['attribute'], axis=1)

# all features:
# annot_df = annot_df.drop(['attribute'], axis=1)

# regulatory features:
# what are the possible feature types in the regulatory build?
annot_df_regulatory.feature_type.unique()

array(['Enhancer', 'CTCF Binding Site', 'Promoter', 'Open chromatin',
       'TF binding'], dtype=object)

In [202]:
# # confirming that the gene annotation file matches the genes in all annotation file
# # genes only:
# display(annot_df_gene.loc[(annot_df_gene['chr']=='chr1')])
# # all features:
# display(annot_df.loc[(annot_df['chr']=='chr1')])

## DMR file inspection

In [203]:
# figure out syntax to add data type to silence warning --> must be specified dtype like np.int16 
# see: https://stackoverflow.com/questions/24251219/pandas-read-csv-low-memory-and-dtype-options

### Clean DMR file
# remove CpGs from lenCpG values dataframe
df['lenCpG'] = df['lenCpG'].apply(lambda x: x.split("CpGs")[0])

# remove bp from bp values in dataframe
df['bp'] = df['bp'].apply(lambda x: x.split("bp")[0])

# rename chromosome column
df.rename(columns = {'#chr':'chr'}, inplace = True)

# uncomment below line if you want to drop unwanted columns
#df = df.drop(['startCpG', 'endCpG', 'len_bp', 'region', 'tg_mean', 'bg_mean', 'delta_quants', 'delta_maxmin'], axis=1)

## Defining functions

In [204]:
# Determine if there is overlap between genes and DMRs
# query and reference
def overlap(q_st, q_end, ref_st, ref_end):
    overlap  = min(q_end, ref_end)-max(q_st, ref_st)
    return overlap

In [205]:
# # loci is a row in a dataframe `a` with columns chr, start, end ['#chr', 'start', 'end']
# # `tb` tabix is the indexed annotation file as a pysam.TabixFile object
def gencode_annotate_loci(a, tb):
    genes = []
    gene_type = []
    gene_level = []
    gene_id = []
    strand = []

    try:
        # reads the tabix file
        # columns in tabix file: ['chr', 'start', 'end', 'attribute', 'feature', 
                                # 'strand', 'source', 'gene_name', 'gene_id','
                                # gene_type', 'gene_level'])
        
        for region in tb.fetch(a['chr'], int(a['start']), int(a['end'])):
            if region:
                r = region.split('\t')

                ### Calculate length of the overlap between the query interval and the gencode gene
                overlap_len = overlap(int(a['start']), int(a['end']), int(r[1]), int(r[2]))

                ### first value is the percentage of the query interval that overlap with the gencode gene
                ### second value is the percentage of the gencode gene that overlaps with the query interval
                ret_val = '{}({})({})'.format(r[7], 
                                              np.round(overlap_len/float(int(a['end'])-int(a['start']))*100, 2), 
                                              np.round(overlap_len/float(int(r[2])-int(r[1]))*100, 2)
                                              ) 

                genes.append(ret_val) 
                gene_type.append(r[9])
                gene_level.append(r[10])
                gene_id.append(r[8])
                strand.append(r[5])
    
        if len(genes)>0:
            return [";".join(genes), ";".join(gene_type), ";".join(gene_level), ";".join(gene_id), ";".join(strand)]
        else:
            return ["NA(0)(0)", "NA", np.nan, "NA", "NA"]
    except ValueError:
        return ["NA(0)(0)", "NA", np.nan, "NA", "NA"]

In [206]:
# function to annotated DMRs with regulatory features and percent coverage
# # loci is a row in a dataframe `a` with columns chr, start, end ['#chr', 'start', 'end']
# # `tb` tabix is the indexed annotation file as a pysam.TabixFile object
def regulatory_annotate_loci(a, tb):
    reg_features = []

    try:
        # reads the tabix file
        # columns in tabix file: ['chr', 'start', 'end', 'feature_type', 'feature_type_description']
        
        for region in tb.fetch(a['chr'], int(a['start']), int(a['end'])):
            if region:
                r = region.split('\t')

                ### Calculate length of the overlap between the query interval and the gencode gene
                overlap_len = overlap(int(a['start']), int(a['end']), int(r[1]), int(r[2]))

                ### first value is the percentage of the query interval that overlap with the gencode gene
                ### second value is the percentage of the gencode gene that overlaps with the query interval
                ret_val = '{}({})({})'.format(r[3], 
                                              np.round(overlap_len/float(int(a['end'])-int(a['start']))*100, 2), 
                                              np.round(overlap_len/float(int(r[2])-int(r[1]))*100, 2)
                                              ) 

                reg_features.append(ret_val)
    
        if len(reg_features)>0:
            return [";".join(reg_features)]
        else:
            return ["NA(0)(0)"]
    except ValueError:
        return ["NA(0)(0)"]

In [207]:
# Function to count the number of genes found for each region 
# by counting the number of semicolons in the gene_name or regulatory_feature column
def count_features(gene):
    if gene == 'NA(0)(0)':
        return 0
    else:
        return gene.count(";") + 1

In [208]:
# Function to return if region is intergenic or intragenic
def genic(gene):
    if gene == 'NA(0)(0)':
        return "intergenic"
    else:
        return "intragenic"

In [209]:
# Function to return if region has regulatory features
def regulatory(regulatory_feature):
    if regulatory_feature == 'NA(0)(0)':
        return "no"
    else:
        return "yes"

In [210]:
# write a function to split the deliminates gene columns into lists
def split_gene(gene_name, gene_type, gene_level, gene_id, strand):
    gene_list = gene_name.split(";")
    gt_list = str(gene_type).split(";") 
    gl_list = str(gene_level).split(";")
    gid_list = str(gene_id).split(";")
    strand_list = str(strand).split(";")
    return gene_list, gt_list, gl_list, gid_list, strand_list

In [211]:
def split_regulatory(regulatory_feature):
    regulatory_list = regulatory_feature.split(";")
    return regulatory_list

## Gene annotation

### Annotate DMRs with gene information from gencode V44

In [212]:
# annotate dmrs using gencode_annotate_loci function and gencode_v44 tabix annotation file
# returns genes, feature, gene_type, gene_level, exon_id, exon_number, transcript_name, transcript_type that overlap with DMRs
# returns amount of overlap as a percentage under gene column as gene-name(overlap %)(overlap %)
df[['gene_name', 'gene_type', 'gene_level', 'gene_id', 'strand']] = df.apply(lambda x: gencode_annotate_loci(x[['chr', 'start', 'end']], gencode_v44_genes), axis=1, result_type='expand')

# replace empty strings with NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

### Count number of genes in each region

In [213]:
# count the number of genes found for each region and add to g_nums list
g_nums = []
for i,r in df.iterrows():
    g_num = count_features(str(r['gene_name']))
    g_nums.append(g_num)

# add gene number column to dataframe
df['gene_num'] = g_nums
print("Gene counts added to dataframe.")

# return max number of genes found in one region and preview of regions with max number of genes
max_g = max(g_nums)
print("Max number of genes in one region is {}.".format(max_g))
print("Regions with max number of genes:")
display(df.loc[(df['gene_num']==max_g)])

Gene counts added to dataframe.
Max number of genes in one region is 22.
Regions with max number of genes:


Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,delta_quants,delta_maxmin,ttest,direction,gene_name,gene_type,gene_level,gene_id,strand,gene_num
15177,chr5,141508878,141509045,8904372,8904378,colon_cancer,chr5:141508878-141509045,6,167,0.661,...,0.344,0.324,0.112,M,PCDHGA1(100.0)(0.09);PCDHGA2(100.0)(0.1);PCDHG...,protein_coding;protein_coding;protein_coding;p...,2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2,ENSG00000204956.6;ENSG00000081853.15;ENSG00000...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+,22
15178,chr5,141512826,141512943,8904452,8904458,colon_cancer,chr5:141512826-141512943,6,117,0.592,...,0.374,0.309,0.999,M,PCDHGA1(100.0)(0.06);PCDHGA2(100.0)(0.07);PCDH...,protein_coding;protein_coding;protein_coding;p...,2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2,ENSG00000204956.6;ENSG00000081853.15;ENSG00000...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+,22


### Creating a variable for intergenic or intragenic classification

In [214]:
# apply genic function to df
df['genic'] = df['gene_name'].apply(genic)

## Regulatory feature annotation

### Annotate DMRs with regulatory information from ENSEMBL regulatory build

In [215]:
# annotate dmrs using regulatory_annotate_loci function and regulatory build tabix annotation file
# returns regulatory feature types that overlap with DMRs
# returns amount of overlap as a percentage under regulatory_feature column as feature-name(overlap %)(overlap %)
df[['regulatory_feature']] = df.apply(lambda x: regulatory_annotate_loci(x[['chr', 'start', 'end']], regulatory_build), axis=1, result_type='expand')

# replace empty strings with NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

### Count number of regulatory features in each region

In [216]:
# count the number of genes found for each region and add to g_nums list
reg_nums = []
for i,r in df.iterrows():
    reg_num = count_features(str(r['regulatory_feature']))
    reg_nums.append(reg_num)

# add gene number column to dataframe
df['reg_num'] = reg_nums
print("Regulatory feature counts added to dataframe.")

# return max number of genes found in one region and preview of regions with max number of genes
max_g = max(reg_nums)
print("Max number of regulatory features in one region is {}.".format(max_g))
print("Regions with max number of regulatory features:")
display(df.loc[(df['reg_num']==max_g)])

Regulatory feature counts added to dataframe.
Max number of regulatory features in one region is 5.
Regions with max number of regulatory features:


Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,direction,gene_name,gene_type,gene_level,gene_id,strand,gene_num,genic,regulatory_feature,reg_num
39995,chr19,50115698,50117527,25827266,25827321,colon_cancer,chr19:50115698-50117527,55,1829,0.531,...,U,ENSG00000287001(100.0)(3.49),lncRNA,2,ENSG00000287001.3,-,1,intragenic,TF binding(24.99)(88.05);CTCF Binding Site(32....,5


### Creating a variable indicating if there is a regulatory feature in the region

In [217]:
# apply regulatory function to df
df['regulatory'] = df['regulatory_feature'].apply(regulatory)

## Observing DMR annotations and comparing to annotation files

In [218]:
df.head()
# annot_df_gene.head()
# annot_df_regulatory.head()


Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,gene_name,gene_type,gene_level,gene_id,strand,gene_num,genic,regulatory_feature,reg_num,regulatory
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,ENSG00000238009(100.0)(0.94),lncRNA,2,ENSG00000238009.6,-,1,intragenic,NA(0)(0),0,no
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,ENSG00000238009(100.0)(1.05),lncRNA,2,ENSG00000238009.6,-,1,intragenic,NA(0)(0),0,no
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,ENSG00000238009(100.0)(0.46);CICP27(100.0)(5.41),lncRNA;processed_pseudogene,2;1,ENSG00000238009.6;ENSG00000233750.3,-;+,2,intragenic,NA(0)(0),0,no
3,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,ENSG00000228463(100.0)(1.78),transcribed_processed_pseudogene,1,ENSG00000228463.10,-,1,intragenic,Enhancer(1.38)(12.56),1,yes
4,chr1,527011,528180,4884,4906,colon_cancer,chr1:527011-528180,22,1169,0.435,...,NA(0)(0),,,,,0,intergenic,NA(0)(0),0,no


In [219]:
# checking proper annotation (uncomment below lines to check different things)

# # checking by chromosome
# display(df.loc[(df['chr']=='chr4')])
# display(annot_df_gene.loc[(annot_df_gene['chr']=='chr4')])
# display(annot_df_regulatory.loc[(annot_df_gene['chr']=='chr4')])

# # checking for specific genes
# display(df.loc[(annot_df['gene_name']=='SEPTIN9') & (df['chr']=='chr17')])
# display(annot_df_gene.loc[(annot_df_gene['gene_name']=='SEPTIN9') & (annot_df_gene['chr']=='chr17')])
# display(annot_df_regulatory.loc[annot_df_regulatory['chr']=='chr17'])

# # checking by chromosome and start/end
# df_check = display(df.loc[(dmr['chr']=='chr17') & (df['start'].between(77280569,77500596))])
# annot_check = display(annot_df_gene.loc[(annot_df_gene['chr']=='chr17') & (annot_df_gene['start'].between(77280569,77500596))])
# annot_check_reg = display(annot_df_regulatory.loc[(annot_df_regulatory['chr']=='chr17') & (annot_df_regulatory['start'].between(77280569,77500596))])


## Clean annotations 

### Split Gene names
Splits gene names out for regions with more than one gene and creates new rows for df called `df_perGene`.

In [220]:
# split values in the columns gene, gene_type, gene_level by delimiter ; and add to a list of lists called new_rows
new_rows = []
for i,r in df.iterrows():
    g_list, gt_list, gl_list, gid_list, strand_list = split_gene(r['gene_name'], r['gene_type'], r['gene_level'], r['gene_id'], r['strand'])

    for g in range(len(g_list)):
        new_rows.append(np.append(r[['chr', 'start', 'end', 'startCpG', 
                                     'endCpG', 'target', 'region', 'lenCpG', 
                                     'bp', 'tg_mean', 'bg_mean', 'delta_means', 
                                     'delta_quants', 'delta_maxmin', 'ttest', 
                                     'direction', 'gene_name', 'gene_type',
                                     'gene_level', 'gene_id', 'strand', 'genic', 'gene_num']
                                     ].values, [g_list[g], gt_list[g], gl_list[g], 
                                                gid_list[g], strand_list[g]
                                                ]))

In [221]:
# # Check df against new rows to confirm proper splitting
# df.head()
# print(len(new_rows))
# print(new_rows[0:5])

In [222]:
# Add new rows to add to the dataframe
df_perGene = pd.DataFrame(new_rows, columns=['chr', 'start', 'end', 'startCpG',
                                            'endCpG', 'target', 'region', 'lenCpG',
                                            'bp', 'tg_mean', 'bg_mean', 'delta_means',
                                            'delta_quants', 'delta_maxmin', 'ttest', 
                                            'direction', 'gene_name', 'gene_type',
                                            'gene_level', 'gene_id', 'strand', 
                                            'genic', 'gene_num', 'gene_name_split', 
                                            'gene_type_split', 'gene_level_split',
                                            'gene_id_split', 'strand_split'])

df_perGene.head()


Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,gene_level,gene_id,strand,genic,gene_num,gene_name_split,gene_type_split,gene_level_split,gene_id_split,strand_split
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,2,ENSG00000238009.6,-,intragenic,1,ENSG00000238009(100.0)(0.94),lncRNA,2,ENSG00000238009.6,-
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,2,ENSG00000238009.6,-,intragenic,1,ENSG00000238009(100.0)(1.05),lncRNA,2,ENSG00000238009.6,-
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,2;1,ENSG00000238009.6;ENSG00000233750.3,-;+,intragenic,2,ENSG00000238009(100.0)(0.46),lncRNA,2,ENSG00000238009.6,-
3,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,2;1,ENSG00000238009.6;ENSG00000233750.3,-;+,intragenic,2,CICP27(100.0)(5.41),processed_pseudogene,1,ENSG00000233750.3,+
4,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,1,ENSG00000228463.10,-,intragenic,1,ENSG00000228463(100.0)(1.78),transcribed_processed_pseudogene,1,ENSG00000228463.10,-


### Split coverage percentages from gene names

In [223]:
# Split gene_name column into gene_name, region_coverage, gene_coverage
df_perGene['gene_name_cleaned'] = df_perGene['gene_name_split'].apply(lambda x: x.split("(")[0].replace(")", ""))
df_perGene['region_coverage'] = df_perGene['gene_name_split'].apply(lambda x: x.split("(")[1].replace(")", ""))
df_perGene['gene_coverage'] = df_perGene['gene_name_split'].apply(lambda x: x.split("(")[2].replace(")", ""))

df_perGene.head(10)

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,genic,gene_num,gene_name_split,gene_type_split,gene_level_split,gene_id_split,strand_split,gene_name_cleaned,region_coverage,gene_coverage
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,intragenic,1,ENSG00000238009(100.0)(0.94),lncRNA,2.0,ENSG00000238009.6,-,ENSG00000238009,100.0,0.94
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,intragenic,1,ENSG00000238009(100.0)(1.05),lncRNA,2.0,ENSG00000238009.6,-,ENSG00000238009,100.0,1.05
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,intragenic,2,ENSG00000238009(100.0)(0.46),lncRNA,2.0,ENSG00000238009.6,-,ENSG00000238009,100.0,0.46
3,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,intragenic,2,CICP27(100.0)(5.41),processed_pseudogene,1.0,ENSG00000233750.3,+,CICP27,100.0,5.41
4,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,intragenic,1,ENSG00000228463(100.0)(1.78),transcribed_processed_pseudogene,1.0,ENSG00000228463.10,-,ENSG00000228463,100.0,1.78
5,chr1,527011,528180,4884,4906,colon_cancer,chr1:527011-528180,22,1169,0.435,...,intergenic,0,NA(0)(0),,,,,,0.0,0.0
6,chr1,588665,588858,5055,5062,colon_cancer,chr1:588665-588858,7,193,0.122,...,intragenic,2,ENSG00000230021(100.0)(0.08),transcribed_processed_pseudogene,2.0,ENSG00000230021.10,-,ENSG00000230021,100.0,0.08
7,chr1,588665,588858,5055,5062,colon_cancer,chr1:588665-588858,7,193,0.122,...,intragenic,2,ENSG00000235146(100.0)(2.7),lncRNA,2.0,ENSG00000235146.2,+,ENSG00000235146,100.0,2.7
8,chr1,598963,599165,5342,5347,colon_cancer,chr1:598963-599165,5,202,0.293,...,intragenic,1,ENSG00000230021(100.0)(0.08),transcribed_processed_pseudogene,2.0,ENSG00000230021.10,-,ENSG00000230021,100.0,0.08
9,chr1,603632,604015,5406,5412,colon_cancer,chr1:603632-604015,6,383,0.3,...,intragenic,1,ENSG00000230021(100.0)(0.16),transcribed_processed_pseudogene,2.0,ENSG00000230021.10,-,ENSG00000230021,100.0,0.16


### Clean and reorganize df_perGene

In [224]:
# once I have confirmed proper annotation, I can clean up the df
# columns in df_per_gene: ['chr', 'start', 'end', 'startCpG', 'endCpG', 'target', 'region',
                    #    'lenCpG', 'bp', 'tg_mean', 'bg_mean', 'delta_means', 'delta_quants',
                    #    'delta_maxmin', 'ttest', 'direction', 'gene_name', 'gene_type',
                    #    'gene_level', 'gene_id', 'strand', 'gene_num', 'gene_name_split',
                    #    'gene_type_split', 'gene_level_split', 'gene_id_split', 'strand_split',
                    #    'gene_name_cleaned', 'region_coverage', 'gene_coverage']

## drop the genes column
df_perGene = df_perGene.drop(['gene_name', 'gene_type','gene_level', 
                              'gene_id', 'strand', 'gene_name_split'], axis=1)

# rename columns
df_perGene.rename(columns = {'gene_name_cleaned':'gene_name', 
                              'gene_id_split':'gene_ID',
                              'strand_split':'strand',
                              'gene_type_split':'gene_type',
                              'gene_level_split':'gene_level',
                              'gene_num':'num_genes',
                              'gene_coverage':'perc_gene_cov_region',
                              'region_coverage':'perc_region_cov_gene'
                              }, inplace = True)

# reorder columns
df_perGene = df_perGene.loc[:,['chr', 'start', 'end', 'startCpG', 
                                 'endCpG', 'target', 'region', 'lenCpG', 
                                 'bp', 'tg_mean', 'bg_mean', 'delta_means', 
                                 'delta_quants', 'delta_maxmin', 'ttest', 
                                 'direction', 'num_genes', 'gene_name', 
                                 'perc_gene_cov_region', 'perc_region_cov_gene',
                                 'gene_type', 'gene_level', 'gene_ID', 'strand', 
                                 'genic']]

df_perGene.head()

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,direction,num_genes,gene_name,perc_gene_cov_region,perc_region_cov_gene,gene_type,gene_level,gene_ID,strand,genic
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,U,1,ENSG00000238009,0.94,100.0,lncRNA,2,ENSG00000238009.6,-,intragenic
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,U,1,ENSG00000238009,1.05,100.0,lncRNA,2,ENSG00000238009.6,-,intragenic
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,U,2,ENSG00000238009,0.46,100.0,lncRNA,2,ENSG00000238009.6,-,intragenic
3,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,U,2,CICP27,5.41,100.0,processed_pseudogene,1,ENSG00000233750.3,+,intragenic
4,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,U,1,ENSG00000228463,1.78,100.0,transcribed_processed_pseudogene,1,ENSG00000228463.10,-,intragenic


### Split regulatory features

Splits regulatory features out for regions with more than one regulatory feature and creates new rows for df called `df_perReg`.

In [225]:
# split values in the columns gene, gene_type, gene_level by delimiter ; and add to a list of lists called new_rows
new_rows = []
for i,r in df.iterrows():
    reg_list = split_regulatory(r['regulatory_feature'])

    for reg in range(len(reg_list)):
        new_rows.append(np.append(r[['chr', 'start', 'end', 'startCpG', 
                                     'endCpG', 'target', 'region', 'lenCpG', 
                                     'bp', 'tg_mean', 'bg_mean', 'delta_means', 
                                     'delta_quants', 'delta_maxmin', 'ttest', 
                                     'direction', 'gene_name', 'gene_type',
                                     'gene_level', 'gene_id', 'strand', 'genic', 
                                     'gene_num','regulatory_feature', 'reg_num', 
                                     'regulatory']
                                     ].values, [reg_list[reg]]))

In [226]:
# # compare the df against new rows to confirm proper splitting
# df.head()
# print(len(new_rows))
# print(new_rows[0:10])

In [227]:
# Add new rows to add to the dataframe (df_perReg)
df_perReg = pd.DataFrame(new_rows, columns=['chr', 'start', 'end', 'startCpG', 
                                            'endCpG', 'target', 'region', 'lenCpG', 
                                            'bp', 'tg_mean', 'bg_mean', 'delta_means', 
                                            'delta_quants', 'delta_maxmin', 'ttest', 
                                            'direction', 'gene_name', 'gene_type',
                                            'gene_level', 'gene_id', 'strand', 'genic', 
                                            'gene_num','regulatory_feature', 'reg_num', 
                                            'regulatory', 'reg_split'])

### Split coverage percentages from regulatory feature type

In [228]:
# Split regulatory_feature column into reg feature type, region_coverage, feature_coverage
df_perReg['reg_name_cleaned'] = df_perReg['reg_split'].apply(lambda x: x.split("(")[0].replace(")", ""))
df_perReg['perc_reg_cov_region'] = df_perReg['reg_split'].apply(lambda x: x.split("(")[1].replace(")", ""))
df_perReg['perc_region_cov_reg'] = df_perReg['reg_split'].apply(lambda x: x.split("(")[2].replace(")", ""))

df_perReg.head(20)

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,strand,genic,gene_num,regulatory_feature,reg_num,regulatory,reg_split,reg_name_cleaned,perc_reg_cov_region,perc_region_cov_reg
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,-,intragenic,1,NA(0)(0),0,no,NA(0)(0),,0.0,0.0
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,-,intragenic,1,NA(0)(0),0,no,NA(0)(0),,0.0,0.0
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,-;+,intragenic,2,NA(0)(0),0,no,NA(0)(0),,0.0,0.0
3,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,-,intragenic,1,Enhancer(1.38)(12.56),1,yes,Enhancer(1.38)(12.56),Enhancer,1.38,12.56
4,chr1,527011,528180,4884,4906,colon_cancer,chr1:527011-528180,22,1169,0.435,...,,intergenic,0,NA(0)(0),0,no,NA(0)(0),,0.0,0.0
5,chr1,588665,588858,5055,5062,colon_cancer,chr1:588665-588858,7,193,0.122,...,-;+,intragenic,2,NA(0)(0),0,no,NA(0)(0),,0.0,0.0
6,chr1,598963,599165,5342,5347,colon_cancer,chr1:598963-599165,5,202,0.293,...,-,intragenic,1,NA(0)(0),0,no,NA(0)(0),,0.0,0.0
7,chr1,603632,604015,5406,5412,colon_cancer,chr1:603632-604015,6,383,0.3,...,-,intragenic,1,NA(0)(0),0,no,NA(0)(0),,0.0,0.0
8,chr1,605175,605246,5438,5446,colon_cancer,chr1:605175-605246,8,71,0.454,...,-,intragenic,1,CTCF Binding Site(35.21)(12.56),1,yes,CTCF Binding Site(35.21)(12.56),CTCF Binding Site,35.21,12.56
9,chr1,605698,607036,5475,5494,colon_cancer,chr1:605698-607036,19,1338,0.29,...,-,intragenic,1,NA(0)(0),0,no,NA(0)(0),,0.0,0.0


In [229]:
df_perReg.columns

Index(['chr', 'start', 'end', 'startCpG', 'endCpG', 'target', 'region',
       'lenCpG', 'bp', 'tg_mean', 'bg_mean', 'delta_means', 'delta_quants',
       'delta_maxmin', 'ttest', 'direction', 'gene_name', 'gene_type',
       'gene_level', 'gene_id', 'strand', 'genic', 'gene_num',
       'regulatory_feature', 'reg_num', 'regulatory', 'reg_split',
       'reg_name_cleaned', 'perc_reg_cov_region', 'perc_region_cov_reg'],
      dtype='object')

### Clean and reorganize df_perReg

In [233]:
# once I have confirmed proper annotation, I can clean up the df
# columns in df_perReg:  ['chr', 'start', 'end', 'startCpG', 'endCpG', 'target', 'region',
                    #    'lenCpG', 'bp', 'tg_mean', 'bg_mean', 'delta_means', 'delta_quants',
                    #    'delta_maxmin', 'ttest', 'direction', 'gene_name', 'gene_type',
                    #    'gene_level', 'gene_id', 'strand', 'genic', 'gene_num',
                    #    'regulatory_feature', 'reg_num', 'regulatory', 'reg_split',
                    #    'reg_name_cleaned', 'perc_reg_cov_region', 'perc_region_cov_reg']

## drop the genes column
df_perReg = df_perReg.drop(['regulatory_feature','reg_split'], axis=1)

# rename columns
df_perReg.rename(columns = {'reg_name_cleaned':'regulatory_feature',
                             'reg_num':'num_regs',
                             'gene_num':'num_genes'
                             }, inplace = True)

# reorder columns
df_perReg = df_perReg.loc[:,['chr', 'start', 'end', 'startCpG', 
                             'endCpG', 'target', 'region', 'lenCpG', 
                             'bp', 'tg_mean', 'bg_mean', 'delta_means',
                             'delta_quants', 'delta_maxmin', 'ttest',
                             'direction', 'num_regs', 'regulatory_feature', 
                             'perc_reg_cov_region', 'perc_region_cov_reg', 
                             'num_genes', 'gene_name', 'gene_type', 
                             'gene_level', 'gene_id', 'strand', 'genic', 
                             'regulatory']]

df_perGene.head()

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,direction,num_genes,gene_name,perc_gene_cov_region,perc_region_cov_gene,gene_type,gene_level,gene_ID,strand,genic
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,U,1,ENSG00000238009,0.94,100.0,lncRNA,2,ENSG00000238009.6,-,intragenic
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,U,1,ENSG00000238009,1.05,100.0,lncRNA,2,ENSG00000238009.6,-,intragenic
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,U,2,ENSG00000238009,0.46,100.0,lncRNA,2,ENSG00000238009.6,-,intragenic
3,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,U,2,CICP27,5.41,100.0,processed_pseudogene,1,ENSG00000233750.3,+,intragenic
4,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,U,1,ENSG00000228463,1.78,100.0,transcribed_processed_pseudogene,1,ENSG00000228463.10,-,intragenic


### Save files

df has columns: ['chr', 'start', 'end', 'startCpG', 'endCpG', 'target', 'region',
                'lenCpG', 'bp', 'tg_mean', 'bg_mean', 'delta_means', 'delta_quants',
                'delta_maxmin', 'ttest', 'direction', 'gene_name', 'gene_type',
                'gene_level', 'gene_id', 'strand', 'gene_num', 'genic']

In [235]:
df_perGene.to_csv(output_file_per_gene, index=False, header=True, sep="\t")
df_perReg.to_csv(output_file_per_reg, index=False, header=True, sep="\t")
df.to_csv(output_file, index=False, header=True, sep="\t")

print(output_file + " with all regions (partial annotation/no dups) has been created")
print(output_file_per_gene + " with all regions (dup regions to split genes) has been created")
print(output_file_per_reg + " with all regions (dup regions to split regulatory features) has been created")

/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated.tsv with all regions (partial annotation/no dups) has been created
/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated_per_gene.tsv with all regions (dup regions to split genes) has been created


## Misc. code chunks from dev

In [None]:
# # display dmr dataframe where gene_num is greater than 1
# df.loc[(df['gene_num'] > 1)]

# # return rows where feature is exon in df_perGene
# df_perGene[df_perGene.feature=='exon'].tail(25)

## skip if you want to keep regions with no genes
## Transforms the data frame and removes anything that returned NA
## Remove all the intervals that do not overlap with genes
# df = df[df['genes'] != "NA(0)"].reset_index(drop=True)

# # display items in list of lists
# print(new_rows[0:5])

# # listing column names
# column_names = list(df.columns.values)
# print(column_names)

# # exporting top 100 regions with lowest p-values for annotation in browser (USC Genome Browser has gencode V44 annotation)
# df.sort_values(by=['ttest'], inplace=True)
# dmr_exp = df.loc[(df['ttest']<0.050)].head(100)
# dmr_exp = dmr_exp.drop(['startCpG', 'endCpG', 'target', 'bp',
#                    'lenCpG', 'region', 'tg_mean', 'bg_mean', 
#                    'delta_means', 'delta_quants', 'delta_maxmin',
#                    'ttest', 'direction'], axis=1).head(100)
# dmr_exp.to_csv('top100_colon_markers.tsv', sep="\t", header=False, index=False) 
# dmr_exp.head()

# # create dataframe with only significant DMRs
# dmr = df_perGene.loc[(df_perGene['ttest'] < 0.05)]

## Questions for keller

What do the various chromosome labels mean? 
Are these contigs that are not included in chromosome contructed references?
    Does `M` refer to mitrochondrial?

In [8]:
# figure out how to interpret the chromosome labels
# https://genome.ucsc.edu/cgi-bin/hgTracks?chromInfoPage=
annot_df.chr.unique()

array(['GL000009.2', 'GL000194.1', 'GL000195.1', 'GL000205.2',
       'GL000209.2', 'GL000213.1', 'GL000216.2', 'GL000218.1',
       'GL000219.1', 'GL000220.1', 'GL000225.1', 'GL000250.2',
       'GL000251.2', 'GL000252.2', 'GL000253.2', 'GL000254.2',
       'GL000255.2', 'GL000256.2', 'GL000257.2', 'GL000258.2',
       'GL339449.2', 'GL383518.1', 'GL383519.1', 'GL383520.2',
       'GL383521.1', 'GL383522.1', 'GL383526.1', 'GL383527.1',
       'GL383528.1', 'GL383531.1', 'GL383532.1', 'GL383533.1',
       'GL383534.2', 'GL383539.1', 'GL383540.1', 'GL383541.1',
       'GL383542.1', 'GL383545.1', 'GL383546.1', 'GL383550.2',
       'GL383551.1', 'GL383552.1', 'GL383553.2', 'GL383554.1',
       'GL383555.2', 'GL383556.1', 'GL383557.1', 'GL383563.3',
       'GL383564.2', 'GL383565.1', 'GL383566.1', 'GL383567.1',
       'GL383569.1', 'GL383570.1', 'GL383571.1', 'GL383572.1',
       'GL383573.1', 'GL383574.1', 'GL383575.2', 'GL383576.1',
       'GL383577.2', 'GL383579.2', 'GL383580.2', 'GL383