## Setup 

In [25]:
# Conda environment to use: annot

# References for sources used to write this script:
#   - Gene annotation methods: https://medium.com/intothegenomics/annotate-genes-and-genomic-coordinates-using-python-9259efa6ffc2
#   - Gencode annotation file format: https://www.gencodegenes.org/pages/data_format.html

# figure out how to interpret the chromosome labels
# https://genome.ucsc.edu/cgi-bin/hgTracks?chromInfoPage=

# Import packages
import pandas as pd
import numpy as np
import pysam

# Input locations (GENCODE v44 annotation files, DMR table) and the three output TSVs
annotation_file = "/Users/meghansleeper/Desktop/farm-files/dmr-workflow/supplementary/annotation_beds/gencode.v44.annotation.all.sorted.formatted.bed.gz"
annotation_file_genes = "/Users/meghansleeper/Desktop/farm-files/dmr-workflow/supplementary/annotation_beds/gencode.v44.annotation.genes.sorted.formatted.bed.gz"
DMR_file = "/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers.tsv"
output_file_per_gene = "/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated_per_gene.tsv"
output_file = "/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated.tsv"
dmr_output_file = "/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated_dmrs.tsv"

# Open both annotation files with tabix so they can be queried by region
gencode_v44, gencode_v44_genes = (
    pysam.TabixFile(path) for path in (annotation_file, annotation_file_genes)
)

# Column layout of the annotation files: the genes-only file has the first eleven columns,
# the all-features file has those plus four exon/transcript columns.
gene_level_columns = ['chr', 'start', 'end',
                      'attribute', 'feature', 'strand',
                      'source', 'gene_name', 'gene_id',
                      'gene_type', 'gene_level']
transcript_level_columns = ['exon_id', 'exon_number',
                            'transcript_name', 'transcript_type']
bed_read_options = dict(sep='\t', header=None)

# Load the same two annotation files into dataframes for inspection
annot_df = pd.read_csv(annotation_file,
                       names=gene_level_columns + transcript_level_columns,
                       **bed_read_options)
annot_df_gene = pd.read_csv(annotation_file_genes,
                            names=gene_level_columns,
                            **bed_read_options)

### Read the sample (DMR) file into a pandas dataframe; its first line is the header
df = pd.read_table(DMR_file, header=0, low_memory=False)



## Annotation file inspection

In [26]:
# Drop the unused 'attribute' column from both annotation dataframes.
# errors='ignore' keeps this cell re-runnable: on a second run the column is already gone,
# and without it drop() would raise a KeyError.
annot_df_gene = annot_df_gene.drop(columns=['attribute'], errors='ignore')
annot_df = annot_df.drop(columns=['attribute'], errors='ignore')

In [27]:
# Spot-check on chr1: the genes-only annotation should list the same genes
# that appear in the all-features annotation
chr1_all_features = annot_df[annot_df['chr'] == 'chr1']
chr1_genes_only = annot_df_gene[annot_df_gene['chr'] == 'chr1']
display(chr1_all_features)
display(chr1_genes_only)

Unnamed: 0,chr,start,end,feature,strand,source,gene_name,gene_id,gene_type,gene_level,exon_id,exon_number,transcript_name,transcript_type
196077,chr1,11869,12227,exon,+,HAVANA,DDX11L2,ENSG00000290825.1,lncRNA,2,ENSE00002234944.1,1.0,DDX11L2-202,lncRNA
196078,chr1,11869,14409,gene,+,HAVANA,DDX11L2,ENSG00000290825.1,lncRNA,2,,,,
196079,chr1,11869,14409,transcript,+,HAVANA,DDX11L2,ENSG00000290825.1,lncRNA,2,,,DDX11L2-202,lncRNA
196080,chr1,12010,12057,exon,+,HAVANA,DDX11L1,ENSG00000223972.6,transcribed_unprocessed_pseudogene,2,ENSE00001948541.1,1.0,DDX11L1-201,transcribed_unprocessed_pseudogene
196081,chr1,12010,13670,gene,+,HAVANA,DDX11L1,ENSG00000223972.6,transcribed_unprocessed_pseudogene,2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388929,chr1,248918364,248919146,three_prime_UTR,+,HAVANA,PGBD2,ENSG00000185220.12,protein_coding,2,ENSE00001312561.6,3.0,PGBD2-201,protein_coding
388930,chr1,248918364,248919946,three_prime_UTR,+,HAVANA,PGBD2,ENSG00000185220.12,protein_coding,2,ENSE00001400348.4,3.0,PGBD2-202,protein_coding
388931,chr1,248936581,248937043,gene,+,HAVANA,RPL23AP25,ENSG00000233084.2,processed_pseudogene,1,,,,
388932,chr1,248936581,248937043,transcript,+,HAVANA,RPL23AP25,ENSG00000233084.2,processed_pseudogene,1,,,RPL23AP25-201,processed_pseudogene


Unnamed: 0,chr,start,end,feature,strand,source,gene_name,gene_id,gene_type,gene_level
7416,chr1,11869,14409,gene,+,HAVANA,DDX11L2,ENSG00000290825.1,lncRNA,2
7417,chr1,12010,13670,gene,+,HAVANA,DDX11L1,ENSG00000223972.6,transcribed_unprocessed_pseudogene,2
7418,chr1,14404,29570,gene,-,HAVANA,WASH7P,ENSG00000227232.5,unprocessed_pseudogene,2
7419,chr1,17369,17436,gene,-,ENSEMBL,MIR6859-1,ENSG00000278267.1,miRNA,3
7420,chr1,29554,31109,gene,+,HAVANA,MIR1302-2HG,ENSG00000243485.5,lncRNA,2
...,...,...,...,...,...,...,...,...,...,...
13070,chr1,248850006,248859144,gene,-,HAVANA,ZNF692,ENSG00000171163.16,protein_coding,1
13071,chr1,248859164,248864796,gene,+,HAVANA,ZNF692-DT,ENSG00000227237.1,lncRNA,1
13072,chr1,248906196,248919946,gene,+,HAVANA,PGBD2,ENSG00000185220.12,protein_coding,2
13073,chr1,248912690,248912795,gene,-,ENSEMBL,RNU6-1205P,ENSG00000200495.1,snRNA,3


## DMR file inspection

In [28]:
# TODO: pass explicit dtypes when reading the DMR file to silence the dtype warning;
# they must be concrete types such as np.int16
# see: https://stackoverflow.com/questions/24251219/pandas-read-csv-low-memory-and-dtype-options

### Clean DMR file
# Strip the "CpGs" suffix from lenCpG and the "bp" suffix from bp.
# The .str accessor leaves missing values as NaN, where a per-element lambda calling
# .split() would raise on the first non-string value.
df['lenCpG'] = df['lenCpG'].str.split("CpGs").str[0]
df['bp'] = df['bp'].str.split("bp").str[0]
# NOTE(review): both columns still hold strings after this step; cast them if numeric
# comparisons are needed downstream.

# rename chromosome column (reassign instead of mutating in place)
df = df.rename(columns={'#chr': 'chr'})

# uncomment below line if you want to drop unwanted columns
#df = df.drop(['startCpG', 'endCpG', 'len_bp', 'region', 'tg_mean', 'bg_mean', 'delta_quants', 'delta_maxmin'], axis=1)

## Defining functions

In [29]:
# Determine if there is overlap between genes and DMRs
# query and reference
def overlap(q_st, q_end, ref_st, ref_end):
    """Return the signed length shared by a query and a reference interval.

    The result is (smaller of the two ends) - (larger of the two starts), so it is
    positive when the intervals overlap, zero when they only touch, and negative
    when they are disjoint.
    """
    shared_start = max(q_st, ref_st)
    shared_end = min(q_end, ref_end)
    return shared_end - shared_start


# # loci is a row in a dataframe `a` with columns chr, start, end ['#chr', 'start', 'end']
# # `tb` tabix is the indexed annotation file as a pysam.TabixFile object
def gencode_annotate_loci(a, tb):
    """Annotate one locus with every annotation record that overlaps it.

    Parameters
    ----------
    a : mapping or pandas.Series
        Must provide 'chr', 'start' and 'end'; start and end are converted with int().
    tb : pysam.TabixFile
        Queried with fetch(chrom, start, end). Each record is a tab-separated string
        whose fields are read by position: 1=start, 2=end, 5=strand, 7=gene_name,
        8=gene_id, 9=gene_type, 10=gene_level.

    Returns
    -------
    list
        [gene_name, gene_type, gene_level, gene_id, strand]. With at least one hit,
        each element is a ';'-joined string with one entry per record, and each
        gene_name entry is formatted as name(pct_of_query)(pct_of_gene).
        With no hit, or when a ValueError is raised while querying or parsing,
        the placeholder ["NA(0)(0)", "NA", np.nan, "NA", "NA"] is returned.
    """
    genes = []
    gene_type = []
    gene_level = []
    gene_id = []
    strand = []

    try:
        # Convert the query coordinates once instead of on every record
        q_start = int(a['start'])
        q_end = int(a['end'])
        q_len = q_end - q_start

        # columns in tabix file: ['chr', 'start', 'end', 'attribute', 'feature',
        #                         'strand', 'source', 'gene_name', 'gene_id',
        #                         'gene_type', 'gene_level']
        for region in tb.fetch(a['chr'], q_start, q_end):
            if not region:
                continue
            r = region.split('\t')
            ref_start = int(r[1])
            ref_end = int(r[2])
            ref_len = ref_end - ref_start

            ### Length of the overlap between the query interval and the gencode gene
            overlap_len = overlap(q_start, q_end, ref_start, ref_end)

            ### first value: percentage of the query interval that overlaps the gene
            ### second value: percentage of the gene that overlaps the query interval
            # A zero-length interval would otherwise raise ZeroDivisionError, which the
            # except clause below does not catch; report 0.0 coverage instead.
            pct_query = np.round(overlap_len / float(q_len) * 100, 2) if q_len else 0.0
            pct_gene = np.round(overlap_len / float(ref_len) * 100, 2) if ref_len else 0.0

            genes.append('{}({})({})'.format(r[7], pct_query, pct_gene))
            gene_type.append(r[9])
            gene_level.append(r[10])
            gene_id.append(r[8])
            strand.append(r[5])

        if len(genes) > 0:
            return [";".join(genes), ";".join(gene_type), ";".join(gene_level),
                    ";".join(gene_id), ";".join(strand)]
        return ["NA(0)(0)", "NA", np.nan, "NA", "NA"]
    except ValueError:
        # NOTE(review): presumably meant for contigs/regions the tabix index cannot
        # resolve; it also hides non-integer coordinates in `a` or in a record - confirm.
        return ["NA(0)(0)", "NA", np.nan, "NA", "NA"]

## Annotate DMRs

In [30]:
# Look up every DMR in the genes-only tabix index (gencode_v44_genes) with gencode_annotate_loci.
# Adds gene_name, gene_type, gene_level, gene_id and strand; multiple hits are ';'-separated,
# and each gene_name entry carries its overlap as gene-name(% of region)(% of gene).
annotation_columns = ['gene_name', 'gene_type', 'gene_level', 'gene_id', 'strand']


def _annotate_row(row):
    return gencode_annotate_loci(row[['chr', 'start', 'end']], gencode_v44_genes)


df[annotation_columns] = df.apply(_annotate_row, axis=1, result_type='expand')

# empty or whitespace-only cells become NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

### Observing DMR annotations and comparing to annotation file

In [31]:
# Only the last expression of a cell is rendered automatically, so the annotation
# preview needs an explicit display() to appear alongside the DMR preview.
display(annot_df_gene.head())
df.head()

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,delta_means,delta_quants,delta_maxmin,ttest,direction,gene_name,gene_type,gene_level,gene_id,strand
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,0.413,0.372,0.33,0.126,U,ENSG00000238009(100.0)(0.94),lncRNA,2,ENSG00000238009.6,-
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,0.407,0.36,0.314,0.146,U,ENSG00000238009(100.0)(1.05),lncRNA,2,ENSG00000238009.6,-
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,0.367,0.356,0.344,0.0761,U,ENSG00000238009(100.0)(0.46);CICP27(100.0)(5.41),lncRNA;processed_pseudogene,2;1,ENSG00000238009.6;ENSG00000233750.3,-;+
3,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,0.389,0.382,0.375,0.164,U,ENSG00000228463(100.0)(1.78),transcribed_processed_pseudogene,1,ENSG00000228463.10,-
4,chr1,527011,528180,4884,4906,colon_cancer,chr1:527011-528180,22,1169,0.435,...,0.44,0.407,0.375,0.0927,U,NA(0)(0),,,,


In [32]:
# checking proper annotation (uncomment below lines to check different things)

# # checking by chromosome
# display(df.loc[(df['chr']=='chr4')])
# display(annot_df_gene.loc[(annot_df_gene['chr']=='chr4')])

# # checking for specific genes
# display(df.loc[(df['gene_name'].str.contains('SEPTIN9', na=False)) & (df['chr']=='chr17')])
# display(annot_df.loc[(annot_df['gene_name']=='SEPTIN9') & (annot_df['chr']=='chr17')])

# # checking by chromosome and start/end
# display(df.loc[(df['chr']=='chr17') & (df['start'].between(77280569,77500596))])
# display(annot_df.loc[(annot_df['chr']=='chr17') & (annot_df['start'].between(77280569,77500596))])


### Creating a variable for intergenic or intragenic classification

In [33]:
# Function to return if region is intergenic or intragenic
def genic(gene):
    """Classify a gene_name annotation: the no-hit placeholder 'NA(0)(0)' is
    "intergenic", anything else is "intragenic"."""
    return "intergenic" if gene == 'NA(0)(0)' else "intragenic"
    
# label every region as intergenic / intragenic from its gene_name annotation
df['genic'] = df['gene_name'].map(genic)

### Count genes per region and create gene_num

In [34]:
# Function to count the number of genes found for each region 
# by counting the number of semicolons in the gene_name column
def count_genes(gene):
    """Return how many genes a ';'-separated gene_name annotation lists.

    The no-hit placeholder 'NA(0)(0)' counts as 0; otherwise the count is the
    number of semicolons plus one.
    """
    return 0 if gene == 'NA(0)(0)' else gene.count(";") + 1

In [35]:
# number of genes overlapping each region (gene_name is cast to str before counting)
g_nums = [count_genes(str(annotation)) for annotation in df['gene_name']]

# store the counts as a new column
df['gene_num'] = g_nums
print("Gene counts added to dataframe.")

# report the largest gene count and show the regions that reach it
max_g = max(g_nums)
print("Max number of genes in one region is {}.".format(max_g))
print("Regions with max number of genes:")
regions_with_most_genes = df[df['gene_num'] == max_g]
display(regions_with_most_genes)

Gene counts added to dataframe.
Max number of genes in one region is 22.
Regions with max number of genes:


Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,delta_maxmin,ttest,direction,gene_name,gene_type,gene_level,gene_id,strand,genic,gene_num
15177,chr5,141508878,141509045,8904372,8904378,colon_cancer,chr5:141508878-141509045,6,167,0.661,...,0.324,0.112,M,PCDHGA1(100.0)(0.09);PCDHGA2(100.0)(0.1);PCDHG...,protein_coding;protein_coding;protein_coding;p...,2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2,ENSG00000204956.6;ENSG00000081853.15;ENSG00000...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+,intragenic,22
15178,chr5,141512826,141512943,8904452,8904458,colon_cancer,chr5:141512826-141512943,6,117,0.592,...,0.309,0.999,M,PCDHGA1(100.0)(0.06);PCDHGA2(100.0)(0.07);PCDH...,protein_coding;protein_coding;protein_coding;p...,2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2;2,ENSG00000204956.6;ENSG00000081853.15;ENSG00000...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+,intragenic,22


### Split gene names out for regions with more than one gene

This section creates new rows.

In [36]:
# function to split the ';'-delimited gene columns into lists
def split_gene(gene_name, gene_type, gene_level, gene_id, strand):
    """Split the five ';'-joined annotation fields of one region into lists.

    gene_name is split as-is (it must be a string); the other four values are
    converted with str() first, so a NaN becomes the single-element list ['nan'].
    Returns the five lists as a tuple in the same order as the parameters.
    """
    names = gene_name.split(";")
    types, levels, ids, strands = (
        str(field).split(";") for field in (gene_type, gene_level, gene_id, strand)
    )
    return names, types, levels, ids, strands

In [37]:
# Expand each region into one record per overlapping gene: the region's own values
# followed by that gene's name, type, level, id and strand. Records are collected in new_rows.
# NOTE(review): the 'genic' column is not carried into the per-gene records - confirm intended.
carried_columns = ['chr', 'start', 'end', 'startCpG',
                   'endCpG', 'target', 'region', 'lenCpG',
                   'bp', 'tg_mean', 'bg_mean', 'delta_means',
                   'delta_quants', 'delta_maxmin', 'ttest',
                   'direction', 'gene_name', 'gene_type',
                   'gene_level', 'gene_id', 'strand', 'gene_num']

new_rows = []
for _, region_row in df.iterrows():
    names, types, levels, ids, strands = split_gene(region_row['gene_name'],
                                                    region_row['gene_type'],
                                                    region_row['gene_level'],
                                                    region_row['gene_id'],
                                                    region_row['strand'])
    carried_values = region_row[carried_columns].values

    for position, name in enumerate(names):
        per_gene_values = [name, types[position], levels[position],
                           ids[position], strands[position]]
        # np.append returns a new array, so carried_values is safely reused per gene
        new_rows.append(np.append(carried_values, per_gene_values))

In [38]:
# the region-level frame is not modified by the split above; preview its first five rows
df.head(5)

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,delta_maxmin,ttest,direction,gene_name,gene_type,gene_level,gene_id,strand,genic,gene_num
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,0.33,0.126,U,ENSG00000238009(100.0)(0.94),lncRNA,2,ENSG00000238009.6,-,intragenic,1
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,0.314,0.146,U,ENSG00000238009(100.0)(1.05),lncRNA,2,ENSG00000238009.6,-,intragenic,1
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,0.344,0.0761,U,ENSG00000238009(100.0)(0.46);CICP27(100.0)(5.41),lncRNA;processed_pseudogene,2;1,ENSG00000238009.6;ENSG00000233750.3,-;+,intragenic,2
3,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,0.375,0.164,U,ENSG00000228463(100.0)(1.78),transcribed_processed_pseudogene,1,ENSG00000228463.10,-,intragenic,1
4,chr1,527011,528180,4884,4906,colon_cancer,chr1:527011-528180,22,1169,0.435,...,0.375,0.0927,U,NA(0)(0),,,,,intergenic,0


In [39]:
# number of per-gene records, then the first five of them
print(len(new_rows))
first_records = new_rows[:5]
print(first_records)

50317
[array(['chr1', 92977, 93393, 1045, 1051, 'colon_cancer',
       'chr1:92977-93393', '6', '416', 0.311, 0.724, 0.413, 0.372, 0.33,
       0.126, 'U', 'ENSG00000238009(100.0)(0.94)', 'lncRNA', '2',
       'ENSG00000238009.6', '-', 1, 'ENSG00000238009(100.0)(0.94)',
       'lncRNA', '2', 'ENSG00000238009.6', '-'], dtype=object), array(['chr1', 99252, 99720, 1094, 1101, 'colon_cancer',
       'chr1:99252-99720', '7', '468', 0.26, 0.667, 0.407, 0.36, 0.314,
       0.146, 'U', 'ENSG00000238009(100.0)(1.05)', 'lncRNA', '2',
       'ENSG00000238009.6', '-', 1, 'ENSG00000238009(100.0)(1.05)',
       'lncRNA', '2', 'ENSG00000238009.6', '-'], dtype=object), array(['chr1', 132975, 133181, 1336, 1344, 'colon_cancer',
       'chr1:132975-133181', '8', '206', 0.633, 1.0, 0.367, 0.356, 0.344,
       0.0761, 'U', 'ENSG00000238009(100.0)(0.46);CICP27(100.0)(5.41)',
       'lncRNA;processed_pseudogene', '2;1',
       'ENSG00000238009.6;ENSG00000233750.3', '-;+', 2,
       'ENSG00000238009(100.0)(0

In [40]:
# Turn the per-gene records into a dataframe: the 22 region columns carried over from df,
# followed by the five per-gene fields produced by the split
region_columns = ['chr', 'start', 'end', 'startCpG',
                  'endCpG', 'target', 'region', 'lenCpG',
                  'bp', 'tg_mean', 'bg_mean', 'delta_means',
                  'delta_quants', 'delta_maxmin', 'ttest',
                  'direction', 'gene_name', 'gene_type',
                  'gene_level', 'gene_id', 'strand',
                  'gene_num']
split_columns = ['gene_name_split', 'gene_type_split', 'gene_level_split',
                 'gene_id_split', 'strand_split']

df_perGene = pd.DataFrame(new_rows, columns=region_columns + split_columns)

In [41]:
# first five per-gene rows
df_perGene.head(5)

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,gene_type,gene_level,gene_id,strand,gene_num,gene_name_split,gene_type_split,gene_level_split,gene_id_split,strand_split
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,lncRNA,2,ENSG00000238009.6,-,1,ENSG00000238009(100.0)(0.94),lncRNA,2,ENSG00000238009.6,-
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,lncRNA,2,ENSG00000238009.6,-,1,ENSG00000238009(100.0)(1.05),lncRNA,2,ENSG00000238009.6,-
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,lncRNA;processed_pseudogene,2;1,ENSG00000238009.6;ENSG00000233750.3,-;+,2,ENSG00000238009(100.0)(0.46),lncRNA,2,ENSG00000238009.6,-
3,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,lncRNA;processed_pseudogene,2;1,ENSG00000238009.6;ENSG00000233750.3,-;+,2,CICP27(100.0)(5.41),processed_pseudogene,1,ENSG00000233750.3,+
4,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,transcribed_processed_pseudogene,1,ENSG00000228463.10,-,1,ENSG00000228463(100.0)(1.78),transcribed_processed_pseudogene,1,ENSG00000228463.10,-


### Split out coverage percentage from gene names

In [42]:
# Break each "name(region %)(gene %)" label in gene_name_split into three columns:
# field 0 is the gene name, field 1 the region coverage, field 2 the gene coverage
coverage_fields = {'gene_name_cleaned': 0, 'region_coverage': 1, 'gene_coverage': 2}
for new_column, position in coverage_fields.items():
    df_perGene[new_column] = df_perGene['gene_name_split'].apply(
        lambda label, position=position: label.split("(")[position].replace(")", "")
    )

df_perGene.head(10)

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,strand,gene_num,gene_name_split,gene_type_split,gene_level_split,gene_id_split,strand_split,gene_name_cleaned,region_coverage,gene_coverage
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,-,1,ENSG00000238009(100.0)(0.94),lncRNA,2.0,ENSG00000238009.6,-,ENSG00000238009,100.0,0.94
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,-,1,ENSG00000238009(100.0)(1.05),lncRNA,2.0,ENSG00000238009.6,-,ENSG00000238009,100.0,1.05
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,-;+,2,ENSG00000238009(100.0)(0.46),lncRNA,2.0,ENSG00000238009.6,-,ENSG00000238009,100.0,0.46
3,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,-;+,2,CICP27(100.0)(5.41),processed_pseudogene,1.0,ENSG00000233750.3,+,CICP27,100.0,5.41
4,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,-,1,ENSG00000228463(100.0)(1.78),transcribed_processed_pseudogene,1.0,ENSG00000228463.10,-,ENSG00000228463,100.0,1.78
5,chr1,527011,528180,4884,4906,colon_cancer,chr1:527011-528180,22,1169,0.435,...,,0,NA(0)(0),,,,,,0.0,0.0
6,chr1,588665,588858,5055,5062,colon_cancer,chr1:588665-588858,7,193,0.122,...,-;+,2,ENSG00000230021(100.0)(0.08),transcribed_processed_pseudogene,2.0,ENSG00000230021.10,-,ENSG00000230021,100.0,0.08
7,chr1,588665,588858,5055,5062,colon_cancer,chr1:588665-588858,7,193,0.122,...,-;+,2,ENSG00000235146(100.0)(2.7),lncRNA,2.0,ENSG00000235146.2,+,ENSG00000235146,100.0,2.7
8,chr1,598963,599165,5342,5347,colon_cancer,chr1:598963-599165,5,202,0.293,...,-,1,ENSG00000230021(100.0)(0.08),transcribed_processed_pseudogene,2.0,ENSG00000230021.10,-,ENSG00000230021,100.0,0.08
9,chr1,603632,604015,5406,5412,colon_cancer,chr1:603632-604015,6,383,0.3,...,-,1,ENSG00000230021(100.0)(0.16),transcribed_processed_pseudogene,2.0,ENSG00000230021.10,-,ENSG00000230021,100.0,0.16


In [43]:
# list the per-gene column names before they are cleaned up and reordered
df_perGene.columns

Index(['chr', 'start', 'end', 'startCpG', 'endCpG', 'target', 'region',
       'lenCpG', 'bp', 'tg_mean', 'bg_mean', 'delta_means', 'delta_quants',
       'delta_maxmin', 'ttest', 'direction', 'gene_name', 'gene_type',
       'gene_level', 'gene_id', 'strand', 'gene_num', 'gene_name_split',
       'gene_type_split', 'gene_level_split', 'gene_id_split', 'strand_split',
       'gene_name_cleaned', 'region_coverage', 'gene_coverage'],
      dtype='object')

### Clean and reorganize columns of df

In [44]:
# Tidy the per-gene frame once the annotation has been confirmed:
#   1. drop the ';'-joined region-level annotation columns and the raw split label
#   2. give the per-gene columns their final names
#   3. put the columns in their final order

# columns removed in step 1
joined_annotation_columns = ['gene_name', 'gene_type', 'gene_level',
                             'gene_id', 'strand', 'gene_name_split']

# old name -> final name (step 2)
final_names = {'gene_name_cleaned': 'gene_name',
               'gene_id_split': 'gene_ID',
               'strand_split': 'strand',
               'gene_type_split': 'gene_type',
               'gene_level_split': 'gene_level',
               'gene_num': 'num_genes',
               'gene_coverage': 'prop_gene_cov_region',
               'region_coverage': 'prop_region_cov_gene'}

# final column order (step 3)
final_order = ['chr', 'start', 'end', 'startCpG',
               'endCpG', 'target', 'region', 'lenCpG',
               'bp', 'tg_mean', 'bg_mean', 'delta_means',
               'delta_quants', 'delta_maxmin', 'ttest',
               'direction', 'num_genes', 'gene_name',
               'prop_gene_cov_region', 'prop_region_cov_gene',
               'gene_type', 'gene_level', 'gene_ID', 'strand']

df_perGene = (
    df_perGene
    .drop(columns=joined_annotation_columns)
    .rename(columns=final_names)
    .loc[:, final_order]
)

df_perGene.head()

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,ttest,direction,num_genes,gene_name,prop_gene_cov_region,prop_region_cov_gene,gene_type,gene_level,gene_ID,strand
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,0.126,U,1,ENSG00000238009,0.94,100.0,lncRNA,2,ENSG00000238009.6,-
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,0.146,U,1,ENSG00000238009,1.05,100.0,lncRNA,2,ENSG00000238009.6,-
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,0.0761,U,2,ENSG00000238009,0.46,100.0,lncRNA,2,ENSG00000238009.6,-
3,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,0.0761,U,2,CICP27,5.41,100.0,processed_pseudogene,1,ENSG00000233750.3,+
4,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,0.164,U,1,ENSG00000228463,1.78,100.0,transcribed_processed_pseudogene,1,ENSG00000228463.10,-


In [45]:
# keep only the per-gene rows whose ttest value is below 0.05 (the significant DMRs)
is_significant = df_perGene['ttest'] < 0.05
dmr = df_perGene[is_significant]

In [46]:
# final look at the region-level frame (one row per region) before saving
df.head(5)

Unnamed: 0,chr,start,end,startCpG,endCpG,target,region,lenCpG,bp,tg_mean,...,delta_maxmin,ttest,direction,gene_name,gene_type,gene_level,gene_id,strand,genic,gene_num
0,chr1,92977,93393,1045,1051,colon_cancer,chr1:92977-93393,6,416,0.311,...,0.33,0.126,U,ENSG00000238009(100.0)(0.94),lncRNA,2,ENSG00000238009.6,-,intragenic,1
1,chr1,99252,99720,1094,1101,colon_cancer,chr1:99252-99720,7,468,0.26,...,0.314,0.146,U,ENSG00000238009(100.0)(1.05),lncRNA,2,ENSG00000238009.6,-,intragenic,1
2,chr1,132975,133181,1336,1344,colon_cancer,chr1:132975-133181,8,206,0.633,...,0.344,0.0761,U,ENSG00000238009(100.0)(0.46);CICP27(100.0)(5.41),lncRNA;processed_pseudogene,2;1,ENSG00000238009.6;ENSG00000233750.3,-;+,intragenic,2
3,chr1,275815,277626,2790,2801,colon_cancer,chr1:275815-277626,11,1811,0.236,...,0.375,0.164,U,ENSG00000228463(100.0)(1.78),transcribed_processed_pseudogene,1,ENSG00000228463.10,-,intragenic,1
4,chr1,527011,528180,4884,4906,colon_cancer,chr1:527011-528180,22,1169,0.435,...,0.375,0.0927,U,NA(0)(0),,,,,intergenic,0


### Save files

df has columns: ['chr', 'start', 'end', 'startCpG', 'endCpG', 'target', 'region',
                'lenCpG', 'bp', 'tg_mean', 'bg_mean', 'delta_means', 'delta_quants',
                'delta_maxmin', 'ttest', 'direction', 'gene_name', 'gene_type',
                'gene_level', 'gene_id', 'strand', 'genic', 'gene_num']

In [47]:
# write the three tables as tab-separated files, with a header row and without the index
exports = (
    (df_perGene, output_file_per_gene),
    (df, output_file),
    (dmr, dmr_output_file),
)
for frame, destination in exports:
    frame.to_csv(destination, sep="\t", header=True, index=False)

# confirm what was written where
print(output_file + " with all regions (partial annotation/no dups) has been created")
print(output_file_per_gene + " with all regions (dup regions to split genes) has been created")
print(dmr_output_file + " with only significant regions (dup regions with annotations) has been created")

/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated.tsv with all regions (partial annotation/no dups) has been created
/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated_per_gene.tsv with all regions (dup regions to split genes) has been created
/Users/meghansleeper/Desktop/MATH615/data/colon_cancer_markers_annotated_dmrs.tsv with only significant regions (dup regions with annotations) has been created


## Misc. code chunks from dev

In [None]:
# # display dmr dataframe where gene_num is greater than 1
# df.loc[(df['gene_num'] > 1)]

# # return rows where feature is exon in df_perGene
# df_perGene[df_perGene.feature=='exon'].tail(25)

## skip if you want to keep regions with no genes
## Transforms the data frame and removes anything that returned NA
## Remove all the intervals that do not overlap with genes
# df = df[df['gene_name'] != "NA(0)(0)"].reset_index(drop=True)

# # display items in list of lists
# print(new_rows[0:5])

# # listing column names
# column_names = list(df.columns.values)
# print(column_names)

# # exporting top 100 regions with lowest p-values for annotation in browser (UCSC Genome Browser has gencode V44 annotation)
# df.sort_values(by=['ttest'], inplace=True)
# dmr_exp = df.loc[(df['ttest']<0.050)].head(100)
# dmr_exp = dmr_exp.drop(['startCpG', 'endCpG', 'target', 'bp',
#                    'lenCpG', 'region', 'tg_mean', 'bg_mean', 
#                    'delta_means', 'delta_quants', 'delta_maxmin',
#                    'ttest', 'direction'], axis=1).head(100)
# dmr_exp.to_csv('top100_colon_markers.tsv', sep="\t", header=False, index=False) 
# dmr_exp.head()

## Questions for keller

What do the various chromosome labels mean? 
Are these contigs that are not included in the chromosome-level reference assembly?
    Does `M` refer to mitochondrial?

In [8]:
# list every distinct chromosome/contig label in the full annotation;
# for interpreting the labels see https://genome.ucsc.edu/cgi-bin/hgTracks?chromInfoPage=
annot_df['chr'].unique()

array(['GL000009.2', 'GL000194.1', 'GL000195.1', 'GL000205.2',
       'GL000209.2', 'GL000213.1', 'GL000216.2', 'GL000218.1',
       'GL000219.1', 'GL000220.1', 'GL000225.1', 'GL000250.2',
       'GL000251.2', 'GL000252.2', 'GL000253.2', 'GL000254.2',
       'GL000255.2', 'GL000256.2', 'GL000257.2', 'GL000258.2',
       'GL339449.2', 'GL383518.1', 'GL383519.1', 'GL383520.2',
       'GL383521.1', 'GL383522.1', 'GL383526.1', 'GL383527.1',
       'GL383528.1', 'GL383531.1', 'GL383532.1', 'GL383533.1',
       'GL383534.2', 'GL383539.1', 'GL383540.1', 'GL383541.1',
       'GL383542.1', 'GL383545.1', 'GL383546.1', 'GL383550.2',
       'GL383551.1', 'GL383552.1', 'GL383553.2', 'GL383554.1',
       'GL383555.2', 'GL383556.1', 'GL383557.1', 'GL383563.3',
       'GL383564.2', 'GL383565.1', 'GL383566.1', 'GL383567.1',
       'GL383569.1', 'GL383570.1', 'GL383571.1', 'GL383572.1',
       'GL383573.1', 'GL383574.1', 'GL383575.2', 'GL383576.1',
       'GL383577.2', 'GL383579.2', 'GL383580.2', 'GL383