# Extract genes overlapping with CMC

In [1]:
import functools
import numpy as np
import pandas as pd
from gtfparse import read_gtf

In [2]:
@functools.lru_cache(maxsize=128)
def get_gtf(gtf_file):
    """Parse a GTF file with ``read_gtf`` and memoize the result per path.

    Parameters
    ----------
    gtf_file : hashable (a path string in this notebook)
        Location of the GTF annotation file; it is also the cache key.

    Returns
    -------
    Whatever ``read_gtf`` returns for ``gtf_file``. Repeated calls with the
    same argument hand back the very same cached object, so callers should
    not modify it in place.
    """
    parsed_annotation = read_gtf(gtf_file)
    return parsed_annotation


In [3]:
def gene_annotation(gtf_file, feature):
    """Return the core annotation columns for one GTF feature type.

    Parameters
    ----------
    gtf_file : str
        Path to the GTF file; forwarded to ``get_gtf``.
    feature : str
        Value matched against the ``feature`` column (e.g. ``'gene'``).

    Returns
    -------
    pandas.DataFrame
        Rows whose ``feature`` equals ``feature``, restricted to the columns
        ``gene_id, gene_name, gene_type, seqname, start, end, strand``.
        The result is an independent copy, so callers can add or modify
        columns without triggering ``SettingWithCopyWarning`` and without
        touching the frame held by ``get_gtf``.
    """
    annotation_columns = ["gene_id", "gene_name", "gene_type",
                          "seqname", "start", "end", "strand"]
    gtf0 = get_gtf(gtf_file)
    # Single .loc selection instead of chained indexing, then an explicit
    # copy: the caller assigns a new column on the returned frame.
    return gtf0.loc[gtf0["feature"] == feature, annotation_columns].copy()

In [4]:
# GENCODE v25 annotation. NOTE(review): absolute path tied to the original
# compute environment; adjust when running elsewhere.
gtf_file = '/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf'
# Gene-level records plus an `ensemblID` column holding gene_id with everything
# from the first "." onward removed. assign() returns a new frame, so nothing is
# written into the object returned by gene_annotation (no chained-assignment
# warning, and the cell can be re-run safely).
gtf_annot = gene_annotation(gtf_file, 'gene').assign(
    ensemblID=lambda annot: annot["gene_id"].str.replace("\\..*", "", regex=True)
)

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_status', 'gene_name', 'level', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_status', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


## Male specific

In [5]:
# Male-specific table: join the gene annotation to the CSV on the
# version-less Ensembl ID (inner join, which is merge's default).
cmc_file = "../../_m/cmc_all_deg_across_tissues_maleSpecific.csv"
df = gtf_annot.merge(
    right=pd.read_csv(cmc_file),
    how='inner',
    on='ensemblID',
)
df.head(n=2)

Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC
0,ENSG00000272512.1,RP11-54O7.17,lincRNA,chr1,995966,998051,-,ENSG00000272512,1386,ENSG00000272512.1,RP11-54O7.17,chr1,lincRNA,1,0,0,0
1,ENSG00000107404.18,DVL1,protein_coding,chr1,1335276,1349350,-,ENSG00000107404,4087,ENSG00000107404.18,DVL1,chr1,protein_coding,1,0,0,0


### CMC DLPFC overlapping Caudate

In [6]:
# Rows flagged (== 1) in both the 'CMC DLPFC' and 'Caudate' columns.
df.loc[df['CMC DLPFC'].eq(1) & df['Caudate'].eq(1)]

INFO:numexpr.utils:Note: NumExpr detected 60 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC
29,ENSG00000189410.11,SH2D5,protein_coding,chr1,20719732,20732837,-,ENSG00000189410,30978,ENSG00000189410.11,SH2D5,chr1,protein_coding,1,0,0,1
50,ENSG00000066185.12,ZMYND12,protein_coding,chr1,42430329,42456267,-,ENSG00000066185,64447,ENSG00000066185.12,ZMYND12,chr1,protein_coding,1,0,0,1
286,ENSG00000115170.13,ACVR1,protein_coding,chr2,157736444,157875862,-,ENSG00000115170,341965,ENSG00000115170.13,ACVR1,chr2,protein_coding,1,0,0,1
1154,ENSG00000260400.1,RP11-119F7.5,sense_overlapping,chr10,68698500,68700794,+,ENSG00000260400,1260519,ENSG00000260400.1,RP11-119F7.5,chr10,sense_overlapping,1,0,0,1
1397,ENSG00000139372.14,TDG,protein_coding,chr12,103965804,103988874,+,ENSG00000139372,1584469,ENSG00000139372.14,TDG,chr12,protein_coding,1,0,0,1
1870,ENSG00000256463.8,SALL3,protein_coding,chr18,78980275,79002677,+,ENSG00000256463,2184098,ENSG00000256463.8,SALL3,chr18,protein_coding,1,0,0,1
2116,ENSG00000100116.16,GCAT,protein_coding,chr22,37807905,37817176,+,ENSG00000100116,2469692,ENSG00000100116.16,GCAT,chr22,protein_coding,1,0,0,1
2127,ENSG00000100266.18,PACSIN2,protein_coding,chr22,42835412,43015145,-,ENSG00000100266,2479888,ENSG00000100266.18,PACSIN2,chr22,protein_coding,1,0,0,1
2193,ENSG00000134597.14,RBMX2,protein_coding,chrX,130401969,130413343,+,ENSG00000134597,2550700,ENSG00000134597.14,RBMX2,chrX,protein_coding,1,0,0,1


### CMC DLPFC overlapping DLPFC

In [7]:
# Rows flagged (== 1) in both the 'CMC DLPFC' and 'DLPFC' columns.
df.loc[df['CMC DLPFC'].eq(1) & df['DLPFC'].eq(1)]

Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC
80,ENSG00000171488.14,LRRC8C,protein_coding,chr1,89633072,89769903,+,ENSG00000171488,104025,ENSG00000171488.14,LRRC8C,chr1,protein_coding,0,1,0,1
104,ENSG00000231752.5,EMBP1,transcribed_unprocessed_pseudogene,chr1,121519112,121571892,+,ENSG00000231752,128562,ENSG00000231752.5,EMBP1,chr1,transcribed_unprocessed_pseudogene,0,1,0,1
1537,ENSG00000156414.18,TDRD9,protein_coding,chr14,103928462,104052667,+,ENSG00000156414,1748741,ENSG00000156414.18,TDRD9,chr14,protein_coding,0,1,0,1


### CMC DLPFC overlapping Hippocampus

In [8]:
# Rows flagged (== 1) in both the 'CMC DLPFC' and 'Hippocampus' columns.
df.loc[df['CMC DLPFC'].eq(1) & df['Hippocampus'].eq(1)]

Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC


### CMC DLPFC overlapping Caudate & DLPFC

In [9]:
# Rows flagged (== 1) in all three of 'CMC DLPFC', 'Caudate' and 'DLPFC'.
df.loc[df[['CMC DLPFC', 'Caudate', 'DLPFC']].eq(1).all(axis=1)]

Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC


## Female specific

In [10]:
# Female-specific table: join the gene annotation to the CSV on the
# version-less Ensembl ID (inner join, which is merge's default).
# Note: this rebinds `cmc_file` and `df`, replacing the male-specific values.
cmc_file = "../../_m/cmc_all_deg_across_tissues_femaleSpecific.csv"
df = gtf_annot.merge(
    right=pd.read_csv(cmc_file),
    how='inner',
    on='ensemblID',
)
df.head(n=2)

Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC
0,ENSG00000272455.1,RP4-758J18.13,lincRNA,chr1,1409096,1410618,+,ENSG00000272455,4753,ENSG00000272455.1,RP4-758J18.13,chr1,lincRNA,0,0,0,1
1,ENSG00000142609.17,CFAP74,protein_coding,chr1,1921951,2003837,-,ENSG00000142609,7511,ENSG00000142609.17,CFAP74,chr1,protein_coding,0,0,0,1


### CMC DLPFC overlapping Caudate

In [11]:
# Rows flagged (== 1) in both the 'CMC DLPFC' and 'Caudate' columns.
df.loc[df['CMC DLPFC'].eq(1) & df['Caudate'].eq(1)]

Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC
188,ENSG00000249669.8,CARMN,lincRNA,chr5,149406689,149432835,+,ENSG00000249669,770108,ENSG00000249669.8,CARMN,chr5,lincRNA,1,0,0,1
483,ENSG00000167703.14,SLC43A2,protein_coding,chr17,1569267,1628886,-,ENSG00000167703,1979014,ENSG00000167703.14,SLC43A2,chr17,protein_coding,1,0,0,1
514,ENSG00000263006.6,ROCK1P1,transcribed_unprocessed_pseudogene,chr18,109065,122219,+,ENSG00000263006,2140998,ENSG00000263006.6,ROCK1P1,chr18,transcribed_unprocessed_pseudogene,1,0,0,1


### CMC DLPFC overlapping DLPFC

In [12]:
# Rows flagged (== 1) in both the 'CMC DLPFC' and 'DLPFC' columns.
df.loc[df['CMC DLPFC'].eq(1) & df['DLPFC'].eq(1)]

Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC


### CMC DLPFC overlapping Hippocampus

In [13]:
# Rows flagged (== 1) in both the 'CMC DLPFC' and 'Hippocampus' columns.
df.loc[df['CMC DLPFC'].eq(1) & df['Hippocampus'].eq(1)]

Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC


### CMC DLPFC overlapping Caudate & DLPFC

In [14]:
# Rows flagged (== 1) in all three of 'CMC DLPFC', 'Caudate' and 'DLPFC'.
df.loc[df[['CMC DLPFC', 'Caudate', 'DLPFC']].eq(1).all(axis=1)]

Unnamed: 0.1,gene_id_x,gene_name_x,gene_type_x,seqname_x,start,end,strand,ensemblID,Unnamed: 0,gene_id_y,gene_name_y,seqname_y,gene_type_y,Caudate,DLPFC,Hippocampus,CMC DLPFC
