# Extract overlapping genes with CMC

In [1]:
import functools
import numpy as np
import pandas as pd
from gtfparse import read_gtf

In [2]:
@functools.lru_cache()
def get_gtf(gtf_file):
    """Parse a GTF annotation file, memoising the result per path.

    Parameters
    ----------
    gtf_file : str
        Path handed to ``read_gtf``; it must be hashable because it is
        used as the ``lru_cache`` key.

    Returns
    -------
    Whatever ``read_gtf`` returns for ``gtf_file``. Repeated calls with
    the same path return the same cached object, so callers should not
    modify it in place.
    """
    parsed = read_gtf(gtf_file)
    return parsed


In [3]:
def gene_annotation(gtf_file, feature):
    """Return the core annotation columns for one GTF feature type.

    Parameters
    ----------
    gtf_file : str
        Path to the GTF file; parsed (and cached) through ``get_gtf``.
    feature : str
        Value of the ``feature`` column to keep (e.g. ``'gene'``).

    Returns
    -------
    An independent table with the columns ``gene_id``, ``gene_name``,
    ``gene_type``, ``seqname``, ``start``, ``end`` and ``strand`` for the
    rows whose ``feature`` equals ``feature``.
    """
    columns = ["gene_id", "gene_name", "gene_type",
               "seqname", "start", "end", "strand"]
    gtf = get_gtf(gtf_file)
    # Select rows and columns in a single .loc call and copy the result:
    # callers add columns to the returned frame, and doing that on a
    # chained slice raises SettingWithCopyWarning. The copy also keeps the
    # cached object from get_gtf untouched.
    # NOTE(review): assumes read_gtf yields a pandas DataFrame - confirm
    # for the installed gtfparse version.
    return gtf.loc[gtf["feature"] == feature, columns].copy()

In [4]:
# NOTE(review): absolute, machine-specific path; consider moving it to a
# configuration cell so the notebook can run elsewhere.
gtf_file = '/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf'
gtf_annot = gene_annotation(gtf_file, 'gene')
# Build the unversioned ID by dropping everything from the first "." of
# gene_id onward. `.assign` returns a new frame, so no column is written
# onto the filtered slice (which would raise SettingWithCopyWarning).
gtf_annot = gtf_annot.assign(
    ensemblID=gtf_annot["gene_id"].str.replace("\\..*", "", regex=True)
)

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_status', 'gene_name', 'level', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_status', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


## Male specific

In [14]:
# Attach the gene annotation to the male-specific table by joining on the
# unversioned Ensembl ID, then preview the first two rows.
cmc_file = "../../_m/cmc_all_deg_across_tissues_maleSpecific.csv"
df = pd.merge(gtf_annot, pd.read_csv(cmc_file), on='ensemblID')
df.head(n=2)

Unnamed: 0,gene_id,gene_name,gene_type,seqname,start,end,strand,ensemblID,Caudate,DLPFC,Hippocampus,CMC DLPFC
0,ENSG00000107404.18,DVL1,protein_coding,chr1,1335276,1349350,-,ENSG00000107404,1,0,0,0
1,ENSG00000175756.13,AURKAIP1,protein_coding,chr1,1373730,1375495,-,ENSG00000175756,1,0,0,0


### CMC DLPFC overlapping Caudate

In [15]:
# Rows whose 'CMC DLPFC' and 'Caudate' indicator columns both equal 1.
df.loc[df['CMC DLPFC'].eq(1) & df['Caudate'].eq(1)]

Unnamed: 0,gene_id,gene_name,gene_type,seqname,start,end,strand,ensemblID,Caudate,DLPFC,Hippocampus,CMC DLPFC
27,ENSG00000189410.11,SH2D5,protein_coding,chr1,20719732,20732837,-,ENSG00000189410,1,0,0,1
47,ENSG00000066185.12,ZMYND12,protein_coding,chr1,42430329,42456267,-,ENSG00000066185,1,0,0,1
116,ENSG00000162747.9,FCGR3B,protein_coding,chr1,161623196,161631963,-,ENSG00000162747,1,1,0,1
561,ENSG00000198833.6,UBE2J1,protein_coding,chr6,89326625,89352848,-,ENSG00000198833,1,0,0,1
584,ENSG00000196569.11,LAMA2,protein_coding,chr6,128883141,129516569,+,ENSG00000196569,1,0,0,1
871,ENSG00000260400.1,RP11-119F7.5,sense_overlapping,chr10,68698500,68700794,+,ENSG00000260400,1,0,0,1
1068,ENSG00000139372.14,TDG,protein_coding,chr12,103965804,103988874,+,ENSG00000139372,1,0,0,1
1280,ENSG00000103485.17,QPRT,protein_coding,chr16,29679008,29698699,+,ENSG00000103485,1,0,0,1
1437,ENSG00000256463.8,SALL3,protein_coding,chr18,78980275,79002677,+,ENSG00000256463,1,0,0,1


### CMC DLPFC overlapping DLPFC

In [16]:
# Rows whose 'CMC DLPFC' and 'DLPFC' indicator columns both equal 1.
df.loc[df['CMC DLPFC'].eq(1) & df['DLPFC'].eq(1)]

Unnamed: 0,gene_id,gene_name,gene_type,seqname,start,end,strand,ensemblID,Caudate,DLPFC,Hippocampus,CMC DLPFC
31,ENSG00000185436.11,IFNLR1,protein_coding,chr1,24154157,24187959,-,ENSG00000185436,0,1,0,1
74,ENSG00000273487.1,RP4-621B10.8,lincRNA,chr1,92189237,92190707,+,ENSG00000273487,0,1,0,1
89,ENSG00000231752.5,EMBP1,transcribed_unprocessed_pseudogene,chr1,121519112,121571892,+,ENSG00000231752,0,1,0,1
116,ENSG00000162747.9,FCGR3B,protein_coding,chr1,161623196,161631963,-,ENSG00000162747,1,1,0,1
437,ENSG00000145545.11,SRD5A1,protein_coding,chr5,6633343,6674386,+,ENSG00000145545,0,1,0,1


### CMC DLPFC overlapping Hippocampus

In [17]:
# Rows whose 'CMC DLPFC' and 'Hippocampus' indicator columns both equal 1.
df.loc[df['CMC DLPFC'].eq(1) & df['Hippocampus'].eq(1)]

Unnamed: 0,gene_id,gene_name,gene_type,seqname,start,end,strand,ensemblID,Caudate,DLPFC,Hippocampus,CMC DLPFC
382,ENSG00000118579.12,MED28,protein_coding,chr4,17614631,17634105,+,ENSG00000118579,0,0,1,1
670,ENSG00000135245.9,HILPDA,protein_coding,chr7,128455849,128458418,+,ENSG00000135245,0,0,1,1


### CMC DLPFC overlapping Caudate & DLPFC

In [18]:
# Rows where 'CMC DLPFC', 'Caudate' and 'DLPFC' are all equal to 1.
df.loc[df['CMC DLPFC'].eq(1) & df['Caudate'].eq(1) & df['DLPFC'].eq(1)]

Unnamed: 0,gene_id,gene_name,gene_type,seqname,start,end,strand,ensemblID,Caudate,DLPFC,Hippocampus,CMC DLPFC
116,ENSG00000162747.9,FCGR3B,protein_coding,chr1,161623196,161631963,-,ENSG00000162747,1,1,0,1


## Female specific

In [None]:
# Attach the gene annotation to the female-specific table by joining on the
# unversioned Ensembl ID, then preview the first two rows.
# NOTE(review): this cell is saved unexecuted (In [None]) and has no output,
# while the cells after it carry execution counts 15-18 and show the same
# rows as the male-specific section. Restart the kernel and run all cells so
# the tables below reflect this file.
cmc_file = "../../_m/cmc_all_deg_across_tissues_femaleSpecific.csv"
df = pd.merge(gtf_annot, pd.read_csv(cmc_file), on='ensemblID')
df.head(n=2)

### CMC DLPFC overlapping Caudate

In [15]:
# Rows whose 'CMC DLPFC' and 'Caudate' indicator columns both equal 1.
# NOTE(review): the saved output of this cell is identical to the
# male-specific result and looks stale; re-run after loading the
# female-specific table.
df.loc[df['CMC DLPFC'].eq(1) & df['Caudate'].eq(1)]

Unnamed: 0,gene_id,gene_name,gene_type,seqname,start,end,strand,ensemblID,Caudate,DLPFC,Hippocampus,CMC DLPFC
27,ENSG00000189410.11,SH2D5,protein_coding,chr1,20719732,20732837,-,ENSG00000189410,1,0,0,1
47,ENSG00000066185.12,ZMYND12,protein_coding,chr1,42430329,42456267,-,ENSG00000066185,1,0,0,1
116,ENSG00000162747.9,FCGR3B,protein_coding,chr1,161623196,161631963,-,ENSG00000162747,1,1,0,1
561,ENSG00000198833.6,UBE2J1,protein_coding,chr6,89326625,89352848,-,ENSG00000198833,1,0,0,1
584,ENSG00000196569.11,LAMA2,protein_coding,chr6,128883141,129516569,+,ENSG00000196569,1,0,0,1
871,ENSG00000260400.1,RP11-119F7.5,sense_overlapping,chr10,68698500,68700794,+,ENSG00000260400,1,0,0,1
1068,ENSG00000139372.14,TDG,protein_coding,chr12,103965804,103988874,+,ENSG00000139372,1,0,0,1
1280,ENSG00000103485.17,QPRT,protein_coding,chr16,29679008,29698699,+,ENSG00000103485,1,0,0,1
1437,ENSG00000256463.8,SALL3,protein_coding,chr18,78980275,79002677,+,ENSG00000256463,1,0,0,1


### CMC DLPFC overlapping DLPFC

In [16]:
# Rows whose 'CMC DLPFC' and 'DLPFC' indicator columns both equal 1.
# NOTE(review): the saved output of this cell is identical to the
# male-specific result and looks stale; re-run after loading the
# female-specific table.
df.loc[df['CMC DLPFC'].eq(1) & df['DLPFC'].eq(1)]

Unnamed: 0,gene_id,gene_name,gene_type,seqname,start,end,strand,ensemblID,Caudate,DLPFC,Hippocampus,CMC DLPFC
31,ENSG00000185436.11,IFNLR1,protein_coding,chr1,24154157,24187959,-,ENSG00000185436,0,1,0,1
74,ENSG00000273487.1,RP4-621B10.8,lincRNA,chr1,92189237,92190707,+,ENSG00000273487,0,1,0,1
89,ENSG00000231752.5,EMBP1,transcribed_unprocessed_pseudogene,chr1,121519112,121571892,+,ENSG00000231752,0,1,0,1
116,ENSG00000162747.9,FCGR3B,protein_coding,chr1,161623196,161631963,-,ENSG00000162747,1,1,0,1
437,ENSG00000145545.11,SRD5A1,protein_coding,chr5,6633343,6674386,+,ENSG00000145545,0,1,0,1


### CMC DLPFC overlapping Hippocampus

In [17]:
# Rows whose 'CMC DLPFC' and 'Hippocampus' indicator columns both equal 1.
# NOTE(review): the saved output of this cell is identical to the
# male-specific result and looks stale; re-run after loading the
# female-specific table.
df.loc[df['CMC DLPFC'].eq(1) & df['Hippocampus'].eq(1)]

Unnamed: 0,gene_id,gene_name,gene_type,seqname,start,end,strand,ensemblID,Caudate,DLPFC,Hippocampus,CMC DLPFC
382,ENSG00000118579.12,MED28,protein_coding,chr4,17614631,17634105,+,ENSG00000118579,0,0,1,1
670,ENSG00000135245.9,HILPDA,protein_coding,chr7,128455849,128458418,+,ENSG00000135245,0,0,1,1


### CMC DLPFC overlapping Caudate & DLPFC

In [18]:
# Rows where 'CMC DLPFC', 'Caudate' and 'DLPFC' are all equal to 1.
# NOTE(review): the saved output of this cell is identical to the
# male-specific result and looks stale; re-run after loading the
# female-specific table.
df.loc[df['CMC DLPFC'].eq(1) & df['Caudate'].eq(1) & df['DLPFC'].eq(1)]

Unnamed: 0,gene_id,gene_name,gene_type,seqname,start,end,strand,ensemblID,Caudate,DLPFC,Hippocampus,CMC DLPFC
116,ENSG00000162747.9,FCGR3B,protein_coding,chr1,161623196,161631963,-,ENSG00000162747,1,1,0,1
