# Feature summary analysis of region-by-sex interaction differential expression

In [1]:
import numpy as np
import pandas as pd

## Summary plots

In [11]:
def subset_deg(fn, feature):
    """Load one differential-expression table and keep the shared summary columns.

    Parameters
    ----------
    fn : str or path-like
        Tab-separated results file; its first column is used as the index.
    feature : str
        One of 'genes', 'transcripts', 'exons' or 'junctions'.
        - 'transcripts' files must provide `gene_id` and `gene_name` columns;
          `ensemblID` is derived from `gene_id` by removing every
          ".<digits>" run, and `gene_name` is renamed to `Symbol`.
        - 'genes' files must provide a `gencodeID` column, which becomes
          `Feature`.
        - Any other supported feature uses the index as `Feature`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val' and
        'Type' (the singular label of `feature`, e.g. 'Gene').

    Raises
    ------
    KeyError
        If `feature` is not a supported feature type (raised before the file
        is read), or if one of the summary columns is absent from the table.
    """
    feature_dict = {
        'genes': 'Gene',
        'transcripts': 'Transcript',
        'exons': 'Exon',
        'junctions': 'Junction'}
    # Resolve the label first so an unsupported feature fails before any I/O.
    feature_type = feature_dict[feature]
    df = pd.read_csv(fn, sep='\t', index_col=0)
    if feature == 'transcripts':
        df['Feature'] = df.index
        # Raw string: '\d' is an invalid escape sequence in a normal literal.
        df['ensemblID'] = df.gene_id.str.replace(r'\.\d+', '', regex=True)
        df = df.rename(columns={'gene_name': 'Symbol'})
    elif feature == 'genes':
        df['Feature'] = df.gencodeID
    else:
        df['Feature'] = df.index
    df = df[['Feature', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']].copy()
    df['Type'] = feature_type
    return df


def extract_deg(feature):
    """Load the CvD, CvH and DvH result tables for one feature type.

    Each table is read with `subset_deg` from
    '../../_m/<feature>/diffExpr_<comparison>_sex_FDR05.txt' and gets a
    'Comparison' column holding its comparison label.

    Returns a 3-tuple of DataFrames in the order (CvD, CvH, DvH).
    """
    paths = {
        'CvD': '../../_m/%s/diffExpr_CvD_sex_FDR05.txt' % feature,
        'CvH': '../../_m/%s/diffExpr_CvH_sex_FDR05.txt' % feature,
        'DvH': '../../_m/%s/diffExpr_DvH_sex_FDR05.txt' % feature,
    }
    tables = []
    # Explicit label order keeps the returned tuple as (CvD, CvH, DvH).
    for label in ('CvD', 'CvH', 'DvH'):
        table = subset_deg(paths[label], feature)
        table['Comparison'] = label
        tables.append(table)
    return tuple(tables)

### Genes

In [3]:
# Load the three gene-level comparison tables and stack them row-wise.
feature = 'genes'
genes_cvd, genes_cvh, genes_dvh = extract_deg(feature)
genes = pd.concat((genes_cvd, genes_cvh, genes_dvh), axis=0)
genes.head(n=2)

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
ENSG00000184991.2,ENSG00000184991.2,TTTY13,ENSG00000184991,-4.360657,5.00519e-133,Gene,CvD
ENSG00000092377.13,ENSG00000092377.13,TBL1Y,ENSG00000092377,-3.627633,6.609422e-76,Gene,CvD


In [4]:
# Gene IDs present in both the CvD and CvH result tables.
shared = set(genes_cvd.Feature) & set(genes_cvh.Feature)
print("There are %d genes that interact for caudate between DLPFC and Hippocampus!" % len(shared))
# Sort the IDs: iteration order of a set of strings changes between kernel
# sessions, which would reshuffle the displayed rows on every re-run.
genes_cvd.set_index('Feature').loc[sorted(shared), :]

There are 17 genes that interact for caudate between DLPFC and Hippocampus!


Unnamed: 0_level_0,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000228223.2,HCG11,ENSG00000228223,0.131063,0.04230495,Gene,CvD
ENSG00000185275.6,CD24P4,ENSG00000185275,1.269074,1.052913e-06,Gene,CvD
ENSG00000066629.16,EML1,ENSG00000066629,-0.222427,0.0001412821,Gene,CvD
ENSG00000092377.13,TBL1Y,ENSG00000092377,-3.627633,6.609422e-76,Gene,CvD
ENSG00000173809.16,TDRD12,ENSG00000173809,-0.462759,0.001872604,Gene,CvD
ENSG00000277438.1,,ENSG00000277438,0.919121,2.146899e-07,Gene,CvD
ENSG00000184991.2,TTTY13,ENSG00000184991,-4.360657,5.00519e-133,Gene,CvD
ENSG00000252766.1,RNU6-255P,ENSG00000252766,1.35357,1.141351e-13,Gene,CvD
ENSG00000232348.2,LINC00279,ENSG00000232348,-1.334907,5.75072e-13,Gene,CvD
ENSG00000099715.14,PCDH11Y,ENSG00000099715,2.51974,3.588357e-32,Gene,CvD


In [5]:
# Gene IDs present in both the CvD and DvH result tables.
shared = set(genes_cvd.Feature) & set(genes_dvh.Feature)
print("There are %d genes that interact for DLPFC between caudate and Hippocampus!" % len(shared))
# Sort the IDs: iteration order of a set of strings changes between kernel
# sessions, which would reshuffle the displayed rows on every re-run.
genes_cvd.set_index('Feature').loc[sorted(shared), :]

There are 20 genes that interact for DLPFC between caudate and Hippocampus!


Unnamed: 0_level_0,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000185275.6,CD24P4,ENSG00000185275,1.269074,1.052913e-06,Gene,CvD
ENSG00000129824.15,RPS4Y1,ENSG00000129824,0.809539,0.003393715,Gene,CvD
ENSG00000176728.7,TTTY14,ENSG00000176728,0.55917,0.03879491,Gene,CvD
ENSG00000067048.16,DDX3Y,ENSG00000067048,0.976405,3.043073e-05,Gene,CvD
ENSG00000099725.14,PRKY,ENSG00000099725,0.91271,0.0001195206,Gene,CvD
ENSG00000131002.11,TXLNGY,ENSG00000131002,0.843662,0.003393715,Gene,CvD
ENSG00000239893.1,ZNF736P9Y,ENSG00000239893,-3.128749,3.952339e-33,Gene,CvD
ENSG00000260197.1,,ENSG00000260197,0.604253,0.006546653,Gene,CvD
ENSG00000151923.17,TIAL1,ENSG00000151923,-0.090797,0.03656674,Gene,CvD
ENSG00000012817.15,KDM5D,ENSG00000012817,1.018661,4.159236e-06,Gene,CvD


In [6]:
# Gene IDs present in both the CvH and DvH result tables.
shared = set(genes_cvh.Feature) & set(genes_dvh.Feature)
print("There are %d genes that interact for Hippocampus between caudate and DLPFC!" % len(shared))
# Sort the IDs: iteration order of a set of strings changes between kernel
# sessions, which would reshuffle the displayed rows on every re-run.
genes_cvh.set_index('Feature').loc[sorted(shared), :]

There are 3 genes that interact for Hippocampus between caudate and DLPFC!


Unnamed: 0_level_0,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000273906.1,,ENSG00000273906,-2.789483,4.378328e-45,Gene,CvH
ENSG00000185275.6,CD24P4,ENSG00000185275,-0.8377,0.0006893577,Gene,CvH
ENSG00000092377.13,TBL1Y,ENSG00000092377,-2.825989,6.2755290000000005e-52,Gene,CvH


### Transcripts

In [9]:
# Load the three transcript-level comparison tables and stack them row-wise.
feature = 'transcripts'
trans_cvd, trans_cvh, trans_dvh = extract_deg(feature)
trans = pd.concat((trans_cvd, trans_cvh, trans_dvh), axis=0)
trans.head(n=2)

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
ENST00000418290.1,ENST00000418290.1,CDY4P,ENSG00000228411,-5.77571,6.693728e-291,Transcript,CvD
ENST00000400275.2,ENST00000400275.2,PSMA6P1,ENSG00000215414,-5.64374,1.051031e-288,Transcript,CvD


### Exons

In [12]:
# Load the three exon-level comparison tables and stack them row-wise.
feature = 'exons'
exons_cvd, exons_cvh, exons_dvh = extract_deg(feature)
exons = pd.concat((exons_cvd, exons_cvh, exons_dvh), axis=0)
exons.head(n=2)

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
e1181198,e1181198,TTTY13,ENSG00000184991,-4.158493,2.766886e-127,Exon,CvD
e1179056,e1179056,TBL1Y,ENSG00000092377,-4.247636,1.583091e-117,Exon,CvD


### Junctions

In [13]:
# Load the three junction-level comparison tables and stack them row-wise.
feature = 'junctions'
juncs_cvd, juncs_cvh, juncs_dvh = extract_deg(feature)
juncs = pd.concat((juncs_cvd, juncs_cvh, juncs_dvh), axis=0)
juncs.head(n=2)

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
chrY:5002759-5031905(+),chrY:5002759-5031905(+),PCDH11Y,ENSG00000099715,2.978971,1.7255700000000002e-32,Junction,CvD
chrY:5100708-5104343(+),chrY:5100708-5104343(+),PCDH11Y,ENSG00000099715,3.06875,2.870987e-30,Junction,CvD


### Merge data frame

In [14]:
# Stack all four feature-level tables and export them as one tab-separated
# file; the index is not written because 'Feature' already holds the IDs
# (for gene tables it is copied from `gencodeID` rather than from the index).
df = pd.concat((genes, trans, exons, juncs))
df.to_csv(
    'differential_expression_region_interaction_sex_4features.txt',
    sep='\t',
    index=False,
)
df.shape

(4507, 7)

## DE summary

### Caudate and DLPFC

#### DE (feature)

In [15]:
# Unique feature IDs in each CvD table (gene, transcript, exon, junction).
gg, tt, ee, jj = (
    len(set(table['Feature']))
    for table in (genes_cvd, trans_cvd, exons_cvd, juncs_cvd)
)

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj))


Gene:		668
Transcript:	642
Exon:		2254
Junction:	324


#### DE (EnsemblID)

In [16]:
# Unique Ensembl gene IDs in each CvD table (gene, transcript, exon, junction).
gg, tt, ee, jj = (
    len(set(table['ensemblID']))
    for table in (genes_cvd, trans_cvd, exons_cvd, juncs_cvd)
)

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj))


Gene:		668
Transcript:	569
Exon:		649
Junction:	147


#### DE (Gene Symbol)

In [17]:
# Unique gene symbols in each CvD table (gene, transcript, exon, junction).
gg, tt, ee, jj = (
    len(set(table['Symbol']))
    for table in (genes_cvd, trans_cvd, exons_cvd, juncs_cvd)
)

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj))


Gene:		564
Transcript:	569
Exon:		597
Junction:	147


### Caudate and Hippocampus

#### DE (feature)

In [18]:
# Unique feature IDs in each CvH table (gene, transcript, exon, junction).
gg, tt, ee, jj = (
    len(set(table['Feature']))
    for table in (genes_cvh, trans_cvh, exons_cvh, juncs_cvh)
)

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj))


Gene:		23
Transcript:	177
Exon:		129
Junction:	47


#### DE (EnsemblID)

In [19]:
# Unique Ensembl gene IDs in each CvH table (gene, transcript, exon, junction).
gg, tt, ee, jj = (
    len(set(table['ensemblID']))
    for table in (genes_cvh, trans_cvh, exons_cvh, juncs_cvh)
)

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj))


Gene:		23
Transcript:	129
Exon:		38
Junction:	17


#### DE (Gene Symbol)

In [20]:
# Unique gene symbols in each CvH table (gene, transcript, exon, junction).
gg, tt, ee, jj = (
    len(set(table['Symbol']))
    for table in (genes_cvh, trans_cvh, exons_cvh, juncs_cvh)
)

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj))


Gene:		19
Transcript:	129
Exon:		36
Junction:	17


### DLPFC and Hippocampus

#### DE (feature)

In [21]:
# Unique feature IDs in each DvH table (gene, transcript, exon, junction).
gg, tt, ee, jj = (
    len(set(table['Feature']))
    for table in (genes_dvh, trans_dvh, exons_dvh, juncs_dvh)
)

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj))


Gene:		26
Transcript:	28
Exon:		131
Junction:	58


#### DE (EnsemblID)

In [22]:
# Unique Ensembl gene IDs in each DvH table (gene, transcript, exon, junction).
gg, tt, ee, jj = (
    len(set(table['ensemblID']))
    for table in (genes_dvh, trans_dvh, exons_dvh, juncs_dvh)
)

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj))


Gene:		26
Transcript:	19
Exon:		24
Junction:	16


#### DE (Gene Symbol)

In [23]:
# Unique gene symbols in each DvH table (gene, transcript, exon, junction).
gg, tt, ee, jj = (
    len(set(table['Symbol']))
    for table in (genes_dvh, trans_dvh, exons_dvh, juncs_dvh)
)

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj))


Gene:		23
Transcript:	19
Exon:		23
Junction:	16
