# Feature summary analysis of gender only differential expression

In [1]:
import numpy as np
import pandas as pd

## Summary plots

In [2]:
def subset_deg(fn, feature):
    feature_dict = {
        'genes': 'Gene', 
        'transcripts': 'Transcript', 
        'exons': 'Exon', 
        'junctions': 'Junction'}
    df = pd.read_csv(fn, sep='\t')
    if feature == 'transcripts':
        df['Feature'] = df.transcript_id
        df['ensemblID'] = df.gene_id.str.replace('\\.\d+', '')
        df.rename(columns={'gene_name': 'Symbol'}, inplace=True)
    elif feature == 'genes':
        df['Feature'] = df.gencodeID
    else:
        df['Feature'] = df['Unnamed: 0']
    df = df[['Feature', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']].copy()
    df['Type'] = feature_dict[feature]
    return df


def extract_deg(feature):
    config = {
        'CvD': '../../_m/%s/diffExpr_CvD_sex_FDR05.txt' % feature, 
        'CvH': '../../_m/%s/diffExpr_CvH_sex_FDR05.txt' % feature,
        'DvH': '../../_m/%s/diffExpr_DvH_sex_FDR05.txt' % feature,
    }
    cvd = subset_deg(config['CvD'], feature)
    cvd['Comparison'] = 'CvD'
    cvh = subset_deg(config['CvH'], feature)
    cvh['Comparison'] = 'CvH'
    dvh = subset_deg(config['DvH'], feature)
    dvh['Comparison'] = 'DvH'
    return cvd, cvh, dvh

### Genes

In [3]:
feature = 'genes'
genes_cvd, genes_cvh, genes_dvh = extract_deg(feature)
genes = pd.concat([genes_cvd, genes_cvh, genes_dvh])
genes.head(2)

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
0,ENSG00000184991.2,TTTY13,ENSG00000184991,-4.413303,7.363061999999999e-135,Gene,CvD
1,ENSG00000092377.13,TBL1Y,ENSG00000092377,-3.622407,3.4772210000000005e-75,Gene,CvD


In [19]:
shared = set(genes_cvd.Feature) & set(genes_cvh.Feature)
print("There are %d genes that interact for caudate between DLPFC and Hippocampus!" % len(shared))
genes_cvd.set_index('Feature').loc[np.array(list(shared)), :]

There are 17 genes that interact for caudate between DLPFC and Hippocampus!


Unnamed: 0_level_0,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000277438.1,,ENSG00000277438,0.904521,4.667421e-07,Gene,CvD
ENSG00000006757.11,PNPLA4,ENSG00000006757,0.332948,5.803679e-16,Gene,CvD
ENSG00000273906.1,,ENSG00000273906,-3.570382,8.796435000000001e-66,Gene,CvD
ENSG00000185275.6,CD24P4,ENSG00000185275,1.31107,3.355042e-07,Gene,CvD
ENSG00000252766.1,RNU6-255P,ENSG00000252766,1.355991,1.070528e-13,Gene,CvD
ENSG00000228223.2,HCG11,ENSG00000228223,0.134635,0.04923225,Gene,CvD
ENSG00000173809.16,TDRD12,ENSG00000173809,-0.435453,0.005884162,Gene,CvD
ENSG00000099715.14,PCDH11Y,ENSG00000099715,2.472148,5.569986e-31,Gene,CvD
ENSG00000197181.11,PIWIL2,ENSG00000197181,-0.360354,0.02658008,Gene,CvD
ENSG00000146674.14,IGFBP3,ENSG00000146674,0.92783,7.779732e-12,Gene,CvD


In [20]:
shared = set(genes_cvd.Feature) & set(genes_dvh.Feature)
print("There are %d genes that interact for DLPFC between caudate and Hippocampus!" % len(shared))
genes_cvd.set_index('Feature').loc[np.array(list(shared)), :]

There are 19 genes that interact for DLPFC between caudate and Hippocampus!


Unnamed: 0_level_0,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000006757.11,PNPLA4,ENSG00000006757,0.332948,5.803679e-16,Gene,CvD
ENSG00000234449.2,,ENSG00000234449,0.37042,0.005161683,Gene,CvD
ENSG00000099725.14,PRKY,ENSG00000099725,0.919332,0.0001123099,Gene,CvD
ENSG00000012817.15,KDM5D,ENSG00000012817,1.016437,4.800939e-06,Gene,CvD
ENSG00000131002.11,TXLNGY,ENSG00000131002,0.834063,0.004644665,Gene,CvD
ENSG00000239893.1,ZNF736P9Y,ENSG00000239893,-3.170469,8.255369e-34,Gene,CvD
ENSG00000227494.2,USP9YP14,ENSG00000227494,1.410983,3.137787e-12,Gene,CvD
ENSG00000260197.1,,ENSG00000260197,0.602172,0.009237491,Gene,CvD
ENSG00000273906.1,,ENSG00000273906,-3.570382,8.796435000000001e-66,Gene,CvD
ENSG00000185275.6,CD24P4,ENSG00000185275,1.31107,3.355042e-07,Gene,CvD


In [21]:
shared = set(genes_cvh.Feature) & set(genes_dvh.Feature)
print("There are %d genes that interact for Hippocampus between caudate and DLPFC!" % len(shared))
genes_cvh.set_index('Feature').loc[np.array(list(shared)), :]

There are 4 genes that interact for Hippocampus between caudate and DLPFC!


Unnamed: 0_level_0,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000092377.13,TBL1Y,ENSG00000092377,-2.836381,7.858324e-52,Gene,CvH
ENSG00000006757.11,PNPLA4,ENSG00000006757,0.1581,0.01150521,Gene,CvH
ENSG00000273906.1,,ENSG00000273906,-2.790511,9.669762e-45,Gene,CvH
ENSG00000185275.6,CD24P4,ENSG00000185275,-0.796622,0.002324493,Gene,CvH


### Transcripts

In [4]:
feature = 'transcripts'
trans_cvd, trans_cvh, trans_dvh = extract_deg(feature)
trans = pd.concat([trans_cvd, trans_cvh, trans_dvh])
trans.head(2)

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
0,ENST00000418290.1,CDY4P,ENSG00000228411,-5.80182,1.556539e-291,Transcript,CvD
1,ENST00000400275.2,PSMA6P1,ENSG00000215414,-5.655728,5.081686e-288,Transcript,CvD


### Exons

In [5]:
feature = 'exons'
exons_cvd, exons_cvh, exons_dvh = extract_deg(feature)
exons = pd.concat([exons_cvd, exons_cvh, exons_dvh])
exons.head(2)

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
0,e1181198,TTTY13,ENSG00000184991,-4.212665,1.326308e-129,Exon,CvD
1,e1179056,TBL1Y,ENSG00000092377,-4.239987,2.409323e-116,Exon,CvD


### Junctions

In [6]:
feature = 'junctions'
juncs_cvd, juncs_cvh, juncs_dvh = extract_deg(feature)
juncs = pd.concat([juncs_cvd, juncs_cvh, juncs_dvh])
juncs.head(2)

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type,Comparison
0,chrY:5002759-5031905(+),PCDH11Y,ENSG00000099715,2.939596,2.2634580000000002e-31,Junction,CvD
1,chrY:5100708-5104343(+),PCDH11Y,ENSG00000099715,3.039568,2.4638960000000002e-29,Junction,CvD


### Merge data frame

In [11]:
df = pd.concat([genes, trans, exons, juncs], axis=0)
df.to_csv('region_interaction_4features.txt', sep='\t', index=False)
df.shape

(3116, 7)

## DE summary

### Caudate and DLPFC

#### DE (feature)

In [12]:
gg = len(set(genes_cvd['Feature']))
tt = len(set(trans_cvd['Feature']))
ee = len(set(exons_cvd['Feature']))
jj = len(set(juncs_cvd['Feature']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		421
Transcript:	433
Exon:		1451
Junction:	208


#### DE (EnsemblID)

In [13]:
gg = len(set(genes_cvd['ensemblID']))
tt = len(set(trans_cvd['ensemblID']))
ee = len(set(exons_cvd['ensemblID']))
jj = len(set(juncs_cvd['ensemblID']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		421
Transcript:	377
Exon:		447
Junction:	92


#### DE (Gene Symbol)

In [14]:
gg = len(set(genes_cvd['Symbol']))
tt = len(set(trans_cvd['Symbol']))
ee = len(set(exons_cvd['Symbol']))
jj = len(set(juncs_cvd['Symbol']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		363
Transcript:	377
Exon:		408
Junction:	92


### Caudate and Hippocampus

#### DE (feature)

In [15]:
gg = len(set(genes_cvh['Feature']))
tt = len(set(trans_cvh['Feature']))
ee = len(set(exons_cvh['Feature']))
jj = len(set(juncs_cvh['Feature']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		22
Transcript:	170
Exon:		121
Junction:	48


#### DE (EnsemblID)

In [16]:
gg = len(set(genes_cvh['ensemblID']))
tt = len(set(trans_cvh['ensemblID']))
ee = len(set(exons_cvh['ensemblID']))
jj = len(set(juncs_cvh['ensemblID']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		22
Transcript:	123
Exon:		35
Junction:	16


#### DE (Gene Symbol)

In [17]:
gg = len(set(genes_cvh['Symbol']))
tt = len(set(trans_cvh['Symbol']))
ee = len(set(exons_cvh['Symbol']))
jj = len(set(juncs_cvh['Symbol']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		18
Transcript:	123
Exon:		33
Junction:	16


### DLPFC and Hippocampus

#### DE (feature)

In [18]:
gg = len(set(genes_dvh['Feature']))
tt = len(set(trans_dvh['Feature']))
ee = len(set(exons_dvh['Feature']))
jj = len(set(juncs_dvh['Feature']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		25
Transcript:	28
Exon:		130
Junction:	59


#### DE (EnsemblID)

In [19]:
gg = len(set(genes_dvh['ensemblID']))
tt = len(set(trans_dvh['ensemblID']))
ee = len(set(exons_dvh['ensemblID']))
jj = len(set(juncs_dvh['ensemblID']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		25
Transcript:	19
Exon:		24
Junction:	16


#### DE (Gene Symbol)

In [20]:
gg = len(set(genes_dvh['Symbol']))
tt = len(set(trans_dvh['Symbol']))
ee = len(set(exons_dvh['Symbol']))
jj = len(set(juncs_dvh['Symbol']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		22
Transcript:	19
Exon:		23
Junction:	16
