# Feature summary of differential expression analysis

In [1]:
import numpy as np
import pandas as pd

## Summary plots

### Genes

In [2]:
genes = pd.read_csv('../../_m/genes/diffExpr_maleVfemale_full.txt', sep='\t', index_col=0)
genes = genes[(genes['adj.P.Val'] < 0.05)].sort_values('adj.P.Val')
genes['Feature'] = genes.index
genes = genes[['Feature', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']]
genes['Type'] = 'gene'
genes.head()

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type
ENSG00000229236.1,ENSG00000229236.1,TTTY10,ENSG00000229236,6.919904,5.186692e-243,gene
ENSG00000154620.5,ENSG00000154620.5,TMSB4Y,ENSG00000154620,7.017845,4.9420509999999995e-238,gene
ENSG00000226555.1,ENSG00000226555.1,AGKP1,ENSG00000226555,7.083112,9.807572999999999e-236,gene
ENSG00000176728.7,ENSG00000176728.7,TTTY14,ENSG00000176728,8.090491,4.895668e-231,gene
ENSG00000260197.1,ENSG00000260197.1,,ENSG00000260197,6.302909,8.726814e-229,gene


### Transcripts

In [3]:
trans = pd.read_csv('../../_m/transcripts/diffExpr_maleVfemale_full.txt', sep='\t', index_col=0)
trans = trans[(trans['adj.P.Val'] < 0.05)].sort_values('adj.P.Val')
trans['Feature'] = trans.index
trans['ensemblID'] = trans.gene_id.str.replace('\\.\d+', '', regex=True)
trans = trans[['Feature', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']]#.rename(columns={'gene_name': 'Symbol'})
trans['Type'] = 'transcript'
trans.head()

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type
ENST00000602495.1,ENST00000602495.1,XIST,ENSG00000229807,-7.591232,1.889639e-242,transcript
ENST00000416330.1,ENST00000416330.1,XIST,ENSG00000229807,-7.791829,7.035923999999999e-237,transcript
ENST00000429829.5,ENST00000429829.5,XIST,ENSG00000229807,-9.537955,6.043774e-225,transcript
ENST00000440408.5,ENST00000440408.5,TTTY15,ENSG00000233864,6.00699,3.1536850000000002e-208,transcript
ENST00000469599.6,ENST00000469599.6,KDM5D,ENSG00000012817,7.765403,5.9089090000000005e-179,transcript


### Exons

In [4]:
exons = pd.read_csv('../../_m/exons/diffExpr_maleVfemale_full.txt', sep='\t', index_col=0)
exons = exons[(exons['adj.P.Val'] < 0.05)].sort_values('adj.P.Val')
exons['Feature'] = exons.index
exons = exons[['Feature', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']]
exons['Type'] = 'exon'
exons.head()

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type
e1160419,e1160419,XIST,ENSG00000229807,-8.422475,3.426969e-266,exon
e1160408,e1160408,XIST,ENSG00000229807,-9.196433,5.9948369999999995e-266,exon
e1160412,e1160412,XIST,ENSG00000229807,-8.558761,8.170266e-259,exon
e1160425,e1160425,XIST,ENSG00000229807,-7.145158,9.697123e-259,exon
e1160415,e1160415,XIST,ENSG00000229807,-8.677185,1.53574e-258,exon


### Junctions

In [5]:
juncs = pd.read_csv('../../_m/junctions/diffExpr_maleVfemale_full.txt', sep='\t', index_col=0)
juncs = juncs[(juncs['adj.P.Val'] < 0.05)].sort_values('adj.P.Val')
juncs['Feature'] = juncs.index
juncs = juncs[['Feature', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']]
juncs['Type'] = 'junction'
juncs.head()

Unnamed: 0,Feature,Symbol,ensemblID,logFC,adj.P.Val,Type
chrX:73833375-73837439(-),chrX:73833375-73837439(-),XIST,ENSG00000229807,-8.507125,9.950701e-239,junction
chrX:73829232-73831065(-),chrX:73829232-73831065(-),XIST,ENSG00000229807,-8.765753,6.553399000000001e-231,junction
chrX:73837504-73841381(-),chrX:73837504-73841381(-),XIST,ENSG00000229807,-8.230007,1.415462e-230,junction
chrX:73831275-73833237(-),chrX:73831275-73833237(-),XIST,ENSG00000229807,-8.956222,8.371272e-226,junction
chrX:73822217-73826114(-),chrX:73822217-73826114(-),XIST,ENSG00000229807,-6.490331,6.618686e-213,junction


## DE summary

### DE (feature)

In [6]:
gg = len(set(genes['Feature']))
tt = len(set(trans['Feature']))
ee = len(set(exons['Feature']))
jj = len(set(juncs['Feature']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		387
Transcript:	475
Exon:		1511
Junction:	775


#### DE (EnsemblID)

In [7]:
gg = len(set(genes['ensemblID']))
tt = len(set(trans['ensemblID']))
ee = len(set(exons['ensemblID']))
jj = len(set(juncs['ensemblID']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		387
Transcript:	299
Exon:		273
Junction:	142


#### DE (Gene Symbol)

In [8]:
gg = len(set(genes['Symbol']))
tt = len(set(trans['Symbol']))
ee = len(set(exons['Symbol']))
jj = len(set(juncs['Symbol']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		340
Transcript:	290
Exon:		236
Junction:	142


### Feature effect size summary

In [9]:
feature_list = ['Genes', 'Transcript', 'Exons', 'Junctions']
feature_df = [genes, trans, exons, juncs]
for ii in range(4):
    ff = feature_df[ii]
    half = len(set(ff[(np.abs(ff['logFC']) >= 0.5)].Feature))
    one = len(set(ff[(np.abs(ff['logFC']) >= 1)].Feature))
    print("\nThere are %d unique %s with abs(log2FC) >= 0.5" % (half, feature_list[ii]))
    print("There are %d unique %s with abs(log2FC) >= 1" % (one, feature_list[ii]))


There are 80 unique Genes with abs(log2FC) >= 0.5
There are 41 unique Genes with abs(log2FC) >= 1

There are 242 unique Transcript with abs(log2FC) >= 0.5
There are 152 unique Transcript with abs(log2FC) >= 1

There are 647 unique Exons with abs(log2FC) >= 0.5
There are 411 unique Exons with abs(log2FC) >= 1

There are 374 unique Junctions with abs(log2FC) >= 0.5
There are 225 unique Junctions with abs(log2FC) >= 1


In [10]:
feature_list = ['Genes', 'Transcripts', 'Exons', 'Junctions']
feature_df = [genes, trans, exons, juncs]
for ii in range(4):
    ff = feature_df[ii]
    half = len(set(ff[(np.abs(ff['logFC']) >= 0.5)].ensemblID))
    one = len(set(ff[(np.abs(ff['logFC']) >= 1)].ensemblID))
    print("\nThere are %d unique %s with abs(log2FC) >= 0.5" % (half, feature_list[ii]))
    print("There are %d unique %s with abs(log2FC) >= 1" % (one, feature_list[ii]))


There are 80 unique Genes with abs(log2FC) >= 0.5
There are 41 unique Genes with abs(log2FC) >= 1

There are 125 unique Transcripts with abs(log2FC) >= 0.5
There are 72 unique Transcripts with abs(log2FC) >= 1

There are 81 unique Exons with abs(log2FC) >= 0.5
There are 41 unique Exons with abs(log2FC) >= 1

There are 45 unique Junctions with abs(log2FC) >= 0.5
There are 21 unique Junctions with abs(log2FC) >= 1


## Autosomal only

In [12]:
import functools
from gtfparse import read_gtf

In [13]:
@functools.lru_cache()
def get_gtf(gtf_file):
    return read_gtf(gtf_file)


In [14]:
def gene_annotation(gtf_file, feature):
    gtf0 = get_gtf(gtf_file)
    gtf = gtf0[gtf0["feature"] == feature]
    return gtf[["gene_id", "gene_name", "transcript_id", "exon_id",
                "gene_type", "seqname", "start", "end", "strand"]]

In [15]:
gtf_file = '/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf'

### Genes

In [16]:
gtf_annot = gene_annotation(gtf_file, 'gene')

genes = pd.read_csv('../../_m/genes/diffExpr_maleVfemale_full.txt', sep='\t', index_col=0)
genes = genes[(genes['adj.P.Val'] < 0.05)].sort_values('adj.P.Val')
genes['Feature'] = genes.index
genes = pd.merge(gtf_annot[['gene_id', 'seqname']], genes, left_on='gene_id', right_on='Feature', how='right')
genes.loc[:, 'seqname'] = genes.seqname.fillna('chr?')
genes.sort_values('adj.P.Val').to_csv('chrom_annotation_genes.txt', sep='\t', index=False)
genes = genes[(genes.seqname.str.contains('chr\d+')) | (genes['seqname'] == 'chr?')].copy().rename(columns={'seqname': 'chr'})
genes = genes[['Feature', 'chr', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']]
genes['Type'] = 'gene'
genes.head()

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_status', 'gene_name', 'level', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_status', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


Unnamed: 0,Feature,chr,Symbol,ensemblID,logFC,adj.P.Val,Type
51,ENSG00000205611.4,chr20,LINC01597,ENSG00000205611,1.294257,2.0986899999999998e-19,gene
53,ENSG00000283443.1,chr20,,ENSG00000283443,1.406877,3.751879e-18,gene
54,ENSG00000149531.15,chr20,FRG1BP,ENSG00000149531,0.669805,1.253717e-17,gene
56,ENSG00000095932.6,chr19,SMIM24,ENSG00000095932,-0.888752,7.19293e-16,gene
57,ENSG00000282826.1,chr20,FRG1CP,ENSG00000282826,0.553323,1.415353e-15,gene


In [17]:
genes[(genes.chr == 'chr?')]

Unnamed: 0,Feature,chr,Symbol,ensemblID,logFC,adj.P.Val,Type


### Annotate unknown by hand
There are none.

In [17]:
#genes = genes[~(genes['Symbol'].isin(['NLGN4Y', 'JPX', 'PCDH11X', 'GABRE']))]
genes.to_csv('autosomal_DEG.csv', index=False, header=True)
genes.shape

(225, 7)

In [42]:
genes.groupby('ensemblID').first().reset_index().shape

(304, 7)

### Transcripts

In [23]:
trans = pd.read_csv('../../_m/transcripts/diffExpr_maleVfemale_full.txt', sep='\t', index_col=0)
trans = trans[(trans['adj.P.Val'] < 0.05)].sort_values('adj.P.Val')
trans.loc[:, 'Feature'] = trans.index
trans.loc[:, 'ensemblID'] = trans.gene_id.str.replace('\\.\d+', '', regex=True)
trans = trans[['Feature', 'chr', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']]#.rename(columns={'gene_name': 'Symbol'})
trans['Type'] = 'transcript'
trans.head()

Unnamed: 0,Feature,chr,Symbol,ensemblID,logFC,adj.P.Val,Type
41,ENST00000550058.1,chr12,METTL25,ENSG00000127720,3.903413,1.210374e-125,transcript
71,ENST00000609745.1,chr20,SDCBP2-AS1,ENSG00000234684,-0.974359,9.827801e-70,transcript
114,ENST00000474345.5,chr1,FDPS,ENSG00000160752,2.331254,2.885128e-32,transcript
129,ENST00000551722.1,chr12,METTL25,ENSG00000127720,0.762351,2.630845e-22,transcript
132,ENST00000414784.1,chr2,AC012442.5,ENSG00000243389,0.706171,1.2414959999999999e-20,transcript


In [24]:
trans[(trans.chr == 'chr?')]

Unnamed: 0,Feature,chr,Symbol,ensemblID,logFC,adj.P.Val,Type


### Annotate unknown by hand
There are none.

In [27]:
#trans = trans[~(trans['Symbol'].isin(['NLGN4Y']))]
trans.to_csv('transcripts_autosomal_DE.csv', index=False, header=True)
trans.shape

(195, 7)

In [41]:
trans.groupby('ensemblID').first().reset_index().shape

(187, 7)

### Exons

In [28]:
gtf_annot = gene_annotation(gtf_file, 'exon')
gtf_annot['ensemblID'] = gtf_annot.gene_id.str.replace('\\.\d+', '', regex=True)

exons = pd.read_csv('../../_m/exons/diffExpr_maleVfemale_full.txt', sep='\t', index_col=0)
exons = exons[(exons['adj.P.Val'] < 0.05)].sort_values('adj.P.Val')
exons['Feature'] = exons.index
exons = pd.merge(gtf_annot[['ensemblID', 'seqname']], exons, on='ensemblID', how='right')
exons.loc[:, 'seqname'] = exons.seqname.fillna('chr?')
exons = exons[(exons.seqname.str.contains('chr\d+')) | (exons['seqname'] == 'chr?')].copy().rename(columns={'seqname': 'chr'})
exons = exons[['Feature', 'chr', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']].groupby('Feature').first().reset_index()
exons['Type'] = 'exon'
exons.head()

Unnamed: 0,Feature,chr,Symbol,ensemblID,logFC,adj.P.Val,Type
0,e1011449,chr19,SMIM24,ENSG00000095932,-0.891594,1.400739e-14,exon
1,e1011451,chr19,SMIM24,ENSG00000095932,-0.888166,2.409669e-14,exon
2,e1011454,chr19,SMIM24,ENSG00000095932,-0.877536,4.588435e-14,exon
3,e1013243,chr19,PLIN5,ENSG00000214456,-0.462121,0.01014225,exon
4,e1013248,chr19,PLIN5,ENSG00000214456,-0.290861,0.0005410353,exon


In [29]:
exons[(exons['chr'] == 'chr?')].groupby('ensemblID').first().reset_index()

Unnamed: 0,ensemblID,Feature,chr,Symbol,logFC,adj.P.Val,Type


### Annotate unknown by hand
There are none.

In [30]:
#exons = exons[~(exons['ensemblID'].isin(['ENSG00000269941']))]
exons.to_csv('exons_autosomal_DE.csv', index=False, header=True)
exons.shape

(520, 7)

In [40]:
exons.groupby('ensemblID').first().reset_index().shape

(195, 7)

### Junctions

In [32]:
juncs = pd.read_csv('../../_m/junctions/diffExpr_maleVfemale_full.txt', sep='\t', index_col=0)
juncs = juncs[(juncs['adj.P.Val'] < 0.05)].sort_values('adj.P.Val')
juncs['Feature'] = juncs.index
juncs = pd.merge(gtf_annot[['ensemblID', 'seqname']], juncs, on='ensemblID', how='right')
juncs.loc[:, 'seqname'] = juncs.seqname.fillna('chr?')
juncs = juncs[(juncs.seqname.str.contains('chr\d+')) | (juncs['seqname'] == 'chr?')].copy().rename(columns={'seqname': 'chr'})
juncs = juncs[['Feature', 'chr', 'Symbol', 'ensemblID', 'logFC', 'adj.P.Val']].groupby('Feature').first().reset_index()
juncs['Type'] = 'junction'
juncs.head()

Unnamed: 0,Feature,chr,Symbol,ensemblID,logFC,adj.P.Val,Type
0,chr10:11314271-11320856(+),chr10,CELF2,ENSG00000048740,0.528804,0.0254,junction
1,chr10:46911502-46943917(+),chr10,PTPN20,ENSG00000204179,0.609018,0.006528,junction
2,chr10:46946676-46999911(+),chr10,PTPN20,ENSG00000204179,0.644872,3.5e-05,junction
3,chr10:60106060-60108829(-),chr10,ANK3,ENSG00000151150,0.131654,0.029445,junction
4,chr10:60264021-60270130(-),chr10,ANK3,ENSG00000151150,0.1106,0.024851,junction


In [33]:
juncs[(juncs['chr'] == 'chr?')].groupby('ensemblID').first()

Unnamed: 0_level_0,Feature,chr,Symbol,logFC,adj.P.Val,Type
ensemblID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


### Annotate unknown by hand
None unknown

In [34]:
juncs.to_csv('junctions_autosomal_DE.csv', index=False, header=True)
juncs.shape

(239, 7)

In [36]:
juncs.groupby('ensemblID').first().reset_index().shape

(89, 7)

## DE summary

### DE (feature)

In [37]:
gg = len(set(genes['Feature']))
tt = len(set(trans['Feature']))
ee = len(set(exons['Feature']))
jj = len(set(juncs['Feature']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		304
Transcript:	195
Exon:		520
Junction:	239


#### DE (EnsemblID)

In [39]:
gg = len(set(genes.groupby('ensemblID').first().reset_index()['ensemblID']))
tt = len(set(trans.groupby('ensemblID').first().reset_index()['ensemblID']))
ee = len(set(exons.groupby('ensemblID').first().reset_index()['ensemblID']))
jj = len(set(juncs.groupby('ensemblID').first().reset_index()['ensemblID']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		304
Transcript:	187
Exon:		195
Junction:	89


#### DE (Gene Symbol)

In [43]:
gg = len(set(genes.groupby('Symbol').first().reset_index()['Symbol']))
tt = len(set(trans.groupby('Symbol').first().reset_index()['Symbol']))
ee = len(set(exons.groupby('Symbol').first().reset_index()['Symbol']))
jj = len(set(juncs.groupby('Symbol').first().reset_index()['Symbol']))

print("\nGene:\t\t%d\nTranscript:\t%d\nExon:\t\t%d\nJunction:\t%d" % (gg, tt, ee, jj)) 


Gene:		263
Transcript:	187
Exon:		170
Junction:	89


### Feature effect size summary

In [29]:
feature_list = ['Genes', 'Transcript', 'Exons', 'Junctions']
feature_df = [genes, trans, exons, juncs]
for ii in range(4):
    ff = feature_df[ii]
    half = len(set(ff[(np.abs(ff['logFC']) >= 0.5)].Feature))
    one = len(set(ff[(np.abs(ff['logFC']) >= 1)].Feature))
    print("\nThere are %d unique %s with abs(log2FC) >= 0.5" % (half, feature_list[ii]))
    print("There are %d unique %s with abs(log2FC) >= 1" % (one, feature_list[ii]))


There are 29 unique Genes with abs(log2FC) >= 0.5
There are 5 unique Genes with abs(log2FC) >= 1

There are 70 unique Transcript with abs(log2FC) >= 0.5
There are 29 unique Transcript with abs(log2FC) >= 1

There are 84 unique Exons with abs(log2FC) >= 0.5
There are 6 unique Exons with abs(log2FC) >= 1

There are 103 unique Junctions with abs(log2FC) >= 0.5
There are 46 unique Junctions with abs(log2FC) >= 1


In [30]:
feature_list = ['Genes', 'Transcripts', 'Exons', 'Junctions']
feature_df = [genes, trans, exons, juncs]
for ii in range(4):
    ff = feature_df[ii]
    half = len(set(ff[(np.abs(ff['logFC']) >= 0.5)].ensemblID))
    one = len(set(ff[(np.abs(ff['logFC']) >= 1)].ensemblID))
    print("\nThere are %d unique %s with abs(log2FC) >= 0.5" % (half, feature_list[ii]))
    print("There are %d unique %s with abs(log2FC) >= 1" % (one, feature_list[ii]))


There are 29 unique Genes with abs(log2FC) >= 0.5
There are 5 unique Genes with abs(log2FC) >= 1

There are 66 unique Transcripts with abs(log2FC) >= 0.5
There are 28 unique Transcripts with abs(log2FC) >= 1

There are 25 unique Exons with abs(log2FC) >= 0.5
There are 4 unique Exons with abs(log2FC) >= 1

There are 13 unique Junctions with abs(log2FC) >= 0.5
There are 2 unique Junctions with abs(log2FC) >= 1
