# Feature summary analysis

In [1]:
import numpy as np
import pandas as pd

## Load differential expression results

### MSSM Penn Pitt

In [2]:
# Read the MSSM-Penn-Pitt results table (tab-separated), keep only rows whose
# adjusted p-value is below 0.05, and order them from most to least significant.
mpp = (
    pd.read_csv('../../_m/mssm_penn_pitt_maleVfemale.tsv', sep='\t')
    .loc[lambda res: res['adj.P.Val'] < 0.05]
    .sort_values(by='adj.P.Val')
)
mpp.head()

Unnamed: 0,Geneid,logFC,AveExpr,t,P.Value,adj.P.Val,B,Coef,Symbol,Entrez,Chrom
0,ENSG00000241859.7,7.426322,0.21193,106.905258,0.0,0.0,794.024282,Reported_GenderMale,ANOS2P,,Y
1,ENSG00000206159.11,6.829826,-0.45465,101.687435,0.0,0.0,769.944857,Reported_GenderMale,GYG2P1,,Y
2,ENSG00000183878.15,8.566599,2.821199,96.624132,0.0,0.0,747.659203,Reported_GenderMale,UTY,7404.0,Y
3,ENSG00000099725.14,7.234079,0.09407,96.796064,0.0,0.0,747.08927,Reported_GenderMale,PRKY,,Y
4,ENSG00000215580.11,6.602361,-0.667862,95.449115,0.0,0.0,739.995713,Reported_GenderMale,BCORP1,,Y


### NIMH HBCC

In [3]:
# Read the NIMH HBCC results table (tab-separated), keep only rows whose
# adjusted p-value is below 0.05, and order them from most to least significant.
hbcc = (
    pd.read_csv('../../_m/nimh_hbcc_maleVfemale.tsv', sep='\t')
    .loc[lambda res: res['adj.P.Val'] < 0.05]
    .sort_values(by='adj.P.Val')
)
hbcc.head()

Unnamed: 0,Geneid,logFC,AveExpr,t,P.Value,adj.P.Val,B,Coef,Symbol,Entrez,Chrom
0,ENSG00000229807.11,-11.504527,1.451875,-158.907479,9.677013000000001e-244,1.8493739999999998e-239,541.117577,Reported_GenderMale,XIST,,X
1,ENSG00000241859.7,8.165803,0.195418,98.530405,4.690014e-195,4.4815430000000004e-191,425.604156,Reported_GenderMale,ANOS2P,,Y
2,ENSG00000206159.11,7.594584,-0.185954,92.012611,3.7274810000000005e-188,2.37453e-184,411.071479,Reported_GenderMale,GYG2P1,,Y
4,ENSG00000215580.11,6.980341,-0.607569,89.120159,6.068670999999999e-185,2.8994590000000004e-181,404.120608,Reported_GenderMale,BCORP1,,Y
3,ENSG00000067646.12,9.872471,2.014148,88.829183,1.293023e-184,4.942193e-181,404.158301,Reported_GenderMale,ZFY,7544.0,Y


## DE summary

### DE (feature)

In [5]:
# Number of distinct gene IDs among the significant rows of each cohort.
gg1, gg2 = (len(set(frame['Geneid'])) for frame in (mpp, hbcc))

print("Gene MPP:\t%d\nGene HBCC:\t%d" % (gg1, gg2))

Gene MPP:		482
Gene HBCC:		148


### Feature effect size summary

In [8]:
# Labels are looked up by position; only the first two are used here because
# feature_df holds two frames.
feature_list = ['Genes: MPP', 'Genes: HBCC', 'Exons', 'Junctions']
feature_df = [mpp, hbcc]

# For each cohort, count distinct gene IDs whose absolute logFC reaches
# each cutoff (0.5 and 1).
for ii, ff in enumerate(feature_df):
    abs_lfc = ff['logFC'].abs()
    half = len(set(ff.loc[abs_lfc >= 0.5, 'Geneid']))
    one = len(set(ff.loc[abs_lfc >= 1, 'Geneid']))
    print("\nThere are %d unique %s with abs(log2FC) >= 0.5" % (half, feature_list[ii]))
    print("There are %d unique %s with abs(log2FC) >= 1" % (one, feature_list[ii]))


There are 36 unique Genes: MPP with abs(log2FC) >= 0.5
There are 25 unique Genes: MPP with abs(log2FC) >= 1

There are 40 unique Genes: HBCC with abs(log2FC) >= 0.5
There are 27 unique Genes: HBCC with abs(log2FC) >= 1


## Autosomal only

In [11]:
# Build the MPP table restricted to rows whose chromosome label contains a digit.
#
# Missing chromosome labels are replaced with '?' through an explicit
# reassignment. The previous in-place fill on the column accessor is a chained
# assignment: pandas warns about it, and with copy-on-write enabled it leaves
# the frame untouched, so NaN would reach the boolean mask below and raise.
mpp = mpp.assign(Chrom=mpp['Chrom'].fillna('?'))

# Raw string so that \d is a regex token rather than an invalid string escape.
# NOTE(review): this matches any label that contains a digit, not only labels
# that are purely numeric -- confirm Chrom holds no digit-bearing non-autosomal
# names (e.g. scaffolds).
has_digit = mpp['Chrom'].str.contains(r'\d+')

# Select the reporting columns (same order as before) and rename the ID column.
auto1 = (
    mpp.loc[has_digit, ['Geneid', 'Chrom', 'Symbol', 'logFC', 'adj.P.Val']]
    .rename(columns={'Geneid': 'gene_id'})
)
print(auto1.shape)
auto1.head()

(418, 5)


Unnamed: 0,gene_id,Chrom,Symbol,logFC,adj.P.Val
33,ENSG00000205611.4,20,LINC01597,0.977532,3.6259630000000002e-25
35,ENSG00000255346.10,15,NOX5,0.964489,8.867594e-22
38,ENSG00000283443.1,20,AC018688.1,0.879612,8.23459e-18
39,ENSG00000258484.4,15,SPESP1,0.70139,3.003091e-17
41,ENSG00000149531.15,20,FRG1BP,0.528819,6.561622e-13


In [7]:
# Write the MPP autosomal table to CSV, ordered by adjusted p-value,
# with a header row and without the index column.
(
    auto1
    .sort_values(by='adj.P.Val')
    .to_csv('autosomal_DEG_mpp.csv', header=True, index=False)
)

In [13]:
# Build the HBCC table restricted to rows whose chromosome label contains a digit.
#
# Missing chromosome labels are replaced with '?' through an explicit
# reassignment. The previous in-place fill on the column accessor is a chained
# assignment: pandas warns about it, and with copy-on-write enabled it leaves
# the frame untouched, so NaN would reach the boolean mask below and raise.
hbcc = hbcc.assign(Chrom=hbcc['Chrom'].fillna('?'))

# Raw string so that \d is a regex token rather than an invalid string escape.
# NOTE(review): this matches any label that contains a digit, not only labels
# that are purely numeric -- confirm Chrom holds no digit-bearing non-autosomal
# names (e.g. scaffolds).
has_digit = hbcc['Chrom'].str.contains(r'\d+')

# Select the reporting columns (same order as before) and rename the ID column.
auto2 = (
    hbcc.loc[has_digit, ['Geneid', 'Chrom', 'Symbol', 'logFC', 'adj.P.Val']]
    .rename(columns={'Geneid': 'gene_id'})
)
print(auto2.shape)
auto2.head()

(98, 5)


Unnamed: 0,gene_id,Chrom,Symbol,logFC,adj.P.Val
35,ENSG00000095932.6,19,SMIM24,-0.813149,9.014157e-15
37,ENSG00000149531.15,20,FRG1BP,0.739522,6.859297e-12
41,ENSG00000283443.1,20,AC018688.1,1.149979,2.357916e-09
42,ENSG00000205611.4,20,LINC01597,1.015293,8.239587e-09
43,ENSG00000258484.4,15,SPESP1,0.776058,1.221874e-08


In [7]:
# Write the HBCC autosomal table to CSV, ordered by adjusted p-value,
# with a header row and without the index column.
(
    auto2
    .sort_values(by='adj.P.Val')
    .to_csv('autosomal_DEG_hbcc.csv', header=True, index=False)
)

## DE summary (autosomal only)

### DE (feature)

In [15]:
# Number of distinct gene IDs in each cohort's autosomal table.
gg1, gg2 = (len(set(frame['gene_id'])) for frame in (auto1, auto2))

print("Gene MPP:\t%d\nGene HBCC:\t%d" % (gg1, gg2))

Gene MPP:	418
Gene HBCC:	98


### Feature effect size summary

In [17]:
# Labels are looked up by position; only the first two are used here because
# feature_df holds two frames.
feature_list = ['Genes: MPP', 'Genes: HBCC', 'Exons', 'Junctions']
feature_df = [auto1, auto2]

# For each cohort's autosomal table, count distinct gene IDs whose absolute
# logFC reaches each cutoff (0.5 and 1).
for ii, ff in enumerate(feature_df):
    abs_lfc = ff['logFC'].abs()
    half = len(set(ff.loc[abs_lfc >= 0.5, 'gene_id']))
    one = len(set(ff.loc[abs_lfc >= 1, 'gene_id']))
    print("\nThere are %d unique %s with abs(log2FC) >= 0.5" % (half, feature_list[ii]))
    print("There are %d unique %s with abs(log2FC) >= 1" % (one, feature_list[ii]))


There are 8 unique Genes: MPP with abs(log2FC) >= 0.5
There are 0 unique Genes: MPP with abs(log2FC) >= 1

There are 12 unique Genes: HBCC with abs(log2FC) >= 0.5
There are 2 unique Genes: HBCC with abs(log2FC) >= 1
