# Feature summary analysis

In [1]:
import numpy as np
import pandas as pd

## Summary plots

### Genes

In [2]:
# Read the male-vs-female differential expression table (tab-separated),
# keep rows with adjusted p-value below 0.05, and order them from most to
# least significant. The preview of the first rows is the cell output.
genes = (
    pd.read_csv('../../_m/genes/diffExpr_maleVfemale_full.txt', sep='\t')
    .loc[lambda df: df['adj.P.Val'] < 0.05]
    .sort_values(by='adj.P.Val')
)
genes.head()

Unnamed: 0,gene_id,logFC,AveExpr,t,P.Value,adj.P.Val,z.std,ensembl_gene_id,position,Direction,hgnc_symbol,percentage_gene_gc_content,gene_biotype,chromosome_name
0,ENSG00000229236.1,5.437461,-2.069655,73.878955,9.467245e-258,1.118736e-253,34.294533,ENSG00000229236,1,UP,TTTY10,38.47,lncRNA,Y
1,ENSG00000260197.1,6.467829,-1.391681,77.893569,1.085256e-257,1.118736e-253,34.290554,ENSG00000260197,1,UP,,32.33,lncRNA,Y
2,ENSG00000169953.11,6.116855,-1.634066,66.804279,2.2690959999999997e-240,1.5593979999999998e-236,33.108143,ENSG00000169953,11,UP,HSFY2,39.22,protein_coding,Y
3,ENSG00000067646.11,8.409788,1.570265,62.284416,1.991596e-239,1.026518e-235,33.04253,ENSG00000067646,11,UP,ZFY,37.27,protein_coding,Y
4,ENSG00000067048.16,8.666711,2.702921,61.982116,1.756814e-238,7.244047999999999e-235,32.976635,ENSG00000067048,16,UP,DDX3Y,37.7,protein_coding,Y


In [3]:
# Write the filtered genes, ordered by adjusted p-value, to CSV
# (header row included, DataFrame index omitted).
(
    genes
    .sort_values(by='adj.P.Val')
    .to_csv('chromosome_DEG.csv', header=True, index=False)
)

## DE summary

### DE (feature)

In [4]:
# Count the distinct gene IDs in `genes` and report the total.
unique_gene_ids = set(genes['gene_id'])
gg = len(unique_gene_ids)

print("\nGene:\t\t%d" % gg)


Gene:		144


### Feature effect size summary

In [5]:
# Labels for the feature types; only the first entry is used here because
# `feature_df` holds a single table (genes).
feature_list = ['Genes', 'Transcript', 'Exons', 'Junctions']
feature_df = [genes]
ii = 0

ff = feature_df[ii]
label = feature_list[ii]

# Absolute log fold change, computed once and reused for both thresholds.
abs_lfc = ff['logFC'].abs()

# Number of distinct gene IDs at or above each effect-size threshold.
half = len(set(ff.loc[abs_lfc >= 0.5, 'gene_id']))
one = len(set(ff.loc[abs_lfc >= 1, 'gene_id']))

print("\nThere are %d unique %s with abs(log2FC) >= 0.5" % (half, label))
print("There are %d unique %s with abs(log2FC) >= 1" % (one, label))


There are 38 unique Genes with abs(log2FC) >= 0.5
There are 22 unique Genes with abs(log2FC) >= 1


## Autosomal only

In [6]:
# Replace missing chromosome annotations with a '?' placeholder so the string
# match below yields a clean boolean mask. The column is reassigned instead of
# calling `fillna(..., inplace=True)` on `genes.chromosome_name`: that chained
# in-place form warns on pandas 2.x and leaves `genes` unmodified under
# Copy-on-Write.
genes = genes.assign(chromosome_name=genes['chromosome_name'].fillna('?'))

# Autosomes are the chromosomes whose name is purely numeric. The pattern is a
# raw string (the bare '\d' escape is deprecated in normal string literals) and
# is anchored at both ends, so names that merely contain a digit -- scaffolds,
# or sex-chromosome patches -- are not mistaken for autosomes.
is_autosome = genes['chromosome_name'].str.contains(r'^\d+$')

# Keep only the columns needed downstream, with shorter column names.
auto = (
    genes
    .loc[is_autosome, ['gene_id', 'chromosome_name', 'hgnc_symbol', 'logFC', 'adj.P.Val']]
    .rename(columns={'chromosome_name': 'chr', 'hgnc_symbol': 'gene_name'})
)
auto.head()

Unnamed: 0,gene_id,chr,gene_name,logFC,adj.P.Val
42,ENSG00000255346.9,15,NOX5,0.789715,3.651864e-15
43,ENSG00000205611.4,20,LINC01597,0.784891,4.747301e-15
44,ENSG00000258484.3,15,SPESP1,0.676286,7.799322e-14
45,ENSG00000261600.1,2,,-0.80655,1.521189e-12
47,ENSG00000237268.2,7,,-1.01553,1.158998e-11


In [7]:
# Write the autosomal genes, ordered by adjusted p-value, to CSV
# (header row included, DataFrame index omitted).
(
    auto
    .sort_values(by='adj.P.Val')
    .to_csv('autosomal_DEG.csv', header=True, index=False)
)

## DE summary (autosomal only)

### DE (feature)

In [8]:
# Count the distinct gene IDs in `auto` and report the total.
unique_gene_ids = set(auto['gene_id'])
gg = len(unique_gene_ids)

print("\nGene:\t\t%d" % gg)


Gene:		78


### Feature effect size summary (autosomal only)

In [9]:
# Labels for the feature types; only the first entry is used here because
# `feature_df` holds a single table (the autosomal genes).
feature_list = ['Genes', 'Transcript', 'Exons', 'Junctions']
feature_df = [auto]
ii = 0

ff = feature_df[ii]
label = feature_list[ii]

# Absolute log fold change, computed once and reused for both thresholds.
abs_lfc = ff['logFC'].abs()

# Number of distinct gene IDs at or above each effect-size threshold.
half = len(set(ff.loc[abs_lfc >= 0.5, 'gene_id']))
one = len(set(ff.loc[abs_lfc >= 1, 'gene_id']))

print("\nThere are %d unique %s with abs(log2FC) >= 0.5" % (half, label))
print("There are %d unique %s with abs(log2FC) >= 1" % (one, label))


There are 10 unique Genes with abs(log2FC) >= 0.5
There are 1 unique Genes with abs(log2FC) >= 1
