# Feature summary of differential expression analysis

In [None]:
import numpy as np
import pandas as pd

In [None]:
def annotate_DE(feature):
    """Load the significant male-vs-female DE results for one feature type.

    Reads ``../../_m/<feature.lower()>s/diffExpr_maleVfemale_full.txt``
    (tab-separated, first column used as the index), keeps rows with
    ``adj.P.Val < 0.05`` and orders them by ascending ``adj.P.Val``.

    Parameters
    ----------
    feature : str
        Feature label such as ``"Gene"``. Lower-cased with an ``"s"`` suffix
        it names the input directory; it is stored verbatim in ``Type``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` (copy of the index), ``Symbol``, ``ensemblID``
        (``gencodeID`` with everything from the first ``.`` removed),
        ``logFC``, ``SE``, ``adj.P.Val`` and ``Type``.
    """
    path = f'../../_m/{feature.lower()}s/diffExpr_maleVfemale_full.txt'
    # rename() ignores keys that are absent, so whichever of these source
    # column names is present is mapped onto the common schema.
    df = pd.read_csv(path, sep='\t', index_col=0).rename(
        columns={"gene_id": "gencodeID", "gencodeGeneID": "gencodeID",
                 "gene_name": "Symbol"})
    df = df[df['adj.P.Val'] < 0.05].sort_values('adj.P.Val')
    df['Feature'] = df.index
    # Strip the version suffix (first '.' onwards) from the GENCODE ID.
    df['ensemblID'] = df['gencodeID'].str.replace(r"\..*", "", regex=True)
    df['Type'] = feature
    return df[['Feature', 'Symbol', 'ensemblID',
               'logFC', 'SE', 'adj.P.Val', "Type"]]

## Load DE results

### Genes

In [None]:
# Load the DE genes passing adj.P.Val < 0.05 and preview the first two rows.
genes = annotate_DE("Gene")
genes.head(2)

### Transcripts

In [None]:
# Load the DE transcripts passing adj.P.Val < 0.05 and preview the first two rows.
trans = annotate_DE("Transcript")
trans.head(2)

### Exons

In [None]:
# Load the DE exons passing adj.P.Val < 0.05 and preview the first two rows.
exons = annotate_DE("Exon")
exons.head(2)

### Junctions

In [None]:
# Load the DE junctions passing adj.P.Val < 0.05 and preview the first two rows.
juncs = annotate_DE("Junction")
juncs.head(2)

## DE summary

### DE (feature)

In [None]:
# Number of distinct feature identifiers among the DE results of each type.
gg, tt, ee, jj = (len(set(frame['Feature']))
                  for frame in (genes, trans, exons, juncs))

print(f"\nGene:\t\t{gg}\nTranscript:\t{tt}\nExon:\t\t{ee}\nJunction:\t{jj}")

#### DE (EnsemblID)

In [None]:
# Number of distinct ensemblID values among the DE results of each type.
# len(set(...)) is kept on purpose: unlike nunique() it does not drop NaN.
gg, tt, ee, jj = (len(set(frame['ensemblID']))
                  for frame in (genes, trans, exons, juncs))

print(f"\nGene:\t\t{gg}\nTranscript:\t{tt}\nExon:\t\t{ee}\nJunction:\t{jj}")

#### DE (Gene Symbol)

In [None]:
# Number of distinct Symbol values among the DE results of each type.
# len(set(...)) is kept on purpose: unlike nunique() it does not drop NaN.
gg, tt, ee, jj = (len(set(frame['Symbol']))
                  for frame in (genes, trans, exons, juncs))

print(f"\nGene:\t\t{gg}\nTranscript:\t{tt}\nExon:\t\t{ee}\nJunction:\t{jj}")

### Feature effect size summary

In [None]:
# Count distinct DE features per type at two effect-size cut-offs.
# The label is 'Transcripts' (plural), matching the other labels and the
# sibling ensemblID summary cell.
feature_list = ['Genes', 'Transcripts', 'Exons', 'Junctions']
feature_df = [genes, trans, exons, juncs]
for label, ff in zip(feature_list, feature_df):
    abs_lfc = ff['logFC'].abs()
    half = len(set(ff.loc[abs_lfc >= 0.5, 'Feature']))
    one = len(set(ff.loc[abs_lfc >= 1, 'Feature']))
    print(f"\nThere are {half} unique {label} with abs(log2FC) >= 0.5")
    print(f"There are {one} unique {label} with abs(log2FC) >= 1")

In [None]:
# Per feature type, count the distinct ensemblID values whose rows reach
# each effect-size cut-off (|logFC| >= 0.5 and |logFC| >= 1).
feature_list = ['Genes', 'Transcripts', 'Exons', 'Junctions']
feature_df = [genes, trans, exons, juncs]
for ii, ff in enumerate(feature_df):
    magnitude = ff['logFC'].abs()
    half = len(set(ff.loc[magnitude >= 0.5, 'ensemblID']))
    one = len(set(ff.loc[magnitude >= 1, 'ensemblID']))
    print(f"\nThere are {half} unique {feature_list[ii]} with abs(log2FC) >= 0.5")
    print(f"There are {one} unique {feature_list[ii]} with abs(log2FC) >= 1")

## Autosomal only

In [None]:
from pyhere import here
from functools import lru_cache

In [None]:
@lru_cache()
def get_annotation(feature):
    """Read the DLPFC annotation table for one feature type, memoised.

    Parameters
    ----------
    feature : str
        One of ``"gene"``, ``"transcript"``, ``"exon"`` or ``"junction"``;
        any other value raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The tab-separated ``<prefix>_annotation.txt`` file. Because of
        ``lru_cache`` repeated calls return the same object, so callers
        should not modify it in place.
    """
    # Map the feature name to the file-name prefix used on disk.
    prefix = dict(gene="gene", transcript="tx",
                  exon="exon", junction="jxn")[feature]
    annotation_path = here(
        f"input/counts/text_files_counts/_m/dlpfc/{prefix}_annotation.txt")
    return pd.read_csv(annotation_path, sep='\t')

In [None]:
def annotate_autosomes(feature):
    """Annotate significant DE results with coordinates and keep ``chr<digits>`` rows.

    Reads ``../../_m/<feature.lower()>s/diffExpr_maleVfemale_full.txt``, keeps
    rows with ``adj.P.Val < 0.05``, and inner-merges them with the table from
    ``get_annotation`` on the ``name`` column (the DE table's index).

    Side effects (both files are written to the current working directory):

    * ``chrom_annotation_<feature.lower()>.txt`` -- tab-separated, all merged
      rows sorted by ``adj.P.Val``.
    * ``<feature.lower()>_autosomal_DE.csv`` -- only rows whose ``seqnames``
      contains ``chr`` followed by at least one digit.

    Parameters
    ----------
    feature : str
        Feature label such as ``"Gene"``; stored verbatim in ``Type``.

    Returns
    -------
    pandas.DataFrame
        The ``chr<digits>`` rows with columns ``Feature``, ``seqnames``,
        ``Symbol``, ``ensemblID``, ``logFC``, ``SE``, ``adj.P.Val``, ``Type``.
    """
    label = feature.lower()
    # Get annotation
    annot = get_annotation(label)
    # Annotate DE results
    df = pd.read_csv(f'../../_m/{label}s/diffExpr_maleVfemale_full.txt',
                     sep='\t', index_col=0)\
           .rename(columns={"gene_id": "gencodeID", "gencodeGeneID": "gencodeID",
                            "gene_name": "Symbol"})
    df = df[df['adj.P.Val'] < 0.05].sort_values('adj.P.Val')
    df['name'] = df.index
    df['ensemblID'] = df['gencodeID'].str.replace(r"\..*", "", regex=True)
    df = annot.merge(df, on='name').rename(columns={"name": "Feature"})
    columns = ["Feature", "seqnames", "start", "end", "width", "gencodeID",
               "ensemblID", "Symbol", "logFC", "AveExpr", "t", "P.Value",
               "adj.P.Val", "B", "SE"]
    # assign() returns a new frame, so the constant columns are never written
    # into a column-subset slice (which can raise SettingWithCopyWarning).
    df = df[columns].assign(Type=feature, Region="DLPFC")
    # Save annotated file
    df.sort_values('adj.P.Val').to_csv(f'chrom_annotation_{label}.txt',
                                       sep='\t', index=False)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # NOTE(review): the pattern is unanchored, so any seqname containing
    # chr<digits> (e.g. with a suffix) is kept -- confirm this is intended.
    df = df[df['seqnames'].str.contains(r'chr\d+')].copy()
    # Save autosomal DE features
    df.to_csv(f'{label}_autosomal_DE.csv', index=False, header=True)
    return df[['Feature', 'seqnames', 'Symbol', 'ensemblID',
               'logFC', 'SE', 'adj.P.Val', "Type"]]

### Genes

In [None]:
# Rebuild `genes` from the coordinate-annotated DE table restricted to
# chr<digits> seqnames (this rebinds the earlier all-chromosome `genes`).
feature = "Gene"
genes = annotate_autosomes(feature)
genes.head(2)

In [None]:
# (rows, columns) of the autosomal DE gene table.
genes.shape

In [None]:
# Collapse to one row per ensemblID (first non-null value of each column per
# group); the first element of the shape is the number of distinct IDs.
genes.groupby('ensemblID').first().reset_index().shape

### Transcripts

In [None]:
# Rebuild `trans` from the coordinate-annotated DE table restricted to
# chr<digits> seqnames.
trans = annotate_autosomes("Transcript")
# NOTE(review): presumably run as a notebook cell, where only the last
# expression is rendered -- head(2) would then show nothing. TODO confirm.
trans.head(2)
trans.shape

In [None]:
# Collapse to one row per ensemblID; shape[0] is the number of distinct IDs.
trans.groupby('ensemblID').first().reset_index().shape

### Exons

In [None]:
# Rebuild `exons` from the coordinate-annotated DE table restricted to
# chr<digits> seqnames.
exons = annotate_autosomes("Exon")
# NOTE(review): presumably run as a notebook cell, where only the last
# expression is rendered -- head(2) would then show nothing. TODO confirm.
exons.head(2)
exons.shape

In [None]:
# Collapse to one row per ensemblID; shape[0] is the number of distinct IDs.
exons.groupby('ensemblID').first().reset_index().shape

### Junctions

In [None]:
# Rebuild `juncs` from the coordinate-annotated DE table restricted to
# chr<digits> seqnames.
juncs = annotate_autosomes("Junction")
# NOTE(review): presumably run as a notebook cell, where only the last
# expression is rendered -- head(2) would then show nothing. TODO confirm.
juncs.head(2)
juncs.shape

In [None]:
# Collapse to one row per ensemblID; shape[0] is the number of distinct IDs.
juncs.groupby('ensemblID').first().reset_index().shape

## DE summary (autosomal only)

### DE (feature)

In [None]:
# Number of distinct feature identifiers in each autosomal DE table.
gg, tt, ee, jj = (len(set(frame['Feature']))
                  for frame in (genes, trans, exons, juncs))

print(f"\nGene:\t\t{gg}\nTranscript:\t{tt}\nExon:\t\t{ee}\nJunction:\t{jj}")

#### DE (EnsemblID)

In [None]:
# Number of distinct non-null ensemblID values in each autosomal DE table.
# nunique() gives the same count as the former groupby/first/reset_index
# round-trip (both ignore NaN keys) without building an intermediate frame.
gg = genes['ensemblID'].nunique()
tt = trans['ensemblID'].nunique()
ee = exons['ensemblID'].nunique()
jj = juncs['ensemblID'].nunique()

print(f"\nGene:\t\t{gg}\nTranscript:\t{tt}\nExon:\t\t{ee}\nJunction:\t{jj}")

#### DE (Gene Symbol)

In [None]:
# Number of distinct non-null Symbol values in each autosomal DE table.
# nunique() gives the same count as the former groupby/first/reset_index
# round-trip (both ignore NaN keys) without building an intermediate frame.
gg = genes['Symbol'].nunique()
tt = trans['Symbol'].nunique()
ee = exons['Symbol'].nunique()
jj = juncs['Symbol'].nunique()

print(f"\nGene:\t\t{gg}\nTranscript:\t{tt}\nExon:\t\t{ee}\nJunction:\t{jj}")

### Feature effect size summary

In [None]:
# Count distinct autosomal DE features per type at two effect-size cut-offs.
# The label is 'Transcripts' (plural), matching the other labels and the
# sibling ensemblID summary cell.
feature_list = ['Genes', 'Transcripts', 'Exons', 'Junctions']
feature_df = [genes, trans, exons, juncs]
for label, ff in zip(feature_list, feature_df):
    abs_lfc = ff['logFC'].abs()
    half = len(set(ff.loc[abs_lfc >= 0.5, 'Feature']))
    one = len(set(ff.loc[abs_lfc >= 1, 'Feature']))
    print(f"\nThere are {half} unique {label} with abs(log2FC) >= 0.5")
    print(f"There are {one} unique {label} with abs(log2FC) >= 1")

In [None]:
# Per feature type, count the distinct ensemblID values whose autosomal rows
# reach each effect-size cut-off (|logFC| >= 0.5 and |logFC| >= 1).
feature_list = ['Genes', 'Transcripts', 'Exons', 'Junctions']
feature_df = [genes, trans, exons, juncs]
for ii, ff in enumerate(feature_df):
    magnitude = ff['logFC'].abs()
    half = len(set(ff.loc[magnitude >= 0.5, 'ensemblID']))
    one = len(set(ff.loc[magnitude >= 1, 'ensemblID']))
    print(f"\nThere are {half} unique {feature_list[ii]} with abs(log2FC) >= 0.5")
    print(f"There are {one} unique {feature_list[ii]} with abs(log2FC) >= 1")

## Session information

In [None]:
# Presumably prints the versions of the loaded packages for reproducibility
# (behaviour of session_info.show() is not visible here -- verify).
import session_info
session_info.show()