# Detect bimodal distribution for DEGs by ethnicity

In [1]:
import functools
import numpy as np
import pandas as pd
from sklearn import mixture

In [2]:
config = {
    'deg_file': '../../_m/genes/diffExpr_EAvsAA_full.txt',
    'res_file': '../../_m/genes/residualized_expression.tsv',
    'pheno_file': '/ceph/projects/v3_phase3_paper/inputs/phenotypes/_m/caudate_phenotypes.csv',
}

In [3]:
@functools.lru_cache()
def get_deg():
    ''' Take significant DE genes obtained from limma-voom pipeline.
    '''
    return pd.read_csv(config['deg_file'], sep='\t', index_col=0)


@functools.lru_cache()
def get_deg_sig(fdr=0.05):
    ''' Take significant DE genes obtained from limma-voom pipeline.
    '''
    dft = get_deg()
    return dft[(dft['adj.P.Val'] < fdr)]


@functools.lru_cache()
def get_residualized():
    '''Load residualization file.
    '''
    return pd.read_csv(config['res_file'], sep='\t', index_col=0).transpose()


@functools.lru_cache()
def get_deg_res_df():
    res_df = get_residualized()
    return res_df[np.array(get_deg().index)]


@functools.lru_cache()
def get_pheno_data():
    return pd.read_csv(config['pheno_file'], index_col=0)


@functools.lru_cache()
def get_pheno_res_df():
    return get_pheno_data().merge(get_deg_res_df(), left_index=True, right_index=True)


In [4]:
def gaussian_mixture_test(X, num=7):
    lowest_bic = np.infty
    bic = []
    n_components_range = range(1, num)
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components, 
                                      covariance_type='spherical')
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            best_n = n_components
            lowest_bic = bic[-1]
    return best_n

In [5]:
df = get_pheno_res_df()
aa = df[(df['Race'] == 'AA')].copy()
ea = df[(df['Race'] == 'CAUC')].copy()
modality_aa = []; modality_ea = [];

for gene in np.array(get_deg_sig().gencodeID):
    Xaa = np.array(aa[gene]).reshape(-1, 1)
    Xea = np.array(ea[gene]).reshape(-1, 1)
    modality_aa.append(gaussian_mixture_test(Xaa))
    modality_ea.append(gaussian_mixture_test(Xea))

In [8]:
dt = get_deg_sig()
dt.loc[:,'Modality_AA'] = modality_aa
dt.loc[:,'Modality_EA'] = modality_ea
dt.to_csv('degs_with_modality.txt', sep='\t')
dt.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,Length,gencodeID,ensemblID,gene_type,Symbol,EntrezID,Class,meanExprs,NumTx,logFC,AveExpr,t,P.Value,adj.P.Val,B,EAvsAA,Modality_AA,Modality_EA
ENSG00000272977.1,3754,ENSG00000272977.1,ENSG00000272977,sense_intronic,,,InGen,0.447451,1,1.745629,1.22822,15.761702,2.5316760000000003e-43,5.6833600000000006e-39,87.219521,1,2,1
ENSG00000182698.11,1140,ENSG00000182698.11,ENSG00000182698,protein_coding,RESP18,389075.0,InGen,2.821051,3,2.717543,1.408244,15.166363,7.186827e-41,8.066854e-37,81.794854,1,2,2
ENSG00000068654.15,13519,ENSG00000068654.15,ENSG00000068654,protein_coding,POLR1A,25885.0,InGen,2.2742,10,0.225524,5.947547,15.000623,3.433158e-40,2.2120359999999998e-36,80.500583,1,1,1
ENSG00000233913.7,645,ENSG00000233913.7,ENSG00000233913,processed_pseudogene,,,InGen,11.503736,1,-2.229455,3.059692,-14.985969,3.941442e-40,2.2120359999999998e-36,80.423603,-1,2,2
ENSG00000204894.4,90,ENSG00000204894.4,ENSG00000204894,processed_pseudogene,,,InGen,4.794246,1,-3.6283,-1.70589,-14.493729,3.9915589999999994e-38,1.79213e-34,72.789887,-1,2,2
