# Detect bimodal distribution for DEGs by ethnicity

In [1]:
import functools
import numpy as np
import pandas as pd
from sklearn import mixture

In [2]:
config = {
    'deg_file': '../../_m/genes/diffExpr_EAvsAA_full.txt',
    'res_file': '../../_m/genes/residualized_expression.tsv',
    'pheno_file': '../../../../../input/phenotypes/_m/dg_phenotypes.csv',
}

In [3]:
@functools.lru_cache()
def get_deg():
    ''' Take significant DE genes obtained from limma-voom pipeline.
    '''
    return pd.read_csv(config['deg_file'], sep='\t', index_col=0)


@functools.lru_cache()
def get_deg_sig(fdr=0.05):
    ''' Take significant DE genes obtained from limma-voom pipeline.
    '''
    dft = get_deg()
    return dft[(dft['adj.P.Val'] < fdr)]


@functools.lru_cache()
def get_residualized():
    '''Load residualization file.
    '''
    return pd.read_csv(config['res_file'], sep='\t', index_col=0).transpose()


@functools.lru_cache()
def get_deg_res_df():
    res_df = get_residualized()
    return res_df[np.array(get_deg().index)]


@functools.lru_cache()
def get_pheno_data():
    return pd.read_csv(config['pheno_file'], index_col=0)


@functools.lru_cache()
def get_pheno_res_df():
    return get_pheno_data().merge(get_deg_res_df(), left_index=True, right_index=True)


In [5]:
def gaussian_mixture_test(X, num=7):
    lowest_bic = np.infty
    bic = []
    n_components_range = range(1, num)
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components, 
                                      covariance_type='spherical')
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            best_n = n_components
            lowest_bic = bic[-1]
    return best_n

In [6]:
df = get_pheno_res_df()
aa = df[(df['Race'] == 'AA')].copy()
ea = df[(df['Race'] == 'CAUC')].copy()
modality_aa = []; modality_ea = [];

for gene in np.array(get_deg_sig().gencodeID):
    Xaa = np.array(aa[gene]).reshape(-1, 1)
    Xea = np.array(ea[gene]).reshape(-1, 1)
    modality_aa.append(gaussian_mixture_test(Xaa))
    modality_ea.append(gaussian_mixture_test(Xea))

In [7]:
dt = get_deg_sig()
dt.loc[:,'Modality_AA'] = modality_aa
dt.loc[:,'Modality_EA'] = modality_ea
dt.to_csv('degs_with_modality.txt', sep='\t')
dt.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,Length,gencodeID,ensemblID,gene_type,Symbol,EntrezID,Class,meanExprs,NumTx,gencodeTx,logFC,AveExpr,t,P.Value,adj.P.Val,B,Modality_AA,Modality_EA
ENSG00000154237.12,17600,ENSG00000154237.12,ENSG00000154237,protein_coding,LRRK1,79705.0,InGen,0.276253,11,ENST00000388948.7|ENST00000534045.5|ENST000005...,-1.116997,4.13536,-13.621754,4.660148000000001e-28,9.920989999999999e-24,53.11471,1,1
ENSG00000230076.1,318,ENSG00000230076.1,ENSG00000230076,processed_pseudogene,,,InGen,0.598932,1,ENST00000457497.1,-3.861304,-2.091205,-12.567775,3.0812050000000003e-25,3.279788e-21,41.41311,2,1
ENSG00000233913.7,645,ENSG00000233913.7,ENSG00000233913,processed_pseudogene,,,InGen,1.672625,1,ENST00000522551.1,-2.272833,1.484546,-11.622463,1.060691e-22,7.527018e-19,40.276517,2,2
ENSG00000226278.1,261,ENSG00000226278.1,ENSG00000226278,unprocessed_pseudogene,PSPHP1,,InGen,1.446923,1,ENST00000450062.1,-4.320438,-1.326564,-11.566503,1.498506e-22,7.975422e-19,36.911388,2,2
ENSG00000259583.2,6792,ENSG00000259583.2,ENSG00000259583,antisense,,101927751.0,InGen,0.322824,3,ENST00000560351.2|ENST00000560068.1|ENST000005...,-1.017516,2.929559,-11.006935,4.718776e-21,2.0091610000000003e-17,37.168231,1,1
