In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
# Path to the gene-expression summary table. Despite the .xls extension the
# file is parsed below as tab-separated text (pd.read_csv with sep='\t').
GENE_EXP_DATA = "../data/raw_data/3_summary_rpkm.xls"

In [3]:
# Load the gene expression table (tab-delimited text) into a DataFrame.
meta4_exp = pd.read_csv(GENE_EXP_DATA, delimiter='\t')

In [4]:
# Load the curated genome table (tab-delimited text) into a DataFrame.
genomes = pd.read_csv('../data/genomes_curated.tsv', delimiter='\t')

In [5]:
# Attach the curated genome annotations to every expression row. No join key
# is given, so pandas joins on all column names the two frames share, and the
# default inner join keeps only rows that have a match in `genomes`.
meta4_exp = pd.merge(meta4_exp, genomes)
meta4_exp.head(3)

Unnamed: 0,genome,locus_tag,product,LakWasMeta1_LOW4_2_rpkm,LakWasMeta2_LOW4_2_rpkm,LakWasMeta3_LOW4_2_rpkm,LakWasMeta4_LOW4_2_rpkm,LakWasMet10_HOW4_2_rpkm,LakWasMeta7_HOW4_2_rpkm,LakWasMeta8_HOW4_2_rpkm,...,LakWasM118_HOW13_2_rpkm,LakWasM121_LOW14_2_rpkm,LakWasM122_LOW14_2_rpkm,LakWasM123_LOW14_2_rpkm,LakWasM124_LOW14_2_rpkm,LakWasM127_HOW14_2_rpkm,LakWasM128_HOW14_2_rpkm,LakWasM129_HOW14_2_rpkm,LakWasM130_HOW14_2_rpkm,type
0,Methylotenera mobilis JLW8,Mmol_0001,chromosomal replication initiator protein DnaA,1,1,4,2,1,2,1,...,1,3,0,0,22,0,3,0,0,nmm
1,Methylotenera mobilis JLW8,Mmol_0002,"DNA polymerase III, beta subunit (EC 2.7.7.7)",5,4,2,4,3,6,0,...,0,6,0,0,26,0,1,0,0,nmm
2,Methylotenera mobilis JLW8,Mmol_0003,DNA gyrase subunit B (EC 5.99.1.3),5,7,5,3,5,3,0,...,1,10,0,0,28,0,1,0,0,nmm


In [6]:
# Keep only the rows whose 'type' label is 'm'.
meta4_methanotrophs = meta4_exp.loc[meta4_exp['type'].eq('m')]
meta4_methanotrophs.shape

(41749, 87)

In [7]:
# Keep only the rows whose 'type' label is 'nmm'.
meta4_methylotrophs = meta4_exp.loc[meta4_exp['type'].eq('nmm')]
meta4_methylotrophs.shape

(80830, 87)

In [8]:
# Keep only the rows whose 'type' label is 'h'.
meta4_heterotrophs = meta4_exp.loc[meta4_exp['type'].eq('h')]
meta4_heterotrophs.shape

(90131, 87)

In [9]:
# Sanity check: the three per-type subsets together account for every row.
assert len(meta4_exp) == sum(
    len(subset)
    for subset in (meta4_methanotrophs, meta4_methylotrophs, meta4_heterotrophs)
)

In [46]:
def split_df(df, split_by):
    """Split a dataframe into sub-dataframes keyed by the values of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to partition.
    split_by : column label
        Column whose distinct values define the partitions.

    Returns
    -------
    dict
        Maps each distinct value of ``df[split_by]`` to the rows of ``df``
        holding that value. Row order and index labels are preserved.

    Raises
    ------
    AssertionError
        If the partitions do not account for every row of ``df``. This
        happens when ``split_by`` contains missing values: NaN never compares
        equal to itself, so those rows match no partition.
    """
    labels = df[split_by]
    sub_dfs = {lbl: df[labels == lbl] for lbl in labels.unique()}

    # Every row must land in exactly one partition; report how many did not.
    n_assigned = sum(len(sub) for sub in sub_dfs.values())
    assert len(df) == n_assigned, (
        "split_df: {} of {} rows were not assigned to any '{}' group "
        "(missing values in that column?)".format(
            len(df) - n_assigned, len(df), split_by)
    )
    return sub_dfs

In [22]:
def aggregate_df(df, collapse_by, colnorm=False):
    """Sum the numeric columns of a dataframe within groups of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate.
    collapse_by : column label
        Column whose values define the groups; it becomes the index of the
        result.
    colnorm : bool, default False
        If True, divide every aggregated column by its own total so that each
        column sums to 1. A column whose total is zero yields NaN.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``collapse_by`` and one column per
        numeric column of ``df``.
    """
    # numeric_only=True keeps text columns out of the sum. Without it, recent
    # pandas concatenates strings, which then breaks the normalisation below.
    agg_df = df.groupby([collapse_by]).sum(numeric_only=True)
    if colnorm:
        agg_df = agg_df / agg_df.sum(axis=0)
    return agg_df

In [47]:
# Partition the merged expression table by the 'type' label.
sub_dfs = split_df(df=meta4_exp, split_by='type')

In [48]:
# Show the distinct 'type' labels that split_df used as dictionary keys.
sub_dfs.keys()

['h', 'nmm', 'm']

In [49]:
# Replace each per-type frame with its per-genome, column-normalised sums and
# report how many rows went in and came out.
for name, df in sub_dfs.items():
    rows_before = len(df)
    sub_dfs[name] = aggregate_df(df, 'genome', colnorm=True)
    rows_after = len(sub_dfs[name])
    print('{}. # rows: {} --> {}'.format(name, rows_before, rows_after))
    print("")

h. # rows: 90131 --> 20

nmm. # rows: 80830 --> 25

m. # rows: 41749 --> 10

