In [None]:
import sys
# This notebook is written for Python 2: stop right away on any other major
# version, then show the full interpreter version string.
py_major = sys.version_info.major
assert py_major == 2, "python version is {}".format(py_major)
print(sys.version)

In [None]:
import math
import matplotlib as mpl
import matplotlib
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pprint
import seaborn as sns
import subprocess

In [None]:
sys.path.append('../../code/')

In [None]:
from utils import split_df, aggregate_df

In [None]:
# ! ls -l ../../data/genomes_curated.tsv

In [None]:
# ! head ../../data/genomes_curated.tsv

In [None]:
## Load the two input tables into pandas dataframes. Both files are read as
## tab-separated text (the expression file is too, despite its .xls suffix).
GENE_EXP_DATA = "../../data/raw_data/3_summary_rpkm.xls"

# curated genome table
genomes = pd.read_csv('../../data/genomes_curated.tsv', sep='\t')
# gene expression table
meta4_exp = pd.read_csv(GENE_EXP_DATA, sep='\t')

In [None]:
# Attach the curated genome columns to the expression rows. No join key is
# given, so pandas joins (inner) on the column names the two tables share.
meta4_exp = pd.merge(meta4_exp, genomes)
meta4_exp.head(3)

In [None]:
# Partition the merged table on its 'type' column with the project helper
# split_df. The cells below look up the 'm' and 'nmm' entries of the result.
split_by_type = split_df(meta4_exp, 'type')
split_by_type.keys()

In [None]:
split_by_type['m'].shape

In [None]:
split_by_type['nmm'].shape

In [None]:
# Aggregate the 'm' subset on its 'product' column with the project helper
# aggregate_df (colnorm=False), then show the resulting table's shape.
# NOTE(review): presumably this pools expression per gene product, one row per
# product -- confirm against utils.aggregate_df.
methanotroph_expression = aggregate_df(split_by_type['m'], 'product', colnorm=False)
methanotroph_expression.shape

In [None]:
methanotroph_expression.head()

In [None]:
# Aggregate the 'nmm' subset on its 'product' column with the project helper
# aggregate_df (colnorm=False), then show the resulting table's shape.
# NOTE(review): presumably this pools expression per gene product, one row per
# product -- confirm against utils.aggregate_df.
methylotroph_expression = aggregate_df(split_by_type['nmm'], 'product', colnorm=False)
methylotroph_expression.shape

Remove rows (genes) with zero variance; in practice, rows whose standard deviation across columns is at most 0.001 are dropped.  R's CCA function won't tolerate them.

In [None]:
methanotroph_expression.shape

In [None]:
# Drop the rows (genes) that are effectively constant across the columns:
# a row is kept only when its standard deviation exceeds 0.001.
print("before removing zero-var genes: {}".format(methanotroph_expression.shape))
m_row_std = methanotroph_expression.std(axis=1)
methanotroph_expression = methanotroph_expression.loc[m_row_std > 0.001, :]
print("after removing zero-var genes: {}".format(methanotroph_expression.shape))

In [None]:
# Drop the rows (genes) that are effectively constant across the columns:
# a row is kept only when its standard deviation exceeds 0.001.
print("before removing zero-var genes: {}".format(methylotroph_expression.shape))
nmm_row_std = methylotroph_expression.std(axis=1)
methylotroph_expression = methylotroph_expression.loc[nmm_row_std > 0.001, :]
print("after removing zero-var genes: {}".format(methylotroph_expression.shape))

In [None]:
methanotroph_expression.head()

In [None]:
# ! mkdir ../../data/m_nmm_expression--sum_by_gene

In [None]:
# Write both pooled expression tables, transposed, as tab-separated files.
# The target directory has to exist already (see the commented-out mkdir cell).
data_dir = '../../data/m_nmm_expression--sum_by_gene/'

m_pooled_path = data_dir + "methanotroph_expression_pooled_on_gene_name.tsv"
nmm_pooled_path = data_dir + "methylotroph_expression_pooled_on_gene_name.tsv"

methanotroph_expression.transpose().to_csv(m_pooled_path, sep='\t')
methylotroph_expression.transpose().to_csv(nmm_pooled_path, sep='\t')

Save the gene names

In [None]:
methanotroph_expression.index.to_series().head(2)

In [None]:
# Turn each table's index into an ordinary 'product' column and keep just that
# column. reset_index() returns a new frame, so the source tables are untouched.
m_gene_names = methanotroph_expression.reset_index()['product']
nmm_gene_names = methylotroph_expression.reset_index()['product']

In [None]:
m_gene_names.head(2)

In [None]:
# Write the gene-name columns next to the expression tables, without the
# positional index.
m_names_path = data_dir + "methanotroph_gene_names.tsv"
nmm_names_path = data_dir + "methylotroph_gene_names.tsv"

m_gene_names.to_csv(m_names_path, sep='\t', index=False)
nmm_gene_names.to_csv(nmm_names_path, sep='\t', index=False)

## Split data into cross-val folds and write to tsv files

In [None]:
# Collect the methanotroph gene labels containing 'hypothetical' and show the
# first five of them.
hypothetical_m = []
for label in methanotroph_expression.index.tolist():
    if 'hypothetical' in label:
        hypothetical_m.append(label)
pprint.pprint(hypothetical_m[:5])

In [None]:
# Collect the methanotroph gene labels containing 'unknown' and show the
# first five of them.
unknown_function_m = []
for label in methanotroph_expression.index.tolist():
    if 'unknown' in label:
        unknown_function_m.append(label)
pprint.pprint(unknown_function_m[:5])

In [None]:
def filter_genes(df, remove_tags):
    """Return the rows of ``df`` whose index label contains none of ``remove_tags``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index holds string labels.
    remove_tags : sequence of str
        Substrings to screen for. A row is dropped when its label contains
        any of them (plain, case-sensitive ``in`` test).

    Returns
    -------
    pandas.DataFrame
        The surviving rows, in their original order.
    """
    # Select rows with a positional boolean mask instead of a list of labels.
    # A label-list lookup returns every matching row once per requested label,
    # so a label that occurs k times in the index would come back k*k times.
    keep_mask = np.array(
        [not any(tag in label for tag in remove_tags)
         for label in df.index.tolist()],
        dtype=bool)
    return df.loc[keep_mask]


In [None]:
# Remove the genes labelled 'hypothetical' or 'unknown' from both tables,
# then report each table's shape before and after the filter.
m_filtered = filter_genes(methanotroph_expression, ['hypothetical','unknown'])
nmm_filtered = filter_genes(methylotroph_expression, ['hypothetical','unknown'])

print('methanotroph df before filter: {}'.format(methanotroph_expression.shape))
print('methanotroph df after filter: {}'.format(m_filtered.shape))
print('-----')
print('methylotroph df before filter: {}'.format(methylotroph_expression.shape))
print('methylotroph df after filter: {}'.format(nmm_filtered.shape))

In [None]:
print(data_dir + "methanotroph_expression_pooled_on_gene_name_filtered.tsv")

In [None]:
# 12/08: Save the filtered ones to .tsv too.
# The tables are written transposed, as tab-separated files under data_dir.
m_filtered.T.to_csv(data_dir +
                    "methanotroph_expression_pooled_on_gene_name_filtered.tsv",
                    sep='\t')
nmm_filtered.T.to_csv(data_dir +
                    "methylotroph_expression_pooled_on_gene_name_filtered.tsv",
                    sep='\t')

# Gene names of the filtered tables, taken from each table's 'product' index.
m_filtered_gene_names = m_filtered.copy().reset_index()['product']
nmm_filtered_gene_names = nmm_filtered.copy().reset_index()['product']

m_filtered_gene_names.to_csv(
    data_dir + "methanotroph_gene_names_filtered.tsv", sep='\t', index=False)
# Write the *filtered* methylotroph names (not the unfiltered nmm_gene_names)
# so this file lines up with the filtered expression table saved above.
nmm_filtered_gene_names.to_csv(
    data_dir + "methylotroph_gene_names_filtered.tsv", sep='\t', index=False)



In [None]:
# Number of genes removed by the 'hypothetical'/'unknown' filter in each group
# (row count before the filter minus row count after it).
# Written as print(...format()) so the cell parses on both Python 2 and 3 and
# matches the print style used earlier in the notebook; the output is unchanged.
print("# hypothetical/unknown methanotroph genes: {}".format(
    methanotroph_expression.shape[0] - m_filtered.shape[0]))
print("# hypothetical/unknown methylotroph genes: {}".format(
    methylotroph_expression.shape[0] - nmm_filtered.shape[0]))

In [None]:
from sklearn.preprocessing import StandardScaler
# A single scaler object is reused by the cells below. Every fit_transform
# call refits it, so it only ever holds the statistics of the latest fit.
ss = StandardScaler()
# Earlier attempt on the untransposed tables, kept for reference; the next
# cell scales the transposed tables instead.
# m_ss = ss.fit_transform(m_filtered)
# nmm_ss = ss.fit_transform(nmm_filtered)

In [None]:
# Standardize each filtered table after transposing it. StandardScaler works
# column by column, so after the transpose it is each original row (gene)
# that gets centered and scaled.
m_ss = ss.fit_transform(m_filtered.T)
nmm_ss = ss.fit_transform(nmm_filtered.T)
# print(...) with a single argument gives the same output on Python 2 and 3.
print(m_ss.shape)
print(nmm_ss.shape)

In [None]:
## gene names of the baseline filtered dataframes, read from their 'product'
## index (reset_index() builds a new frame, so no explicit copy is needed)
m_genes = m_filtered.reset_index()['product']
nmm_genes = nmm_filtered.reset_index()['product']

In [None]:
# 12/08: Save the filtered & standard-scaled matrices to .tsv too, together
# with the gene names that label their columns.
m_ss_path = data_dir + "methanotroph_expression_pooled_on_gene_name_filtered_ss.tsv"
nmm_ss_path = data_dir + "methylotroph_expression_pooled_on_gene_name_filtered_ss.tsv"
np.savetxt(m_ss_path, m_ss, delimiter='\t')
np.savetxt(nmm_ss_path, nmm_ss, delimiter='\t')

m_ss_names_path = data_dir + "methanotroph_gene_names_filtered_ss.tsv"
nmm_ss_names_path = data_dir + "methylotroph_gene_names_filtered_ss.tsv"
m_genes.to_csv(m_ss_names_path, sep='\t', index=False)
nmm_genes.to_csv(nmm_ss_names_path, sep='\t', index=False)



In [None]:
data_dir + "methanotroph_expression_pooled_on_gene_name_filtered_ss.tsv"

In [None]:
data_dir + "methanotroph_gene_names_filtered_ss.tsv"

In [None]:
nmm_genes[4:7]

In [None]:
def filter_zero_variance(mat):
    """Drop the constant columns of a 2-D array.

    Returns a ``(filtered, keep_inds)`` tuple: ``filtered`` holds only the
    columns of ``mat`` whose variance along axis 0 is nonzero, and
    ``keep_inds`` is the boolean column mask that was applied.
    """
    col_variance = np.var(mat, axis=0)
    nonconstant = np.absolute(col_variance) != 0
    filtered = mat[:, nonconstant]
    return filtered, nonconstant

In [None]:
## keeps only the cols with nonzero variance in BOTH matrices
## (i.e. filters out any col that has zero variance in EITHER matrix)
def filter_zv_multi(mat1, mat2):
    """Apply one shared zero-variance column filter to two 2-D arrays.

    A column is kept only when its variance along axis 0 is nonzero in
    ``mat1`` and also in ``mat2``. Returns ``(mat1_kept, mat2_kept,
    keep_inds)``, where ``keep_inds`` is the boolean column mask used on both.
    """
    varies_in_1 = np.absolute(np.var(mat1, axis=0)) != 0
    varies_in_2 = np.absolute(np.var(mat2, axis=0)) != 0
    shared = varies_in_1 & varies_in_2
    return mat1[:, shared], mat2[:, shared], shared

In [None]:
testmat, testinds = filter_zero_variance(m_ss)

In [None]:
testmat.shape

In [None]:
from sklearn.cross_validation import KFold

In [None]:
# Column count of the pooled methanotroph table; passed to KFold below as the
# number of items to split.
N = methanotroph_expression.shape[1]
# print(...) with a single argument gives the same output on Python 2 and 3.
print(N)

In [None]:
# 4 shuffled folds over N items; the fixed random_state makes the split
# repeatable. This is the sklearn.cross_validation KFold call signature
# (n / n_folds), and the resulting object is iterated directly further down.
kf = KFold(n=N, n_folds=4, shuffle=True, random_state=100)

In [None]:
# For every cross-validation fold, write the train and validation matrices and
# the matching gene-name list for both groups as .tsv files under data_dir.
# Note that data_dir is re-pointed here, away from the directory used above.
data_dir = '../../data/cross_val_data/'
fold_num = 1
for train, val in kf:
    m_str = data_dir + "methanotroph_fold" + str(fold_num) + "_ss_filtered"
    nmm_str = data_dir + "methylotroph_fold" + str(fold_num) + "_ss_filtered"

    # Re-standardize each split on its own rows.
    # NOTE(review): the validation rows are scaled with statistics fitted on
    # the validation rows themselves, not with the training fit -- confirm
    # that this is what the downstream analysis expects.
    train_m = ss.fit_transform(m_ss[train,:])
    val_m = ss.fit_transform(m_ss[val,:])
    train_nmm = ss.fit_transform(nmm_ss[train,:])
    val_nmm = ss.fit_transform(nmm_ss[val,:])

    ## filter out zero-variance genes and record gene names for m/nmm sets
    ## (train/val for a given dataset must have the same features)
    tmmat, vmmat, minds = filter_zv_multi(train_m, val_m)
    fold_mgenes = np.asarray(m_genes[minds])

    tnmm_mat, vnmm_mat, nmminds = filter_zv_multi(train_nmm, val_nmm)
    fold_nmmgenes = np.asarray(nmm_genes[nmminds])

    # Progress report (methanotroph shapes only). Written with
    # print(...format()) so the cell parses on both Python 2 and 3; the
    # printed text is unchanged.
    print("Fold# {}".format(fold_num))
    print("Unfiltered SS train_m matrix: {}".format(train_m.shape))
    print("Unfiltered SS val_m matrix: {}".format(val_m.shape))
    print("Nonzero variance train_m matrix: {}".format(tmmat.shape))
    print("Nonzero variance val_m matrix: {}".format(vmmat.shape))
    print("Gene name vector for methanotrophs: {}".format(fold_mgenes.shape))

    np.savetxt(m_str + "_train.tsv", tmmat, delimiter='\t')
    np.savetxt(m_str + "_val.tsv", vmmat, delimiter='\t')
    np.savetxt(m_str + "_genes.tsv", fold_mgenes, fmt='%s', delimiter='\t')
    np.savetxt(nmm_str + "_train.tsv", tnmm_mat, delimiter='\t')
    np.savetxt(nmm_str + "_val.tsv", vnmm_mat, delimiter='\t')
    np.savetxt(nmm_str + "_genes.tsv", fold_nmmgenes, fmt='%s', delimiter='\t')

    fold_num += 1