In [None]:
import sys
# Guard: this notebook is written for a Python 2 kernel. Stop right away,
# reporting the detected major version, when run under anything else.
assert sys.version_info[0] == 2, \
    "python version is {}".format(sys.version_info[0])
print(sys.version)

In [None]:
import math
import matplotlib as mpl
import matplotlib
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import subprocess

In [None]:
sys.path.append('../../code/')

In [None]:
from utils import split_df, aggregate_df

In [None]:
# ! ls -l ../../data/genomes_curated.tsv

In [None]:
# ! head ../../data/genomes_curated.tsv

In [None]:
## Create a pandas dataframe to hold the gene expression data
# Both inputs are parsed as tab-separated text (sep='\t'), including the
# expression summary despite its ".xls" extension.
GENE_EXP_DATA = "../../data/raw_data/3_summary_rpkm.xls"
meta4_exp = pd.read_csv(GENE_EXP_DATA, sep='\t')
# Curated genome table; it is merged into the expression table in the next cell.
genomes = pd.read_csv('../../data/genomes_curated.tsv', sep='\t')

In [None]:
# Attach the curated genome columns to the expression table. No join key is
# given, so pandas performs an inner join on every column name the two
# frames have in common.
meta4_exp = pd.merge(meta4_exp, genomes)
meta4_exp.head(3)

In [None]:
# Partition the merged table using its 'type' column.
# NOTE(review): split_df comes from the project-local utils module; it appears
# to return a dict-like keyed by type value (the 'm' and 'nmm' keys are used
# in the following cells) -- confirm against utils.py.
split_by_type = split_df(meta4_exp, 'type')
split_by_type.keys()

In [None]:
split_by_type['m'].shape

In [None]:
split_by_type['nmm'].shape

In [None]:
# Aggregate the methanotroph ('m') rows by their 'product' value.
# NOTE(review): aggregate_df is defined in utils.py; presumably it pools
# expression over rows sharing a product name and colnorm=False skips any
# per-column normalization -- confirm against utils.py.
methanotroph_expression = aggregate_df(split_by_type['m'], 'product', colnorm=False)
methanotroph_expression.shape

In [None]:
methanotroph_expression.head()

In [None]:
# Aggregate the 'nmm' rows by their 'product' value, mirroring the
# methanotroph cell.
# NOTE(review): aggregate_df is defined in utils.py; presumably it pools
# expression over rows sharing a product name and colnorm=False skips any
# per-column normalization -- confirm against utils.py.
methylotroph_expression = aggregate_df(split_by_type['nmm'], 'product', colnorm=False)
methylotroph_expression.shape

Remove rows with (near-)zero variance, i.e. rows whose standard deviation across columns is at most 0.001.  R's CCA function won't tolerate them.

In [None]:
methanotroph_expression.shape

In [None]:
# Keep only the rows whose standard deviation across columns exceeds 0.001.
methanotroph_expression = methanotroph_expression[
    methanotroph_expression.std(axis=1) > 0.001]

In [None]:
methanotroph_expression.shape

In [None]:
# Report the shape before and after dropping rows whose standard deviation
# across columns is 0.001 or less.
print(methylotroph_expression.shape)
methylotroph_expression = methylotroph_expression[
    methylotroph_expression.std(axis=1) > 0.001]
print(methylotroph_expression.shape)

In [None]:
methanotroph_expression.head()

In [None]:
# ! mkdir ../../data/m_nmm_expression--sum_by_gene

In [None]:
import os

# Output directory for the pooled expression tables. Create it when missing so
# this cell also works on a fresh checkout: to_csv does not create directories.
data_dir = '../../data/m_nmm_expression--sum_by_gene/'
if not os.path.isdir(data_dir):
    os.makedirs(data_dir)

# Each table is written transposed (its columns become the rows of the file),
# tab-separated.
methanotroph_expression.T.to_csv(
    data_dir + "methanotroph_expression_pooled_on_gene_name.tsv",
    sep='\t')
methylotroph_expression.T.to_csv(
    data_dir + "methylotroph_expression_pooled_on_gene_name.tsv",
    sep='\t')

In [None]:
methanotroph_expression.T.shape

Save the gene names

In [None]:
methanotroph_expression.index.to_series().head()

In [None]:
# Pull the 'product' labels out of each table as a Series. reset_index()
# already returns a new frame, so the source tables are left untouched.
m_gene_names = methanotroph_expression.reset_index()['product']
nmm_gene_names = methylotroph_expression.reset_index()['product']

In [None]:
m_gene_names.head()

In [None]:
# Write one gene name per line, without the positional index.
# header=False is spelled out because the default of Series.to_csv differs
# between pandas versions (older releases omit the header, newer ones write
# it); pinning it keeps the files header-free as originally produced.
m_gene_names.to_csv(
    data_dir + "methanotroph_gene_names.tsv", sep='\t', index=False,
    header=False)
nmm_gene_names.to_csv(
    data_dir + "methylotroph_gene_names.tsv", sep='\t', index=False,
    header=False)

In [None]:
pd.read_csv(data_dir + "methanotroph_expression_pooled_on_gene_name.tsv", 
            sep='\t').head()

In [None]:
pd.read_csv(data_dir + "methylotroph_expression_pooled_on_gene_name.tsv", 
            sep='\t').head()

## Filter out hypothetical/unknown genes, standardize, then split data into cross-val folds and write to tsv files

In [None]:
[s for s in methanotroph_expression.index.tolist() if 'hypothetical' in s]

In [None]:
[s for s in methanotroph_expression.index.tolist() if 'unknown' in s]

In [None]:
# Index labels that mention 'hypothetical' or 'unknown', followed by a display
# of the matching rows.
m_remove_lbls = [lbl for lbl in methanotroph_expression.index.tolist()
                 if any(tag in lbl for tag in ('hypothetical', 'unknown'))]
methanotroph_expression.loc[m_remove_lbls]

In [None]:
# Index labels that mention 'hypothetical' or 'unknown', followed by a display
# of the matching rows.
nmm_remove_lbls = [lbl for lbl in methylotroph_expression.index.tolist()
                   if any(tag in lbl for tag in ('hypothetical', 'unknown'))]
methylotroph_expression.loc[nmm_remove_lbls]

In [None]:
def filter_genes(df, remove_tags):
    """Return the rows of ``df`` whose index label contains none of ``remove_tags``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index holds string labels.
    remove_tags : iterable of str
        Substrings to reject. A row is dropped when its index label contains
        any of them (case-sensitive substring match).

    Returns
    -------
    pandas.DataFrame
        The surviving rows, in their original order.
    """
    tags = list(remove_tags)
    # Select with a positional boolean mask rather than a list of labels:
    # a label-based lookup returns every row of a repeated label once per
    # occurrence, which multiplies rows when the index contains duplicates.
    keep = np.array(
        [not any(tag in label for tag in tags) for label in df.index],
        dtype=bool)
    return df.loc[keep]


In [None]:
# Drop rows whose label mentions 'hypothetical' or 'unknown' from both tables.
m_filtered = filter_genes(df=methanotroph_expression,
                          remove_tags=['hypothetical', 'unknown'])
nmm_filtered = filter_genes(df=methylotroph_expression,
                            remove_tags=['hypothetical', 'unknown'])

In [None]:
# Shapes after dropping the hypothetical/unknown rows. Written as print(...)
# calls to match the other print calls in this notebook; the output is
# identical under Python 2.
print(m_filtered.shape)
print(nmm_filtered.shape)

In [None]:
# Number of rows removed by the hypothetical/unknown filter, per table.
# A single formatted string keeps the printed text identical to the former
# two-item print statement while using the print(...) call form.
print("# hypothetical/unknown methanotroph genes: {}".format(
    methanotroph_expression.shape[0] - m_filtered.shape[0]))
print("# hypothetical/unknown methylotroph genes: {}".format(
    methylotroph_expression.shape[0] - nmm_filtered.shape[0]))

In [None]:
from sklearn.preprocessing import StandardScaler
# StandardScaler centers and scales each *column* of its input, and the same
# scaler object is refit for the second table.
# NOTE(review): the fold-splitting cell slices the columns of these arrays
# with the KFold indices, so each column is scaled over the rows here --
# confirm that this orientation (and scaling before the train/val split) is
# what the downstream analysis expects.
ss = StandardScaler()
m_ss = ss.fit(m_filtered).transform(m_filtered)
nmm_ss = ss.fit(nmm_filtered).transform(nmm_filtered)

In [None]:
# NOTE(review): sklearn.cross_validation is the legacy home of KFold
# (deprecated in scikit-learn 0.18, removed in 0.20). The KFold(n=..., n_folds=...)
# construction and the direct iteration over the object in the cells that
# follow depend on this legacy API, so all three must be migrated together.
from sklearn.cross_validation import KFold

In [None]:
# Number of columns in the expression table; passed to KFold as the number of
# items to split. Printed with a print(...) call to match the other print
# calls in this notebook (same output under Python 2).
N = methanotroph_expression.shape[1]
print(N)

In [None]:
m_ss.shape

In [None]:
# Four shuffled folds over N items; the fixed random_state makes the fold
# assignment repeatable between runs.
kf = KFold(n=N, n_folds=4, shuffle=True, random_state=100)

In [None]:
import os

# Output directory for the per-fold matrices. Create it when missing so this
# cell also runs on a fresh checkout: np.savetxt does not create directories.
data_dir = '../../data/cross_val_data/'
if not os.path.isdir(data_dir):
    os.makedirs(data_dir)

fold_num = 1
# Each iteration of kf yields a (train, val) pair of index arrays; they are
# used to select columns of the standardized matrices.
for train, val in kf:
    m_str = data_dir + "methanotroph_fold" + str(fold_num)
    nmm_str = data_dir + "methylotroph_fold" + str(fold_num)

    train_m = m_ss[:, train]
    val_m = m_ss[:, val]
    train_nmm = nmm_ss[:, train]
    val_nmm = nmm_ss[:, val]

    # Everything is written transposed, i.e. one file row per selected column.
    print(train_m.T.shape)
    print(val_m.T.shape)
    np.savetxt(m_str + "_train.tsv", train_m.T, delimiter='\t')
    np.savetxt(m_str + "_val.tsv", val_m.T, delimiter='\t')
    np.savetxt(nmm_str + "_train.tsv", train_nmm.T, delimiter='\t')
    np.savetxt(nmm_str + "_val.tsv", val_nmm.T, delimiter='\t')

    fold_num += 1