In [None]:
import numpy as np
import scipy as sp
import pandas as pd

%matplotlib inline

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Path to the gene expression table that is loaded into `meta4_exp` below.
# NOTE(review): despite the .xls extension, the loader reads it as tab-delimited text.
GENE_EXP_DATA = "../data/raw_data/3_summary_rpkm.xls"

In [None]:
## Create a pandas dataframe to hold the gene expression data.
## The source file is parsed as tab-separated text (sep='\t').
meta4_exp = pd.read_csv(GENE_EXP_DATA, sep='\t')

Todo:
* curate the list of organisms.  Methanotrophs vs methylotrophs?
    * labels: methanotroph (m), non-methanotrophic methylotroph (nmm), heterotroph (h)
* Does StandardScaler make sense for this new problem?
* Write method to aggregate separate dataframes based on a list of organism names
* We don't get significance testing with CCA. What are we going to do that is quantitative?
* What tool are we going to use for variable reduction?
    * Use PCA or CCA components, or single genes' expression? 
    * What y are we going to predict? 
* How are we going to do k-fold cross validation?
* Recall how multiple mappings were handled for this set.

In [None]:
# One-off export of the distinct `genome` values to a TSV file; left commented out so
# re-running the notebook does not rewrite it.
# NOTE(review): presumably the starting point for the hand-curated genomes_curated.tsv — confirm.
# meta4_exp[['genome']].drop_duplicates().to_csv('../data/genomes.tsv', sep='\t', index=False)

In [None]:
# Load the curated genome table (tab-separated); it is merged onto `meta4_exp` further down.
genomes = pd.read_csv('../data/genomes_curated.tsv', sep='\t')

In [None]:
# Preview the first rows of the curated genome table.
genomes.head()

In [None]:
# Preview the first five rows of the raw expression table (before the merge).
meta4_exp.head(5)

In [None]:
# (rows, columns) of the expression table before the merge.
meta4_exp.shape

In [None]:
# Attach the curated per-genome columns to every expression row
# (presumably including the `type` label filtered on below — confirm).
# With no `on=` argument pandas joins on the columns the two frames share.
# validate='many_to_one' makes the merge fail loudly if the curated table lists a
# join key more than once, which would otherwise silently duplicate expression rows.
meta4_exp = meta4_exp.merge(genomes, validate='many_to_one')

In [None]:
# Rows whose curated type label is 'm' (methanotroph); the last line displays the subset's shape.
is_methanotroph = meta4_exp['type'] == 'm'
meta4_methanotrophs = meta4_exp.loc[is_methanotroph]
meta4_methanotrophs.shape

In [None]:
# Rows whose curated type label is 'nmm' (non-methanotrophic methylotroph); the last line displays the subset's shape.
is_methylotroph = meta4_exp['type'] == 'nmm'
meta4_methylotrophs = meta4_exp.loc[is_methylotroph]
meta4_methylotrophs.shape

In [None]:
# Rows whose curated type label is 'h' (heterotroph); the last line displays the subset's shape.
is_heterotroph = meta4_exp['type'] == 'h'
meta4_heterotrophs = meta4_exp.loc[is_heterotroph]
meta4_heterotrophs.shape

In [None]:
# Sanity check: the three type subsets together must account for every merged row.
subset_row_total = sum(
    part.shape[0]
    for part in (meta4_methanotrophs, meta4_methylotrophs, meta4_heterotrophs)
)
assert meta4_exp.shape[0] == subset_row_total

In [None]:
# Group name -> expression subset, so later cells can loop over all three groups.
datasets = dict(
    methanotrophs=meta4_methanotrophs,
    methylotrophs=meta4_methylotrophs,
    heterotrophs=meta4_heterotrophs,
)

In [None]:
# Collapse each group to one row per `product` by summing the numeric columns.
for name, df in datasets.items():
    rows_before = df.shape[0]
    # Build a new frame instead of dropping in place: `df` is a filtered slice of
    # meta4_exp, so an in-place drop raises SettingWithCopyWarning.
    # numeric_only=True keeps string columns out of the sum (pandas >= 2.0 would
    # otherwise concatenate them, which breaks the scaling step later on).
    df = df.drop(columns=['type']).groupby('product').sum(numeric_only=True)
    rows_after = df.shape[0]
    # `df = ...` only rebinds the loop variable, so the aggregated frame has to be
    # written back into the dict explicitly. Reassigning an existing key while
    # iterating is safe because the dict's size does not change.
    datasets[name] = df
    print('{}. # rows: {} --> {}'.format(name, rows_before, rows_after))
    print(df.head(2))
    print("")
    

In [None]:
# (rows, columns) of the methanotroph table after aggregation by product.
datasets['methanotrophs'].shape

In [None]:
# Preview the first two rows of the aggregated methanotroph table (indexed by product).
datasets['methanotrophs'].head(2)

In [None]:
# Quick check of the container type holding the per-group tables.
type(datasets)

In [None]:
# Standardise every column of each group's table and keep the original row/column labels.
ss = StandardScaler()
transformed_data = {}

for name, df in datasets.items():
    # fit_transform returns a bare array, so the labels are re-attached here.
    transformed = pd.DataFrame(ss.fit_transform(df), index=df.index, columns=df.columns)
    print('shape: {} --> {}'.format(df.shape, transformed.shape))
    transformed_data[name] = transformed
    
    

In [None]:
# Show the first two scaled rows of every group, followed by a separator line.
for scaled_df in transformed_data.values():
    print(scaled_df.head(2), '----------------', sep='\n')

In [None]:
from sklearn.cross_decomposition import CCA

About CCA:
* arguments: 
    * `sklearn.cross_decomposition.CCA(n_components=2, scale=True, max_iter=500, tol=1e-06, copy=True)`
* scale appears to normalize the features by centering and giving each unit variance. 
    * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/cross_decomposition/pls_.py
    * So maybe I didn't need to apply StandardScaler on my own?

In [None]:
# CCA model restricted to a single component; it is fitted in the next cell.
cca = CCA(n_components=1)

In [None]:
# Fit CCA with the scaled methanotroph table as X and the scaled methylotroph table as Y.
# Both frames are transposed first, so the rows of the scaled tables (indexed by
# `product`) become the columns handed to CCA.
# NOTE(review): the fit needs both transposed frames to have the same number of rows —
# confirm that the two groups share the same set of columns before the transpose.
cca_fit = cca.fit(X=transformed_data['methanotrophs'].T, Y=transformed_data['methylotrophs'].T)

In [None]:
# Shape of the fitted weights on the X (methanotroph) side.
cca_fit.x_weights_.shape

In [None]:
# Shape of the fitted weights on the Y (methylotroph) side.
cca_fit.y_weights_.shape

In [None]:
# Shape of the scaled methanotroph table, for comparison with the weight shapes above.
transformed_data['methanotrophs'].shape

In [None]:
# Label the X-side weights with the row index of the scaled methanotroph table,
# so each weight can be looked up by that label.
methanotroph_weights = pd.DataFrame(cca_fit.x_weights_, index=transformed_data['methanotrophs'].index)

In [None]:
# Label the Y-side weights with the row index of the scaled methylotroph table,
# so each weight can be looked up by that label.
methylotroph_weights = pd.DataFrame(cca_fit.y_weights_, index=transformed_data['methylotrophs'].index)

In [None]:
# Preview the first three labelled methanotroph weights.
methanotroph_weights.head(3)

In [None]:
def summarise_weights(df):
    """Rank weights by absolute magnitude, largest first.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per feature. The weights must be in the column labelled ``0``
        (the label ``pd.DataFrame`` assigns to the first column of an unlabelled array).

    Returns
    -------
    pandas.DataFrame
        A new frame with the former index moved into regular column(s), the
        weights renamed to ``weight``, and an added ``abs(weight)`` column,
        sorted by ``abs(weight)`` in descending order. ``df`` is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no column labelled ``0``.
    """
    # reset_index() returns a copy, so the caller's frame is left untouched.
    summary = df.reset_index().rename(columns={0: 'weight'})
    summary['abs(weight)'] = summary['weight'].abs()
    return summary.sort_values(by='abs(weight)', ascending=False)

In [None]:
# Rank the methanotroph weights by absolute magnitude.
methanotroph_weight_summary = summarise_weights(methanotroph_weights)

In [None]:
# Largest-magnitude methanotroph weights.
methanotroph_weight_summary.head()

In [None]:
# Rank the methylotroph weights by absolute magnitude and show the largest ones.
methylotroph_weight_summary = summarise_weights(methylotroph_weights)
methylotroph_weight_summary.head()

In [None]:
# Histogram of the absolute methanotroph weights. The title and x-label let the
# figure stand on its own; the trailing `;` hides the return value's repr.
ax = methanotroph_weight_summary['abs(weight)'].plot.hist()
ax.set(title='Methanotroph CCA weights', xlabel='abs(weight)');

In [None]:
# Histogram of the absolute methylotroph weights. The title and x-label let the
# figure stand on its own; the trailing `;` hides the return value's repr.
ax = methylotroph_weight_summary['abs(weight)'].plot.hist()
ax.set(title='Methylotroph CCA weights', xlabel='abs(weight)');