## Cross-validation of sparse CCA

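Predict the expression of methylotrophy genes from the expression of methanotrophy genes. An earlier version of this analysis pooled expression on species; here the input files are pooled on gene name.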

First, run the notebook `../notebooks/prepare_data/prepare_X_m_expression_Y_nmm_expression_Xval.ipynb`.

That notebook fills `/Users/janet/ML_meta-omics/data/cross_val_data`, which this notebook reads via the relative path `../../data/cross_val_data/`.

In [None]:
import sys
# Fail fast unless the kernel is running Python 2; the assertion message
# reports the major version that was actually found.
assert sys.version_info.major == 2, "python version is {}".format(sys.version_info.major)
# Record the full interpreter version string in the cell output.
print(sys.version)

In [None]:
import itertools
import matplotlib as mpl
import matplotlib
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import subprocess
import sys

In [None]:
# Put the project's code directory on the import path so the local modules
# imported below can be resolved (presumably CCA.py and cross_validate.py
# live there -- confirm against the repository layout).
sys.path.append('../../code/')

from CCA import CcaAnalysis, ExpressionCCA
from cross_validate import CrossValCCA

In [None]:
# Directory holding the cross-validation data files (relative to this notebook).
raw_data_path = '../../data/cross_val_data/'
# Keep only entries whose name ends in '.tsv'.  A substring test
# ('.tsv' in f) would also accept names such as 'x.tsv.bak' or '.x.tsv.swp'.
# The directory is listed once: a listing whose result is discarded in the
# middle of a cell is never displayed, so the extra call only cost I/O.
xval_filenames = [f for f in os.listdir(raw_data_path) if f.endswith('.tsv')]

In [None]:
# All four inputs sit in the same directory, so the prefix is spelled once.
_expression_dir = '../../data/m_nmm_expression--sum_by_gene/'

# expression matrices: methanotroph (m) and methylotroph (nmm)
m_filepath = _expression_dir + 'methanotroph_expression_pooled_on_gene_name_filtered_ss.tsv'
nmm_filepath = _expression_dir + 'methylotroph_expression_pooled_on_gene_name_filtered_ss.tsv'

# gene-name files for the same two groups
m_gene_name_filepath = _expression_dir + 'methanotroph_gene_names_filtered_ss.tsv'
nmm_gene_name_filepath = _expression_dir + 'methylotroph_gene_names_filtered_ss.tsv'

In [None]:
# Build the ExpressionCCA object: methanotroph files feed the x side,
# methylotroph files the z side, and both sides get the same penalty (0.15).
cca = ExpressionCCA(
    x_train_filepath=m_filepath,
    x_gene_filepath=m_gene_name_filepath,
    z_train_filepath=nmm_filepath,
    z_gene_filepath=nmm_gene_name_filepath,
    input_filepath=raw_data_path,
    u_v_output_dir='./final_model_u_and_v_vectors',
    path_to_R_script='../../code/sparse_CCA.R',
    penalty_x=0.15,
    penalty_z=0.15,
    verbose=False,
)

In [None]:
# Summarize the CCA object built above; what is reported is defined by
# ExpressionCCA.get_summary (not visible in this notebook).
cca.get_summary()

In [None]:
# Peek at the first entries of the x-side gene table (presumably loaded from
# x_gene_filepath -- confirm in ExpressionCCA).
cca.x_genes.head()

In [None]:
# Pair the weights with gene names; presumably this is what populates
# cca.u_with_names / cca.v_with_names inspected below -- confirm in ExpressionCCA.
cca.associate_weights_with_gene_names()

In [None]:
# Preview the first two entries of the named u weights.
cca.u_with_names.head(2)

In [None]:
# Sanity check: True here would mean the named u weights are still unset.
cca.u_with_names is None 

In [None]:
# Sanity check: True here would mean the named v weights are still unset.
cca.v_with_names is None 

In [None]:
# Fetch the u-vector weights via sorted_weights.  The result is indexed below
# by its 'weight' and 'abs(weight)' columns; the sort order itself is defined
# in ExpressionCCA (presumably by absolute weight -- confirm).
u = cca.sorted_weights(vector='u')

In [None]:
# Histogram of the u weights, keeping only rows whose absolute weight is above zero.
u.loc[u['abs(weight)'] > 0, 'weight'].plot.hist()

In [None]:
# Show the first 20 rows of the u weights.
u.head(n=20)

In [None]:
# Same as for u, but for the v vector: fetch its weights via sorted_weights
# and show the first 20 rows.
v = cca.sorted_weights(vector='v')
v.head(n=20)

In [None]:
# Histogram of the v weights, keeping only rows whose absolute weight is above zero.
v.loc[v['abs(weight)'] > 0, 'weight'].plot.hist()