## Cross-validation of sparse CCA

Predict expression of methylotrophy genes from methanotrophy genes.  Previously pooled on species.

First, run: ../notebooks/prepare_data/prepare_X_m_expression_Y_nmm_expression_Xval.ipynb

That fills /Users/janet/ML_meta-omics/data/cross_val_data

In [None]:
import sys
assert sys.version_info.major == 2, "python version is {}".format(sys.version_info.major)
print(sys.version)

In [None]:
import itertools
import matplotlib as mpl
import matplotlib
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import subprocess
import sys

In [None]:
print(sys.version)

In [None]:
sys.path.append('../../code/')

from CCA import CcaAnalysis, ExpressionCCA
from cross_validate import CrossValCCA

In [None]:
# Directory holding the cross-validation input files.
raw_data_path = '../../data/cross_val_data/'
# Keep only names that end in ".tsv"; a substring test would also accept
# names such as "x.tsv.bak".
xval_filenames = [f for f in os.listdir(raw_data_path) if f.endswith('.tsv')]

In [None]:
def find_Xval_group(string):
    """Return the cross-validation fold number embedded in a file name.

    The name must contain ``fold<digits>`` with at least one letter or
    underscore before it and at least one letter, underscore or dot after
    the digits, e.g. ``'methanotroph_fold3_train.tsv'`` gives ``3``.

    Args:
        string: file name to inspect.

    Returns:
        The fold number as an int.

    Raises:
        ValueError: if `string` contains no such fold tag.
    """
    # Raw string with explicit A-Za-z: the range "A-z" would also match the
    # punctuation characters that sit between "Z" and "a" in ASCII.
    m = re.search(r'[_A-Za-z]+fold([0-9]+)[._A-Za-z]+', string)
    if m is None:
        raise ValueError("no fold number found in {!r}".format(string))
    return int(m.group(1))

# Unique fold numbers found in the file names.  Sorted so the folds are
# always listed in ascending order; a bare set has no guaranteed order.
groups = sorted(set(find_Xval_group(f) for f in xval_filenames))
groups

In [None]:
def get_filename(strings_list, fold, train=True,
                 raw_data_path='../../data/cross_val_data/'):
    """Return the single file in `raw_data_path` matching a fold and tags.

    Args:
        strings_list: substrings that must all occur in the file name.
        fold: fold number; the name must contain ``fold<fold>`` that is not
            followed by another digit, so fold 1 does not match "fold10".
        train: unused; kept so existing calls keep working.
        raw_data_path: directory whose listing is searched.

    Returns:
        The bare file name (no directory part) of the unique match.

    Raises:
        ValueError: if no file, or more than one file, matches.
    """
    # Negative lookahead stops "fold1" from also matching "fold10", "fold11", ...
    fold_tag = re.compile('fold{}(?![0-9])'.format(re.escape(str(fold))))

    potential_files = [f for f in os.listdir(raw_data_path)
                       if fold_tag.search(f)
                       and all(s in f for s in strings_list)]

    if not potential_files:
        raise ValueError(
            "no file name in {} matches fold {} and substrings {}".format(
                raw_data_path, fold, strings_list))
    if len(potential_files) > 1:
        raise ValueError(
            "too many file name matches in {}".format(potential_files))

    return potential_files[0]

In [None]:
get_filename(['methanotroph', 'train', 'filtered', 'ss'], fold=1, train=True)

In [None]:
[1] + [2, 3, 4]

In [None]:
# Build `summary`: one row per cross-validation fold, recording the fold
# number and the file names selected by get_filename for that fold.
summary = pd.DataFrame()

for fold in groups:
    # Substrings every selected file name must contain, in addition to the
    # organism/split substrings given per column below.
    file_specs = ['filtered', 'ss']
    #print('fold: {}'.format(fold))
    row = {}
    row['k'] = fold
    # "m" columns are matched on 'methanotroph', "nmm" columns on 'methylotroph'.
    row['m filename, train'] = get_filename(['methanotroph', 'train'] + file_specs, fold)
    row['m filename, val'] = get_filename(['methanotroph', 'val'] + file_specs, fold)
    row['nmm filename, train'] = get_filename(['methylotroph', 'train'] + file_specs, fold)
    row['nmm filename, val'] = get_filename(['methylotroph', 'val'] + file_specs, fold)
    row['gene names'] = get_filename(['methylotroph', 'genes'] + file_specs, fold)
    
    # make a dataframe row
    # Wrap each value in a one-element list so from_dict yields a one-row frame.
    row = {k:[v] for k, v in row.items()}
    df = pd.DataFrame.from_dict(row)
    #print(df)
    # Append the row; ignore_index is not set, so the index labels of the
    # single-row frames are kept as they are.
    summary = pd.concat([summary, df], axis=0)
summary

### Run sparse CCA on each of these training sets

Start with a demo on one set:

In [None]:
summary.head(1)

In [None]:
# Make sure the directory named by uv_dir exists before anything uses it.
uv_dir = './crossvalidation_u_and_v_vectors/'
uv_dir_exists = os.path.exists(uv_dir)
if not uv_dir_exists:
    os.mkdir(uv_dir)

In [None]:
os.listdir(uv_dir)[0:4]

In [None]:
def run_CCA(summary_df, cv, groups, pen_x, pen_z, verbose=False):
    """Call `cv.model` once per cross-validation fold in `groups`.

    For each fold, the matching row of `summary_df` supplies five file
    names; each is joined onto the module-level `raw_data_path`, printed,
    and passed to `cv.model` together with the penalties.

    Args:
        summary_df: DataFrame with one row per fold.  Column 'k' holds the
            fold number; columns 'm filename, train', 'nmm filename, train',
            'm filename, val', 'nmm filename, val' and 'gene names' hold
            file names.
        cv: object exposing a `model(...)` method accepting the keyword
            arguments used below (a CrossValCCA in this notebook).
        groups: iterable of fold numbers to process.
        pen_x: penalty forwarded to `cv.model` as `pen_x`.
        pen_z: penalty forwarded to `cv.model` as `pen_z`.
        verbose: forwarded to `cv.model`.

    Returns:
        None.

    Raises:
        AssertionError: if a fold does not match exactly one summary row.
    """
    # (printed label, summary column) pairs, in the order they are reported.
    path_columns = [
        ('x_filepath', 'm filename, train'),
        ('z_filepath', 'nmm filename, train'),
        ('x_val_filepath', 'm filename, val'),
        ('z_val_filepath', 'nmm filename, val'),
        ('gene_name_filepath', 'gene names'),
    ]

    for group in groups:
        row = summary_df[summary_df['k'] == group]
        assert row.shape[0] == 1, (
            "expected exactly one summary row for fold {}, found {}".format(
                group, row.shape[0]))

        paths = {}
        for label, column in path_columns:
            # Positional access: the selected row is not guaranteed to carry
            # index label 0.
            filename = row[column].iloc[0]
            paths[label] = os.path.join(raw_data_path, filename)
            print('{}: {}'.format(label, paths[label]))

        cv.model(x_train_filepath=paths['x_filepath'],
                 z_train_filepath=paths['z_filepath'],
                 pen_x=pen_x,
                 pen_z=pen_z,
                 x_val_filepath=paths['x_val_filepath'],
                 z_val_filepath=paths['z_val_filepath'],
                 gene_name_filepath=paths['gene_name_filepath'],
                 verbose=verbose)

## Demo/test the methods before running a bigger set

In [None]:
# CrossValCCA instance used for the demo/test runs in this section.
cv_demo = CrossValCCA(
    raw_data_path='../../data/cross_val_data/',
    uv_dir='./crossvalidation_u_and_v_vectors',
    input_filepath='../../data/cross_val_data/',
    path_to_R_script='../../code/sparse_CCA.R',  # default value
)

In [None]:
# The best control is zero penalty.  Should give one nonzero weight. 
run_CCA(summary_df=summary, cv=cv_demo, groups=[1, 2], pen_x=0., pen_z=0., verbose=True)

In [None]:
run_CCA(summary_df=summary, cv=cv_demo, groups=[1], pen_x=0.04, pen_z=0.03, verbose=True)

In [None]:
cv_demo.results

In [None]:
cv_demo

In [None]:
p = cv_demo.models[1].plot_projections()

In [None]:
cv_demo.models[1].summary

In [None]:
p_x = cv_demo.plot_correlation_vs_penalty(set='train', penalty='x')

In [None]:
p_x = cv_demo.plot_correlation_vs_penalty(set='val', penalty='x')

In [None]:
p_z = cv_demo.plot_num_nonzero_coeffs_vs_penalty(set='z')

## Hyperparameter tuning for 4-fold cross-val

In [None]:
cv = CrossValCCA(raw_data_path = '../../data/cross_val_data/', 
                 uv_dir = './crossvalidation_u_and_v_vectors',
                 input_filepath='../../data/cross_val_data/')

In [None]:
# Don't do a grid any more
#penx_vals = np.linspace(0, 0.04, num=4, endpoint=False)
#penz_vals = np.linspace(0, 0.04, num=4, endpoint=False)
#print('pen_x values:'+ str(penx_vals))
#print('pen_z values:'+ str(penz_vals))

In [None]:
def run_CCA_specify_value_tupes(cross_val_obj, group_list, penx_penz_tuples, verbose=False):
    """Run `run_CCA` once for every (pen_x, pen_z) pair.

    Each call uses the module-level `summary` table as `summary_df`.

    Args:
        cross_val_obj: object passed to `run_CCA` as `cv`.
        group_list: fold numbers passed to `run_CCA` as `groups`.
        penx_penz_tuples: sized sequence of (pen_x, pen_z) pairs.
        verbose: forwarded to `run_CCA`.

    Returns:
        None.
    """
    expected_models = len(penx_penz_tuples)
    # Report the folds actually requested, not the global `groups` list.
    print("analyze {} model(s) for crossvalidation sets {}".format(expected_models, group_list))
    for penx, penz in penx_penz_tuples:
        run_CCA(summary_df=summary, cv=cross_val_obj, groups=group_list,
                pen_x=penx, pen_z=penz, verbose=verbose)

In [None]:
run_CCA_specify_value_tupes(cross_val_obj=cv, 
                            group_list=[1], 
                            penx_penz_tuples=[(0, 0)], 
                            verbose=False)

In [None]:
list(itertools.product([1, 2, 3], [4, 5, 6]))

In [None]:
np.linspace(0, 0.1, num=11, endpoint=False)

In [None]:
for x in np.linspace(0, 0.1, num=11, endpoint=False):
    run_CCA_specify_value_tupes(cross_val_obj=cv, 
                                group_list=[1, 2, 3, 4], 
                                penx_penz_tuples=[(x, x)], 
                                verbose=False)

In [None]:
# Probably don't need to run this function b/c pretty good independence of x-val params.
def run_CCA_with_combinations_of_penalties(cross_val_obj, group_list,
                                           penx_vals, penz_vals,
                                           verbose=False):
    """Run the CCA for every combination of the given x and z penalties.

    Builds the Cartesian product of `penx_vals` and `penz_vals`, prints the
    inputs and the number of combinations, and hands the pairs to
    `run_CCA_specify_value_tupes`.

    Args:
        cross_val_obj: forwarded as `cross_val_obj`.
        group_list: fold numbers, forwarded as `group_list`.
        penx_vals: iterable of pen_x values.
        penz_vals: iterable of pen_z values.
        verbose: forwarded unchanged.

    Returns:
        None.
    """
    print(penx_vals, penz_vals)
    combos = list(itertools.product(penx_vals, penz_vals))
    expected_models = len(combos)
    print(expected_models)
    # Report the folds actually requested, not the global `groups` list.
    print("analyze {} models for crossvalidation sets {}".format(expected_models, group_list))

    run_CCA_specify_value_tupes(cross_val_obj=cross_val_obj,
                                group_list=group_list,
                                penx_penz_tuples=combos,
                                verbose=verbose)

In [None]:
# One nonzero-coefficient plot per data set: 'x' for px and 'z' for pz
# (pz previously repeated set='x').
px = cv.plot_num_nonzero_coeffs_vs_penalty(set='x')
pz = cv.plot_num_nonzero_coeffs_vs_penalty(set='z')

In [None]:
pxt = cv.plot_correlation_vs_penalty(set='train', penalty='x')
pxv = cv.plot_correlation_vs_penalty(set='val', penalty='x')

pzt = cv.plot_correlation_vs_penalty(set='train', penalty='z')
pzv = cv.plot_correlation_vs_penalty(set='val', penalty='z')