## Cross-validation of sparse CCA

Predict expression of methylotrophy genes from methanotrophy genes.  Previously pooled on species.

First, run: ../notebooks/prepare_data/prepare_X_m_expression_Y_nmm_expression_Xval.ipynb

That notebook fills /Users/janet/ML_meta-omics/data/cross_val_data

In [None]:
import sys
# Stop immediately unless the kernel is Python 2; the message reports the
# major version that was actually found.
# NOTE(review): presumably the CCA helper code imported further down needs
# Python 2 -- confirm before relaxing this check.
assert sys.version_info.major == 2, "python version is {}".format(sys.version_info.major)
print(sys.version)

In [None]:
import pandas as pd

In [None]:
import matplotlib as mpl
import matplotlib
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import subprocess
import sys

In [None]:
print(sys.version)

In [None]:
sys.path.append('../../code/')

from CCA import CcaAnalysis, ExpressionCCA

In [None]:
# Directory that holds the per-fold cross-validation tables.
raw_data_path = '../../data/cross_val_data/'
# List the directory once, then keep only entries whose name contains '.tsv'.
directory_entries = os.listdir(raw_data_path)
xval_filenames = [entry for entry in directory_entries if '.tsv' in entry]

In [None]:
def find_Xval_group(string):
    """Return the cross-validation fold number embedded in a file name.

    The name must contain ``fold<digits>`` with at least one letter or
    underscore directly before ``fold`` and at least one letter, underscore
    or dot directly after the digits, e.g.
    ``'methanotroph_fold3_train.tsv'`` gives ``3``.

    Args:
        string: file name to inspect.

    Returns:
        The fold number as an int.

    Raises:
        ValueError: if `string` contains no such fold marker.
    """
    # A-Za-z is spelled out on purpose: the range A-z would also admit the
    # punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ `).
    m = re.search(r'[_A-Za-z]+fold([0-9]+)[._A-Za-z]+', string)
    if m is None:
        raise ValueError("no fold number found in {!r}".format(string))
    return int(m.group(1))

# Distinct fold numbers found among the data files, in ascending order so
# that every later loop over `groups` visits the folds in a fixed order
# (the iteration order of a bare set is an implementation detail).
groups = sorted(set(find_Xval_group(f) for f in xval_filenames))
groups

In [None]:
def get_filename(strings_list, fold, train=True,
                 data_dir='../../data/cross_val_data/'):
    """Return the name of the single file in `data_dir` that matches a fold.

    A file matches when its name contains ``fold<fold>`` that is not followed
    by another digit (so fold 1 does not pick up fold 10) and also contains
    every substring in `strings_list`.

    Args:
        strings_list: substrings that must all occur in the file name.
        fold: cross-validation fold number.
        train: unused; kept so existing calls keep working.
        data_dir: directory to search.

    Returns:
        The bare file name (no directory part) of the unique match.

    Raises:
        AssertionError: if no file, or more than one file, matches.
    """
    fold_tag = re.compile('fold' + re.escape(str(fold)) + '(?![0-9])')
    potential_files = [f for f in os.listdir(data_dir)
                       if fold_tag.search(f)
                       and all(s in f for s in strings_list)]

    # Raised explicitly instead of via `assert` so the check still runs under
    # `python -O`; the exception type is the one the assert used to produce.
    if not potential_files:
        raise AssertionError(
            "no file name matches for fold {} and {} in {}".format(
                fold, strings_list, data_dir))
    if len(potential_files) > 1:
        raise AssertionError(
            "too many file name matches in {}".format(potential_files))

    return potential_files[0]

In [None]:
get_filename(['methanotroph', 'train', 'filtered', 'ss'], fold=1, train=True)

In [None]:
[1] + [2, 3, 4]

In [None]:
# Build a table with one row per fold that records which files the fold uses:
# train/validation tables for both gene sets plus the gene-name file.
summary = pd.DataFrame()

for fold in groups:
    # substrings that every selected file name must contain
    file_specs = ['filtered', 'ss']
    #print('fold: {}'.format(fold))
    row = {}
    row['k'] = fold
    row['m filename, train'] = get_filename(['methanotroph', 'train'] + file_specs, fold)
    row['m filename, val'] = get_filename(['methanotroph', 'val'] + file_specs, fold)
    row['nmm filename, train'] = get_filename(['methylotroph', 'train'] + file_specs, fold)
    row['nmm filename, val'] = get_filename(['methylotroph', 'val'] + file_specs, fold)
    row['gene names'] = get_filename(['methylotroph', 'genes'] + file_specs, fold)
    
    # make a dataframe row
    # (each value is wrapped in a one-element list so from_dict yields one row)
    row = {k:[v] for k, v in row.items()}
    df = pd.DataFrame.from_dict(row)
    #print(df)
    # The concat does not renumber the index, so every row of `summary`
    # ends up with index label 0.
    summary = pd.concat([summary, df], axis=0)
summary

### Run sparse CCA on each of these training sets

Start with a demo of one set:

In [None]:
summary.head(1)

In [None]:
# Directory handed to the CCA runs as their u/v output location; create it
# when nothing exists at that path yet.
uv_dir = './crossvalidation_u_and_v_vectors/'
uv_dir_exists = os.path.exists(uv_dir)
if not uv_dir_exists:
    os.mkdir(uv_dir)

In [None]:
os.listdir(uv_dir)

In [None]:
def run_CCA(summary_df, groups, pen_x=0.04, pen_z=0.03, verbose=False):
    """Build one ExpressionCCA object per cross-validation fold.

    Reads the module-level `raw_data_path` (input directory) and `uv_dir`
    (u/v output directory).

    Args:
        summary_df: table with one row per fold. Column 'k' holds the fold
            number; the 'm filename, ...', 'nmm filename, ...' and
            'gene names' columns hold file names inside `raw_data_path`.
        groups: fold numbers to process.
        pen_x: passed to ExpressionCCA as `penalty_x`.
        pen_z: passed to ExpressionCCA as `penalty_z`.
        verbose: passed to ExpressionCCA as `verbose`.

    Returns:
        dict mapping each fold number to its ExpressionCCA object.

    Raises:
        AssertionError: if a fold does not have exactly one row in
            `summary_df`.
    """
    results = {}  # make a dict of CCA objects
    for group in groups:
        row = summary_df[summary_df['k'] == group]
        assert row.shape[0] == 1

        def prepare_filepath(column):
            # Positional access: the single row's index label need not be 0
            # (it is only 0 when the table was built without renumbering).
            filename = row[column].iloc[0]
            return os.path.join(raw_data_path, filename)

        # todo: remove expected filename if it exists.
        cca = ExpressionCCA(x_train_filename=prepare_filepath('m filename, train'),
                            z_train_filename=prepare_filepath('nmm filename, train'),
                            x_val_filename=prepare_filepath('m filename, val'),
                            z_val_filename=prepare_filepath('nmm filename, val'),
                            gene_filepath=prepare_filepath('gene names'),
                            input_filepath=raw_data_path,
                            u_v_output_dir=uv_dir,
                            penalty_x=pen_x,  # leads to thousands of nonzero weights in X-val
                            penalty_z=pen_z,  # leads to thousands of nonzero weights in X-val
                            verbose=verbose,
                            path_to_R_script='../../code/sparse_CCA.R')

        results[group] = cca
    return results
    
# run_R doesn't seem to control anything!!
results = run_CCA(summary, groups, verbose=True)

In [None]:
results

In [None]:
# For every fold: refresh the model's summary, print it, and call
# plot_projections with a per-fold PDF file name.
for fold_number, model in results.items():
    print("--- {} ---".format(fold_number))
    model.summarise()
    print(model.summary)
    plot_name = 'projection_for_set_{}.pdf'.format(fold_number)
    model.plot_projections(filename=plot_name)

In [None]:
results[1].summary

In [None]:
def prep_for_pandas(mydict):
    """Return a new dict with every value of `mydict` wrapped in a one-element list."""
    wrapped = {}
    for key, value in mydict.items():
        wrapped[key] = [value]
    return wrapped

In [None]:
prep_for_pandas(results[1].summary)

In [None]:
pd.DataFrame.from_dict(prep_for_pandas(results[1].summary))

In [None]:
d1 = pd.DataFrame.from_dict(prep_for_pandas(results[1].summary))
d2 = pd.DataFrame.from_dict(prep_for_pandas(results[2].summary))

In [None]:
pd.concat([d1, d2])

In [None]:
def prep_df_row(mydict):
    """Turn a flat dict into a one-row DataFrame with one column per key."""
    columns = {}
    for key, value in mydict.items():
        # one-element list per key, so from_dict produces exactly one row
        columns[key] = [value]
    return pd.DataFrame.from_dict(columns)

In [None]:
prep_df_row(results[2].summary)

In [None]:
def combine_info(results, verbose=False):
    """Stack the summaries of all models in `results` into one DataFrame.

    Each model contributes one row made from `model.get_summary()`, extended
    with the columns 'fold #' (its key in `results`), 'pen_x' and 'pen_z'
    (its `penalty_x` / `penalty_z` attributes).

    Args:
        results: dict mapping fold number to a model object.
        verbose: when true, print every row as it is built.

    Returns:
        DataFrame with one row per model; empty when `results` is empty.
    """
    combined = pd.DataFrame()

    for fold_number, cca_model in results.items():
        fold_row = prep_df_row(cca_model.get_summary())
        fold_row['fold #'] = fold_number
        fold_row['pen_x'] = cca_model.penalty_x
        fold_row['pen_z'] = cca_model.penalty_z

        if verbose:
            print("df_row: {}".format(fold_row))
        combined = pd.concat([combined, fold_row])

    return combined

In [None]:
results2 = run_CCA(summary, groups, pen_x=0.01, pen_z=0.01)

In [None]:
# Same per-fold printout for the second penalty setting; plotting stays
# switched off here.
for fold_number, model in results2.items():
    print("--- {} ---".format(fold_number))
    model.summarise()
    print(model.summary)
    #model.plot_projections(filename='projection_for_set_{}.pdf'.format(fold_number))

In [None]:
# `i1` is concatenated with `i2` in the next cell but is never assigned
# anywhere in this notebook. Presumably it was the summary of the first run
# (`results`), so build it here to let the notebook run top to bottom.
i1 = combine_info(results)
i2 = combine_info(results2)

In [None]:
pd.concat([i1, i2])

## Hyperparameter tuning for 4-fold cross-val

In [None]:
# Candidate penalties for the grid search: five evenly spaced values per
# side, starting at 0 and stopping short of the upper bound (endpoint=False).
n_penalties = 5
penx_vals = np.linspace(0, 0.04, num=n_penalties, endpoint=False)
penz_vals = np.linspace(0, 0.03, num=n_penalties, endpoint=False)

In [None]:
print('pen_x values:'+ str(penx_vals))
print('pen_z values:'+ str(penz_vals))

In [None]:
# Grid search: run the per-fold CCA for every (pen_x, pen_z) pair and stack
# the summaries of all runs into one table. Note that `results` is rebound
# on every iteration.
mega_mega_df = pd.DataFrame()

for penx in penx_vals:
    for penz in penz_vals:
        progress = "Computing results for penx = " + str(penx) + " and penz = " + str(penz) + "..."
        print(progress)
        results = run_CCA(summary, groups, pen_x=penx, pen_z=penz)
        run_info = combine_info(results, verbose=True)
        mega_mega_df = pd.concat([mega_mega_df, run_info])
        

In [None]:
mega_mega_df

In [None]:
# Split the grid-search table into one frame per fold.
fold_numbers = mega_mega_df['fold #']
fold1_df = mega_mega_df.loc[fold_numbers == 1]
fold2_df = mega_mega_df.loc[fold_numbers == 2]
fold3_df = mega_mega_df.loc[fold_numbers == 3]
fold4_df = mega_mega_df.loc[fold_numbers == 4]

In [None]:
# Nonzero u-weight counts against pen_x, one marker colour per fold.
for fold_df, marker, fold_label in [(fold1_df, 'bo', "Fold 1"),
                                    (fold2_df, 'ro', "Fold 2"),
                                    (fold3_df, 'go', "Fold 3"),
                                    (fold4_df, 'yo', "Fold 4")]:
    plt.plot(fold_df['pen_x'], fold_df['# nonzero u weights'], marker, label=fold_label)
plt.ylabel("Number of Nonzero U weights")
plt.xlabel("Regularization penalty on U")
plt.legend(loc='best')

In [None]:
# Nonzero v-weight counts against pen_z, one marker colour per fold.
for fold_df, marker, fold_label in [(fold1_df, 'bo', "Fold 1"),
                                    (fold2_df, 'ro', "Fold 2"),
                                    (fold3_df, 'go', "Fold 3"),
                                    (fold4_df, 'yo', "Fold 4")]:
    plt.plot(fold_df['pen_z'], fold_df['# nonzero v weights'], marker, label=fold_label)
# plt.ylim([0,11000])
plt.ylabel("Number of Nonzero V weights")
plt.xlabel("Regularization penalty on Z")
plt.legend(loc='best')

In [None]:
fold4_df