## Cross-validation of sparse CCA

Predict the expression of methylotrophy genes from the expression of methanotrophy genes.  Expression was previously pooled across species.

First, run: ../notebooks/prepare_data/prepare_X_m_expression_Y_nmm_expression_Xval.ipynb

That notebook fills /Users/janet/ML_meta-omics/data/cross_val_data, which this notebook reads through the relative path ../../data/cross_val_data/.

In [None]:
import pandas as pd

In [None]:
import matplotlib as mpl
import matplotlib
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import subprocess

In [None]:
# Shell escape: list the contents of the cross-validation data directory.
! ls ../../data/cross_val_data/

In [None]:
import os

In [None]:
# Directory holding the per-fold train/validation tables.
raw_data_path = '../../data/cross_val_data/'
# os.listdir returns entries in arbitrary order; sort so that displays and
# positional lookups such as xval_filenames[0] are reproducible.
xval_filenames = sorted(os.listdir(raw_data_path))

In [None]:
# Show the first listed file name; it is the sample input passed to
# find_Xval_group below.
xval_filenames[0]

In [None]:
def find_Xval_group(string):
    """Extract the cross-validation fold number embedded in a file name.

    The name must contain ``fold<digits>`` with at least one letter or
    underscore directly before ``fold`` and at least one letter, underscore
    or dot directly after the digits (e.g. ``abc_fold3_train.tsv``).

    Args:
        string: File name to inspect.

    Returns:
        The fold digits as a string (e.g. ``'3'``), not an int.

    Raises:
        ValueError: If ``string`` contains no such fold marker.
    """
    # Spell the letter range out: ``A-z`` would also admit the ASCII
    # punctuation that sits between 'Z' and 'a' ([ \ ] ^ `).
    m = re.search(r'[_A-Za-z]+fold([0-9]+)[._A-Za-z]+', string)
    if m is None:
        raise ValueError(
            'no "fold<number>" marker found in file name: {!r}'.format(string))
    return m.group(1)
    

In [None]:
# Try the fold extraction on the first listed file name.
find_Xval_group(xval_filenames[0])

In [None]:
# Unique fold ids found in the file names.  They stay strings (they are
# concatenated into "fold<k>" fragments elsewhere) but are ordered
# numerically, so '10' comes after '9' rather than after '1'.
groups = sorted({find_Xval_group(f) for f in xval_filenames}, key=int)
groups

In [None]:
# Display every file name found in raw_data_path.
xval_filenames

In [None]:
# Ad-hoc check of the substring filter that get_filename uses: file names
# containing both 'methanotroph' and 'fold3_train'.
[f for f in xval_filenames if 'methanotroph' in f and "fold3" + "_train" in f]

In [None]:
def get_filename(string, fold, train=True, filenames=None):
    """Return the one file name that matches a gene set, fold and split.

    A name matches when it contains ``string`` and also contains
    ``"fold" + str(fold) + "_train"`` (or ``"_val"`` when ``train`` is false).

    Args:
        string: Substring identifying the gene set, e.g. ``'methanotroph'``.
        fold: Fold identifier; converted with ``str`` before matching.
        train: Select the ``_train`` file if true, the ``_val`` file otherwise.
        filenames: Names to search.  Defaults to the module-level
            ``xval_filenames`` list.

    Returns:
        The single matching file name.

    Raises:
        ValueError: If zero or more than one name matches.
    """
    if filenames is None:
        filenames = xval_filenames
    suffix = '_train' if train else '_val'
    marker = "fold" + str(fold) + suffix
    filename_candidates = [f for f in filenames
                           if string in f and marker in f]
    # Substring matching can be ambiguous (any extra file that also contains
    # the marker is a hit), so fail loudly and show what was found.
    if len(filename_candidates) != 1:
        raise ValueError(
            'expected exactly one file containing {!r} and {!r}, found {}: {}'
            .format(string, marker, len(filename_candidates),
                    filename_candidates))
    return filename_candidates[0]

In [None]:
# Sanity check: look up the methanotroph training file for fold 3.
get_filename('methanotroph', 3, train=True)

In [None]:
# Sanity check: look up the methylotroph training file for fold 3.
get_filename('methylotroph', 3, train=True)

In [None]:
# Build one summary row per fold.  Each row records the four file names
# (methanotroph = "m", methylotroph = "nmm"; train and validation) and the
# four tables read from them, stored whole as single cells.
fold_frames = []

for fold in groups:
    print('fold: {}'.format(fold))
    row = {'k': fold}
    row['m filename, train'] = get_filename('methanotroph', fold, train=True)
    row['m filename, val'] = get_filename('methanotroph', fold, train=False)
    row['nmm filename, train'] = get_filename('methylotroph', fold, train=True)
    row['nmm filename, val'] = get_filename('methylotroph', fold, train=False)
    row['train x'] = pd.read_csv(
        os.path.join(raw_data_path, row['m filename, train']), sep='\t')
    row['train z'] = pd.read_csv(
        os.path.join(raw_data_path, row['nmm filename, train']), sep='\t')
    row['val x'] = pd.read_csv(
        os.path.join(raw_data_path, row['m filename, val']), sep='\t')
    row['val z'] = pd.read_csv(
        os.path.join(raw_data_path, row['nmm filename, val']), sep='\t')

    # Wrap every value in a one-element list so each becomes a single cell
    # of a one-row frame (this is what lets a whole table sit in one cell).
    row = {k: [v] for k, v in row.items()}
    fold_frames.append(pd.DataFrame.from_dict(row))

# Concatenate once instead of growing a frame inside the loop, which copies
# all earlier rows on every iteration.  The index is deliberately not reset:
# every row keeps label 0, which run_CCA relies on when it indexes with [0].
summary = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame()
summary

### Run sparse CCA on each of these training sets

Start with a demo on a single fold:

In [None]:
def run_CCA(summary, groups, run_R=True, penalty_x=0.04, penalty_z=0.03,
            data_dir=None):
    """Run the sparse-CCA R script per fold and load the u / v tables.

    For each fold the training file names are looked up in ``summary``,
    ``../../code/sparse_CCA.R`` is invoked on them (unless ``run_R`` is
    false), and two tab-separated files named like the inputs but ending in
    ``_train_u.tsv`` / ``_train_v.tsv`` are read back.  Those files are
    presumably written by the R script -- confirm against sparse_CCA.R.

    Args:
        summary: DataFrame with columns ``'k'``, ``'m filename, train'`` and
            ``'nmm filename, train'``.
        groups: Iterable of fold ids matching the values in ``summary['k']``.
            A single id given as a bare string is treated as one fold rather
            than being iterated character by character.
        run_R: If false, skip the Rscript call and only read existing output.
        penalty_x: Third argument passed to the R script (presumably the
            sparsity penalty for x).
        penalty_z: Fourth argument passed to the R script (presumably the
            sparsity penalty for z).
        data_dir: Directory containing the files.  Defaults to the
            module-level ``raw_data_path``.

    Returns:
        Dict mapping each fold id to ``{'u': DataFrame, 'v': DataFrame}``,
        with a column named ``V1`` renamed to ``u`` / ``v`` respectively.

    Raises:
        KeyError: If ``summary`` has no row for a requested fold.
        subprocess.CalledProcessError: If the R script exits non-zero.
    """
    # NOTE(review): the whole-data penalties were recorded as 0.0335 (x) and
    # 0.022 (z); the defaults above are the values the command has been
    # passing.  Pass the whole-data values explicitly if they are intended.
    if isinstance(groups, str):
        groups = [groups]
    if data_dir is None:
        data_dir = raw_data_path

    results = {}
    for group in groups:
        row = summary[summary['k'] == group]
        if row.empty:
            raise KeyError('no row with k == {!r} in summary'.format(group))
        # Positional access: does not depend on the row's index label.
        x_path = os.path.join(data_dir, row['m filename, train'].iloc[0])
        z_path = os.path.join(data_dir, row['nmm filename, train'].iloc[0])
        u_path = x_path.replace('_train.tsv', '_train_u.tsv')
        v_path = z_path.replace('_train.tsv', '_train_v.tsv')

        # TODO: remove the expected output files if they already exist, so a
        # stale result cannot be read back by mistake.

        command = ['Rscript', '../../code/sparse_CCA.R',
                   x_path, z_path, str(penalty_x), str(penalty_z)]
        print('command: \n {}'.format(" ".join(command)))
        if run_R:
            subprocess.check_call(command)

        # get results
        print(u_path)
        u = pd.read_csv(u_path, sep='\t')
        print("shape of u: {}".format(u.shape))
        u.rename(columns={'V1': 'u'}, inplace=True)

        v = pd.read_csv(v_path, sep='\t')
        v.rename(columns={'V1': 'v'}, inplace=True)

        results[group] = {'u': u, 'v': v}
    return results
    
# Demo on the first fold only.  run_CCA iterates over its `groups` argument,
# so pass a one-element list: a bare string would be walked character by
# character (fold '10' would be treated as folds '1' and '0').
results = run_CCA(summary, [groups[0]], run_R=False)

In [None]:
# First rows of the u table loaded for fold '1'.
results['1']['u'].head()

In [None]:
# First rows of the v table loaded for fold '1'.
results['1']['v'].head()

In [None]:
# --- old stuff below ----
# Deliberate stop: this assert always raises AssertionError, so execution
# that reaches this cell halts before the older cells that follow.
assert False

In [None]:
# NOTE(review): abandoned draft -- this cell does not parse as written (see
# the second loop).  The first loop finds, per fold, the single methanotroph
# training file using the same substring filter as get_filename, then resets
# m_train to an empty list.
for fold in groups:
    fold_string = "fold" + fold
    print(fold_string)
    
    # get the methanotrophy filename
    m_filename = [f for f in xval_filenames 
                  if 'methanotroph' in f and fold_string + "_train" in f]
    assert len(m_filename) == 1
    m_filename = m_filename[0]
    m_train = []
# NOTE(review): unfinished loop -- `fold =` has no right-hand side and the
# `if` has no body.  The .asm/.py extension test does not correspond to the
# .tsv files used elsewhere in this notebook; presumably left over from a
# pasted example -- confirm before reviving.
for filename in os.listdir('../../data/cross_val_data/'):
    fold = 
    if filename.endswith(".asm") or filename.endswith(".py"):

In [None]:
# Pick which saved weight tables to load from ./results: the files tagged
# with penalties 0_03-0_02 when only_8_weights is set, the untagged u.csv /
# v.csv otherwise.
# NOTE(review): presumably those penalties leave 8 non-zero weights -- confirm.
only_8_weights = True

u = pd.read_csv('./results/u_penalties_0_03-0_02.csv' if only_8_weights
                else './results/u.csv')
v = pd.read_csv('./results/v_penalties_0_03-0_02.csv' if only_8_weights
                else './results/v.csv')

In [None]:
# First rows of the u table loaded from ./results.
u.head()

In [None]:
# First rows of the v table loaded from ./results.
v.head()

In [None]:
# Load the methanotroph gene names.  The file is read without a header row,
# so pandas assigns integer column labels.
methanotroph_names = pd.read_csv(
    '../../data/m_nmm_expression--sum_by_gene/methanotroph_gene_names.tsv',
    sep='\t',
    header=None,
)
methanotroph_names.head()

In [None]:
# Load the methylotroph gene names.  The file is read without a header row,
# so pandas assigns integer column labels.
methylotroph_names = pd.read_csv(
    '../../data/m_nmm_expression--sum_by_gene/methylotroph_gene_names.tsv',
    sep='\t',
    header=None,
)
methylotroph_names.head()

In [None]:
# Dimensions of u; compare the row count with methanotroph_names.shape, since
# the two tables are concatenated column-wise below.
u.shape

In [None]:
# Dimensions of the methanotroph name table, for comparison with u.shape.
methanotroph_names.shape

In [None]:
# Place the methanotroph gene names and the u table side by side (rows are
# aligned on the index), call the 'V1' column 'weight', and add its absolute
# value for ranking.
# NOTE(review): assumes the rows of u are in the same order as the gene-name
# file -- confirm.
m = pd.concat([methanotroph_names, u], axis=1).rename(columns={'V1': 'weight'})
m['abs(weight)'] = m['weight'].abs()
m.shape

In [None]:
# Place the methylotroph gene names and the v table side by side (rows are
# aligned on the index), call the 'V1' column 'weight', and add its absolute
# value for ranking.
# NOTE(review): assumes the rows of v are in the same order as the gene-name
# file -- confirm.
nmm = pd.concat([methylotroph_names, v], axis=1).rename(columns={'V1': 'weight'})
nmm['abs(weight)'] = nmm['weight'].abs()
nmm.shape

In [None]:
# First rows of the combined methanotroph name/weight table.
m.head()

In [None]:
# Order the methanotroph rows by absolute weight, largest first, and show the
# top ten.
m = m.sort_values(by='abs(weight)', ascending=False)
m.head(10)

In [None]:
# Order the methylotroph rows by absolute weight, largest first, and show the
# top ten.
nmm = nmm.sort_values(by='abs(weight)', ascending=False)
nmm.head(10)