In [5]:
import sys
# Put the project checkout on the module search path; presumably this is what
# lets the `src.utils.*` imports in the next cell resolve.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will
# not run on another machine without editing it.
sys.path.append('/Users/leah/Columbia/courses/19summer/microbialdynamics')

In [69]:
import os
import pickle
import numpy as np
from scipy.special import logsumexp

import matplotlib.pyplot as plt
import seaborn as sns

import scipy

from src.utils.data_loader import load_data
from src.utils.data_interpolation import interpolate_data
from src.utils.available_data import DATA_DIR_DICT, PERCENTAGE_DATA_TYPE, COUNT_DATA_TYPE

In [84]:
# x holds additive log-ratio (ALR) coordinates: x_i = log(p_i / p_last)
def x_to_p(x):
    """
    Map additive log-ratio coordinates back to proportions.

    The last component of p is the reference: p_last = 1 / (1 + sum_j exp(x_j))
    and p_i = p_last * exp(x_i), so every row of p sums to 1.

    x: (T, Dx) array of log-ratios relative to the last component
    return: p: (T, Dx+1) array of proportions

    Computed as a max-shifted softmax over [x, 0] so that large |x| cannot
    overflow np.exp (the naive formula gives 0 * inf = nan in that case).
    """
    x = np.asarray(x)
    T, Dx = x.shape

    # The reference component has log-ratio log(p_last / p_last) = 0.
    logits = np.concatenate((x, np.zeros((T, 1), dtype=x.dtype)), axis=1)  # (T, Dx+1)

    # Subtracting the row maximum leaves the softmax unchanged but guarantees
    # the largest exponent is exp(0) = 1.
    logits = logits - np.max(logits, axis=-1, keepdims=True)
    weights = np.exp(logits)
    p = weights / np.sum(weights, axis=-1, keepdims=True)

    assert p.shape == (T, Dx+1)
    return p

In [162]:
def compute_mse(yhat, ytrue):
    """
    Summarise one sequence's prediction error and the spread of its targets.

    yhat:  (T, Dy) predictions
    ytrue: (T, Dy) targets
    return: (time, mse, mean, var)
        time: number of rows in yhat
        mse:  () squared error summed over every entry (a sum, not an average)
        mean: (Dy, ) column means of ytrue
        var:  (Dy, ) squared deviations of ytrue from its column means,
              summed over rows (not divided by the row count)
    """
    n_steps = yhat.shape[0]

    residual = yhat - ytrue
    sq_err_total = np.sum(np.square(residual))

    col_mean = np.mean(ytrue, axis=0)  # (Dy,)
    centered = ytrue - col_mean
    sq_dev_total = np.sum(np.square(centered), axis=0)  # (Dy, )

    return n_steps, sq_err_total, col_mean, sq_dev_total
    

In [166]:
def compute_Rsq(mse_and_stuff, Dy):
    """
    Pool per-sequence statistics from compute_mse into one R^2 score.

    mse_and_stuff: iterable of (time, mse, mean, var) tuples, where
        time: number of time steps in the sequence
        mse:  squared error summed over all entries of the sequence
        mean: (Dy, ) per-dimension mean of the targets
        var:  (Dy, ) per-dimension sum of squared deviations from `mean`
    Dy: number of observation dimensions
    return: R_square: (1, ) array, 1 - (total squared error) / (total sum of squares)

    Sequences may have different lengths; zero-length sequences are skipped.
    """
    combined_MSE = np.zeros((1, ))             # squared error summed across all sequences
    combined_y_means = np.zeros((Dy, ))        # running mean of the targets seen so far
    combined_y_vars = np.zeros((Dy, ))         # running sum of squared deviations
    n_seen = 0                                 # time steps pooled so far

    for time, mse, mean, var in mse_and_stuff:
        if time == 0:
            # An empty sequence carries no information and would give 0/0 below.
            continue

        combined_MSE += mse

        # Weight the running statistics by the number of steps actually seen,
        # not by (current length * index), so ragged sequences pool correctly.
        n1 = n_seen
        n2 = time

        combined_y_means_new = (n1 * combined_y_means + n2 * mean) / (n1 + n2)

        # Pairwise update of the sum of squares: each group's own spread plus
        # the shift of each group's mean to the new pooled mean.
        combined_y_vars = (
            combined_y_vars
            + var
            + n1 * (combined_y_means - combined_y_means_new)**2
            + n2 * (mean - combined_y_means_new)**2
        )

        combined_y_means = combined_y_means_new
        n_seen += time

    # The error term is summed over all Dy dimensions, so the total sum of
    # squares must be summed over them too (averaging would make a mean
    # predictor score 1 - Dy instead of 0).
    total_sum_of_squares = np.sum(combined_y_vars, axis=0)
    R_square = 1 - combined_MSE / total_sum_of_squares

    return R_square
    

In [167]:
def compute_0step_Rsq(datatype,
                      general_data_dir="/Users/leah/Columbia/courses/19summer/microbialdynamics/",
                      Dx=10,
                      Dy=11):
    """
    Compute three R^2 scores between the proportions implied by the training
    hidden states and the observed training proportions.

    datatype: key into DATA_DIR_DICT selecting the dataset
    general_data_dir: root directory that the DATA_DIR_DICT entry is joined onto
        NOTE(review): the default is a machine-specific absolute path.
    Dx: hidden-state dimension passed to load_data
    Dy: observation dimension passed to compute_Rsq (x_to_p yields Dx+1 columns)
    return: (percentage_Rsq, logp_Rsq, a_Rsq)
        percentage_Rsq: R^2 on the proportions themselves
        logp_Rsq:       R^2 on log proportions
        a_Rsq:          R^2 on log proportions centred by their per-row mean
    """
    data_dir = os.path.join(general_data_dir, DATA_DIR_DICT[datatype])

    # Only the training hidden states and observations are used below.
    (hidden_train, _hidden_test,
     obs_train, _obs_test,
     _input_train, _input_test,
     _extra_inputs_train, _extra_inputs_test) = load_data(data_dir, Dx, False, training_sample_idx=None)

    # Drop the first observation column; presumably it is the time stamp -- TODO confirm.
    ytrue = [obs[:, 1:] for obs in obs_train]
    if datatype in COUNT_DATA_TYPE:
        # Turn each row of counts into proportions that sum to 1.
        ytrue = [y / np.sum(y, axis=-1, keepdims=True) for y in ytrue]

    def _pooled_Rsq(yhat_list, ytrue_list):
        # Per-sequence error statistics, pooled into one R^2.
        return compute_Rsq(list(map(compute_mse, yhat_list, ytrue_list)), Dy)

    # percentage Rsq
    percentages = list(map(x_to_p, hidden_train))
    percentage_Rsq = _pooled_Rsq(percentages, ytrue)

    # log percentage Rsq
    # NOTE(review): np.log returns -inf for any proportion that is exactly 0,
    # which would propagate inf/nan into the two scores below.
    log_percentages = [np.log(p) for p in percentages]
    log_ytrue = [np.log(y) for y in ytrue]
    logp_Rsq = _pooled_Rsq(log_percentages, log_ytrue)

    # aitchison distance Rsq: subtract each row's mean log proportion
    a_hat = [log_p - np.mean(log_p, axis=-1, keepdims=True) for log_p in log_percentages]
    a_true = [log_y - np.mean(log_y, axis=-1, keepdims=True) for log_y in log_ytrue]
    a_Rsq = _pooled_Rsq(a_hat, a_true)

    return percentage_Rsq, logp_Rsq, a_Rsq
    

In [168]:
# Smoke test: run the 0-step R^2 computation on the "clv_percentage_Dx_10" dataset.
# The three scores are only bound to names here; nothing is displayed.
p_Rsq, logp_Rsq, a_Rsq = compute_0step_Rsq("clv_percentage_Dx_10")

  
  from ipykernel import kernelapp as app
