# Modeling Workflow

In [1]:
%matplotlib inline
import tellurium as te
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import lmfit
import util

IndentationError: unindent does not match any outer indentation level (util.py, line 108)

## Review of Model Fitting

In [None]:
from numpy import exp, sin

# Independent variable: 100 evenly spaced points covering [0, 10].
x = np.linspace(0, 10, num=100)
# Observed values to fit: a cosine of amplitude 3 evaluated on x.
data = 3 * np.cos(x)

def residual(params, x, data):
    """
    Difference between the observations and a damped sine model.

    :param Parameters params: must provide 'amp', 'phase', 'frequency', 'decay'
    :param array x: independent variable(s)
    :param array data: observed values to fit to
    :return array: data minus the model evaluated at x
    """
    amplitude, phase, frequency, decay_rate = (
        params[key] for key in ('amp', 'phase', 'frequency', 'decay')
    )
    oscillation = sin(x*frequency + phase)
    envelope = exp(-x*x*decay_rate)
    return data - amplitude * oscillation * envelope

 
# Initial guesses for the four parameters that residual() reads.
params = lmfit.Parameters()
params.add('amp', value=10)
params.add('decay', value=0.007)
params.add('phase', value=0.2)
params.add('frequency', value=3.0)

# `args` carries the extra positional arguments (x, data) that are handed to
# residual() after the Parameters object. The original call was cut off
# mid-argument-list, which made the whole cell a syntax error.
out = lmfit.minimize(residual, params, args=(x, data))

In [None]:
import lmfit
import numpy
    
# Synthetic data: a damped sine wave generated from known parameter values.
# np.sin / np.exp are used so this cell does not rely on names imported by
# an earlier cell.
x = np.linspace(0, 10, 100)
AMP = 3.0
FREQ = 1.0
PHASE = 0.0
DECAY = 0.1
data = AMP*np.sin(x*FREQ + PHASE)*np.exp(-x*x*DECAY)

# Model calculation
def calculate(x, amp, phase, freq, decay):
    """
    Evaluate a damped sine wave: amp * sin(x*freq + phase) * exp(-x*x*decay).

    :param array x: independent variable(s)
    :param float amp: amplitude multiplying the sine term
    :param float phase: phase added inside the sine
    :param float freq: factor multiplying x inside the sine
    :param float decay: coefficient of x*x in the exponential envelope
    :return array: model values at x
    """
    return amp*np.sin(x*freq + phase)*np.exp(-x*x*decay)


# Fitting
# Starting guesses, keyed by the argument names of calculate().
initial_guesses = {'amp': 10, 'freq': 3, 'phase': .2, 'decay': 1.0}
params = lmfit.Parameters()
for param_name, guess in initial_guesses.items():
    params.add(param_name, value=guess)
#
model = lmfit.Model(calculate)
fitter = model.fit(data, params, x=x)
fitter.params


In [None]:
# Fitted value of the 'amp' parameter.
amp_param = fitter.params.get('amp')
amp_param.value

In [None]:
# Evaluate the model at the fitted parameter values. The values are passed to
# calculate() at full precision; the previous exec() of "%3.4f"-formatted
# assignments rounded them to four decimals and created stray globals.
fitted_values = {name: param.value for name, param in fitter.params.items()}
fit = calculate(x, **fitted_values)
plt.scatter(data, fit)
plt.xlabel("Observed")
plt.ylabel("Fitted");

## Fitting a Simulation of One Species Concentration
1. Make simulation into a function
1. Create observations
1. Create parameters
1. Fit the model
1. Generate fitted simulation data
1. Compare the fit with the data

In [None]:
# Simulation start time, stop time and number of output points
START = 0
STOP = 50
NUM_POINTS = 100


def simulate(v0=10, ka=0.4, kb=0.32, kc=0.4):
    """
    Load the three-species model, override its constants, and simulate it.
    :param float v0: value assigned to the model's v0
    :param float ka: value assigned to the model's ka
    :param float kb: value assigned to the model's kb
    :param float kc: value assigned to the model's kc
    :return array-float: [B]
    """
    antimony = """
    model test
        species A, B, C;

        J0: -> A; v0
        A -> B; ka*A;
        B -> C; kb*B;
        J1: C ->; C*kc
        A = 0
        B = 0
        C = 0
        v0 = 10
        ka = 0.4
        kb = 0.8*ka
        kc = ka

    end
    """
    runner = te.loada(antimony)
    # Replace the constants declared in the model text with the caller's values.
    for constant, value in (("v0", v0), ("ka", ka), ("kb", kb), ("kc", kc)):
        setattr(runner, constant, value)
    return runner.simulate(START, STOP, NUM_POINTS)["[B]"]

In [None]:
STD = 1.0
SEED = 0  # fixed so the noisy observations are identical on every run
np.random.seed(SEED)
# Observations: [B] simulated at the default parameter values plus Gaussian
# noise with standard deviation STD.
data = simulate() + np.random.normal(0, STD, NUM_POINTS)

In [None]:
# Create the parameters
# Starting guesses, keyed by the keyword arguments of simulate().
starting_values = {'v0': 10, 'ka': .1, 'kb': .2, 'kc': 1.0}
params = lmfit.Parameters()
for param_name, guess in starting_values.items():
    params.add(param_name, value=guess)

In [None]:
# Do the fit
# simulate() only takes the parameters being fitted, so the Model is built with
# an empty independent_vars list and fit() is called without an x argument.
model = lmfit.Model(simulate, independent_vars=[])  # Specifying no independent variables
fitter = model.fit(data, params) 
# Last expression of the cell: display the parameters held by the fit result.
fitter.params

In [None]:
# Generate fitted data by running the simulation with fitted parameters
# Map each parameter name to its fitted value and pass them as keyword arguments.
kwargs = {name: fitter.params.get(name).value for name in fitter.params}
fit = simulate(**kwargs)
fit

## Fit Multiple Concentrations

In [None]:
def fitModel(modelstr, params, train, weights=None):
    """
    Build an lmfit Minimizer whose objective is computed from a simulation.

    :param str modelstr: Model
    :param lmfit.Parameters params: parameters handed to util.simulate
    :param array-int train: indices in training data
    :param dict weights: key is response variable; value is weight in residuals
        (variables without an entry get weight 1.0)
    :return lmfit.minimizer.Minimizer: minimizer built on the residual function;
        the caller runs it with .minimize()
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if weights is None:
        weights = {}

    def calcResiduals(params):
        results = util.simulate(modelstr, params)
        # Float accumulator: an integer array cannot take float additions in place.
        residuals = np.zeros(len(train), dtype=float)
        # The first column is skipped; presumably it is 'time' -- confirm
        # against util.simulate.
        for name in results.colnames[1:]:
            multiplier = weights.get(name, 1.0)
            residuals += multiplier*results[name][train]
        # NOTE(review): no observed data is subtracted here, so this is a
        # weighted sum of simulated values rather than a true residual --
        # confirm how observations are meant to be supplied.
        return residuals

    return lmfit.Minimizer(calcResiduals, params)

In [None]:
# create data to be fitted: a damped sine wave plus Gaussian noise
x = np.linspace(0, 15, 301)
signal = 5. * np.sin(2*x - 0.1) * np.exp(-x*x*0.025)
noise = np.random.normal(size=len(x), scale=0.2)
data = signal + noise


# define objective function: returns the array to be minimized
def fcn2min(params, x, data):
    """Evaluate the decaying sine model at x and return model minus data."""
    amplitude = params['amp']
    phase_shift = params['shift']
    angular_freq = params['omega']
    decay_rate = params['decay']
    envelope = np.exp(-x*x*decay_rate)
    return amplitude * np.sin(x*angular_freq + phase_shift) * envelope - data


# Parameters with starting values; 'amp' is bounded below by 0 and 'shift'
# is confined to [-pi/2, pi/2]
half_pi = np.pi / 2
params = lmfit.Parameters()
params.add('amp', value=10, min=0)
params.add('decay', value=0.1)
params.add('shift', value=0.0, min=-half_pi, max=half_pi)
params.add('omega', value=3.0)

# run the minimization with the Minimizer's default method
minner = lmfit.Minimizer(fcn2min, params, fcn_args=(x, data))
result = minner.minimize()

# fcn2min returns model - data, so adding the data back recovers the fitted model
final = result.residual + data

# print the fit report
lmfit.report_fit(result)

# Plot the data (black crosses) and the fitted curve (red line).
# matplotlib.pyplot is already imported as plt at the top of the notebook, so
# no local import or ImportError guard is needed; a missing plot should fail
# loudly rather than be skipped silently.
plt.plot(x, data, 'k+')
plt.plot(x, final, 'r')
plt.show()

In [None]:
# Display the Minimizer object built for the decaying-sine fit.
minner

Notes
1. This approach can be adapted to fit multiple response variables.
1. Parameters provide a way of restricting the values considered for the fit (e.g., a value must be positive).

## Cross-Validation Code

In [None]:
def foldGenerator(num_points, num_folds):
    """
    Generate the train/test index split for each cross-validation fold.

    Fold k tests on every index i with i % num_folds == k and trains on the
    remaining indices. Both arrays are integer arrays in ascending order, even
    when a fold is empty, so they can always be used for indexing.

    :param int num_points: number of data points to split
    :param int num_folds: number of folds to generate
    :return iterator of (array, array): training indices, test indices
    """
    indices = np.arange(num_points)
    for remainder in range(num_folds):
        # Boolean mask marking the points held out for this fold.
        is_test = (indices % num_folds) == remainder
        yield indices[~is_test], indices[is_test]
#
# Show the (train, test) index pair of each fold for 10 points and 5 folds.
generator = foldGenerator(10, 5)
for fold in generator:
    print(fold)

## Set up data

In [None]:
# Detailed simulation model

# Use matplotlib as tellurium's default plotting engine.
te.setDefaultPlottingEngine('matplotlib')
# Model text: source J0 feeds A, A converts to B, B to C, and J1 removes C.
# kb and kc are declared in terms of ka.
model = """
model test
    species A, B, C;

    J0: -> A; v0
    A -> B; ka*A;
    B -> C; kb*B;
    J1: C ->; C*kc
    ka = 0.4;
    v0 = 10
    kb = 0.8*ka
    kc = ka

end
"""
# Load the model text and simulate with arguments (0, 50, 100); `r` and
# `result` are reused by the cells below.
r = te.loada(model)
result = r.simulate(0, 50, 100)

In [None]:
# Set ka on the loaded model to 0.3, then display the value the model holds.
# NOTE(review): `result` was computed before this change -- confirm whether a
# re-simulation is intended.
r.setValue("ka", 0.3)
r.ka

In [None]:
# COLUMN_NAMES is not assigned anywhere else in this notebook, so derive it
# from the simulation result: every column after the first.
# NOTE(review): assumes the first column of `result` is 'time' -- confirm.
COLUMN_NAMES = list(result.colnames[1:])
for col in COLUMN_NAMES:
    plt.plot(result['time'], result[col])
plt.xlabel("Time")
plt.ylabel("Concentration")
plt.legend(COLUMN_NAMES);

In [None]:
# Set-up the data
# Uses `result` from the simulation cell above; the previous call to
# getSimulationData() referred to a function that is not defined in this
# notebook.
# NOTE(review): confirm that the earlier simulation (before ka was changed)
# is the intended data source.
STD = 5
length = len(result)
# Predictor: simulation time as a column vector.
XV = result['time']
XV = XV.reshape(length, 1)
# Gaussian observation noise with standard deviation STD.
ERRORS = np.array(np.random.normal(0, STD, length))
# Response: [B] without noise (YV_PURE) and with noise (YV), as column vectors.
YV_PURE = result['[B]']
YV = YV_PURE + ERRORS
YV_PURE = YV_PURE.reshape(length, 1)
YV = YV.reshape(length, 1)

## Cross Validation

In [None]:
# Does a polynomial regression of the specified order
def buildMatrix(xv, order):
    """
    Build the design matrix for a polynomial regression.

    Returns a plain 2-D ndarray rather than np.matrix, which NumPy no longer
    recommends and which recent scikit-learn estimators refuse as input.

    :param array-of-float xv: predictor values; flattened to one dimension
    :param int order: highest power of xv to include
    :return array: len(xv) rows and order+1 columns; column n holds xv**n
    """
    xv = np.asarray(xv).reshape(len(xv))
    # increasing=True orders the columns as xv**0, xv**1, ..., xv**order.
    return np.vander(xv, order + 1, increasing=True)

def regress(xv, yv, train, test, order=1):
    """
    Fit a polynomial regression on the training rows and score it on the test rows.

    :param array-of-float xv: predictor values
    :param array-of-float yv: response values
    :param array-of-int train: indices of training data
    :param array-of-int test: indices of test data
    :param int order: Order of the polynomial regression
    :return float, array-float, array-float, array-float:
        R2, y_test, y_preds, fitted coefficients
    """
    regr = linear_model.LinearRegression()
    # Train the model using the training sets
    # np.asarray keeps the design matrix a plain ndarray for scikit-learn.
    mat_train = np.asarray(buildMatrix(xv[train], order))
    regr.fit(mat_train, yv[train])
    # Evaluate on the arguments passed in (xv, yv), not on the notebook-level
    # XV / YV, so calls with other data (e.g. bootstrap samples) are scored
    # against their own responses.
    mat_test = np.asarray(buildMatrix(xv[test], order))
    y_pred = regr.predict(mat_test)
    rsq = r2_score(yv[test], y_pred)
    return rsq, yv[test], y_pred, regr.coef_

In [None]:
# Cross-validate a cubic fit over 4 folds. The number of points comes from the
# data itself so the fold indices always match XV / YV.
generator = foldGenerator(len(XV), 4)
for train, test in generator:
    rsq, yv_test, yv_pred, coef_ = regress(XV, YV, train, test, order=3)
    plt.figure()
    # Blue: predictions on the held-out points; red: the observed values.
    plt.scatter(test, yv_pred, color = 'b')
    plt.scatter(test, yv_test, color = 'r')
    plt.title("RSQ: %2.4f" % rsq)

## Bootstrapping

In [None]:
# Compute residuals of a cubic fit that is trained and evaluated on all points
train = test = range(len(XV))
rsq, yv_test, yv_pred, _ = regress(XV, YV, train, test, order=3)
residuals = yv_test - yv_pred
plt.scatter(test, residuals)
_ = plt.title("%2.4f" % rsq)

In [None]:
# Generate synthetic data from residuals
def generateData(y_obs, y_fit):
    """
    Create one bootstrap data set by resampling the fit residuals.

    Residuals (y_obs - y_fit) are drawn with replacement and added back to the
    fitted values. Both inputs are flattened first, so column vectors of shape
    (n, 1) work as well as 1-D arrays; without the flattening, adding a 1-D
    residual sample to an (n, 1) fit broadcasts to (n, n).

    :param np.array y_obs: observed values
    :param np.array y_fit: fitted values
    :return np.array: bootstrap data, one-dimensional
    """
    y_obs = np.asarray(y_obs).reshape(-1)
    y_fit = np.asarray(y_fit).reshape(-1)
    residuals = y_obs - y_fit
    length = len(y_obs)
    # Indices of the residuals drawn (with replacement) for each point.
    samples = np.random.randint(0, length, length)
    return y_fit + residuals[samples]

In [None]:
# Small worked example: print four bootstrap samples for a 3-point data set.
y_obs = np.array([1, 2, 3])
y_fit = np.array([.9, 2.4, 3.2])
for _ in range(4):
    sample = generateData(y_obs, y_fit)
    print (sample)

In [None]:
# Fit a cubic on all points, then plot bootstrap data against the observations.
train = range(len(XV))
rsq, yv_test, yv_pred, _ = regress(XV, YV, train, train, order=3)
synthetic = generateData(YV, yv_pred)
plt.scatter(YV, synthetic)
plt.title("Original")
# Four more bootstrap data sets, one figure each.
for _ in range(4):
    plt.figure()
    synthetic = generateData(YV, yv_pred)
    plt.scatter(YV, synthetic)

In [None]:
# Estimate the parameters for each random data set
train = range(len(XV))
# Fitted values of the cubic regression on the observed data.
_, _, y_fit, _ = regress(XV, YV, train, train, order=3)
coefs = []
for _ in range(10):
    # Refit on a bootstrap data set and keep the coefficients
    # (the fourth value returned by regress).
    yv = generateData(YV, y_fit)
    coefs.append(regress(XV, yv, train, train, order=3)[3])
coefs