# Consumption and growth rates
The goal of this notebook is to calculate experimental growth rate and experimental substrate uptake rate<br>


<br>
Input: time vs. substrate data and time vs. biomass data
<br>
Output: the substrate consumption rate in units of $\frac{\text{mmol substrate consumed}}{\text{g biomass} \cdot \text{hr}}$ and the growth rate in units of hr$^{-1}$.

### Setup

In [None]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import math
from scipy.stats import linregress
from scipy import mean
import pymc3 as pm

GRAMS_BIOMASS_PER_LITER_PER_OD = 0.35 # 1 OD = 0.35 g/L of biomass

### pymc3 exercises

In [None]:
# Synthetic observations for exercise 1: 10 draws from a normal distribution
# with mean 0.5 and standard deviation 0.4 (unseeded, so they differ per run).
obs_y = np.random.normal(0.5, 0.4, 10)

In [None]:
# Exercise 1: infer the mean and standard deviation of obs_y.
with pm.Model() as excercise1:
    # Priors: half-normal on the standard deviation (parameterised by precision
    # tau) and a standard normal on the mean.
    stdev = pm.HalfNormal('stdev', tau=1)
    mu = pm.Normal('mu', mu=0.0, sd=1)
    
    # Likelihood: obs_y is modelled as Normal(mu, stdev).
    y = pm.Normal('y', mu=mu, sd=stdev, observed=obs_y)
    
    # Sample the posterior (draws=1000, other sampler settings left at defaults).
    trace = pm.sample(1000)
    
    # Plot the sampled values of both parameters.
    pm.traceplot(trace, ['mu', 'stdev'])
    plt.show()
    

### Linear regression Exercise 2

In [None]:
# Number of synthetic observations.
N = 10000

# Synthetic data: obs_y = 0.65 * X + Gaussian noise (sd 0.1), so the slope to
# recover is 0.65 and the intercept to recover is 0.
noise = np.random.normal(0.0, 0.1, N)
X = np.random.normal(1.0, 0.1, N)
obs_y = (0.65 * X) + noise

with pm.Model() as exercise2:
    
    # Priors on the noise scale, the intercept and the slope ('beta').
    stdev = pm.HalfNormal('stdev', sd=1.)
    intercept = pm.Normal('intercept', mu=0.0, sd=2.)
    coeff = pm.Normal('beta', mu=0.5, sd=2.)
    
    # X enters the model here: the mean of the likelihood below is built from X,
    # so no separate "input" argument is needed on the observed variable.
    expected_value = (X * coeff) + intercept
    y = pm.Normal('y', mu=expected_value, sd=stdev, observed=obs_y)
    
    # Sample the posterior with draws=1000 and tune=4000.
    trace = pm.sample(1000, tune=4000)
    
    pm.traceplot(trace, ['beta', 'stdev', 'intercept'])
    plt.show()

How to predict data

In [None]:
with exercise2:
    # Posterior predictive check.
    # `pm.sample_ppc` was renamed to `pm.sample_posterior_predictive` and the old
    # name was removed in later PyMC3 releases, so prefer the new name and fall
    # back to the old one only on installs that predate the rename.
    sample_posterior = getattr(pm, 'sample_posterior_predictive', None) or pm.sample_ppc
    ppc = sample_posterior(trace, samples=1000)
    y_preds = ppc['y']

    # y_preds has shape 1000 by 10000: for each of the 1000 posterior samples,
    # one predicted value for each of the 10000 observations.
    print(y_preds.shape)

    # Average over the posterior samples (axis 0) to get one expected prediction
    # per observation; the reshape flattens the result to a 1-D vector of
    # length 10000.
    expected_y_pred = np.reshape(np.mean(y_preds, axis=0), [-1])

    # Predicted means (green) over the noisy observations (translucent blue).
    plt.scatter(X, expected_y_pred, c='g')
    plt.scatter(X, obs_y, c='b', alpha=0.1)
    plt.title('Relationship between X and predicted Y')

### Exercise 3: Logistic regression

### Yoneda et al. growth curves
Load OD and substrate concentration curves

In [None]:
# Yoneda set 2: optical density (OD) and metabolomics (substrate) time series.
od_filename = '../../EDD_Yoneda_data/Yoneda_set2_ODD_data.csv'
substrate_filename = '../../EDD_Yoneda_data/Yoneda_set2_Metabolomics_data.csv'

od_df = pd.read_csv(od_filename)
sub_df = pd.read_csv(substrate_filename)

# Convert to biomass concentration using the fixed g/L-per-OD factor.
# Assumes the 'Value' column of the OD file holds OD readings -- confirm.
od_df['Biomass Conc'] = GRAMS_BIOMASS_PER_LITER_PER_OD*od_df['Value']

# Row counts, as a quick check that both files loaded.
print(f'substrate data has {len(sub_df)} lines')
print(f'OD data has {len(od_df)} lines')

### Define function that takes in a single trial and returns growth rate and substrate yield

In [None]:
def stats_for_trial(growth_data, substrate_data, molar_mass, display=False):
    """Fit growth rate, biomass yield and substrate uptake rate for one trial.

    Biomass is modelled as exponential growth, X(t) = X0 * exp(mu * t), where
    mu is the slope of log(biomass) vs. time and X0 is the first biomass
    measurement. Substrate is modelled as S(t) = S0 - (X(t) - X0) / yield.
    A two-panel figure comparing the measurements with the fitted curves is
    always drawn.

    Args:
        growth_data: table with 'Time' and 'Biomass Conc' columns.
        substrate_data: table with 'Time' and 'Value' columns. 'Value' is
            multiplied by 1000 / molar_mass (presumably g/L -> mmol/L;
            confirm against the data source).
        molar_mass: molar mass of the substrate used in that conversion.
        display: if True, print the three statistics and return None;
            otherwise return them without printing.

    Returns:
        (growth_rate, yield_coeff, substrate_consumption_rate) when `display`
        is False, otherwise None.

    Raises:
        ValueError: if any biomass value is zero or negative, because the
            logarithm needed for the growth-rate fit is undefined.
    """
    biomass_times = np.asarray(growth_data['Time'], dtype=float)
    biomass_values = np.asarray(growth_data['Biomass Conc'], dtype=float)
    substrate_times = np.asarray(substrate_data['Time'], dtype=float)
    substrate_values = np.asarray(substrate_data['Value'], dtype=float) * 1000 / molar_mass

    if np.any(biomass_values <= 0):
        raise ValueError('biomass values must be positive to fit log(biomass) vs. time')

    biomass_init = biomass_values[0]
    substrate_init = substrate_values[0]

    # growth is the slope of log(biomass) vs. time
    growth_rate = linregress(biomass_times, np.log(biomass_values)).slope

    # biomass X = X0*e^(μ*t)
    # Evaluated at the substrate sampling times so that every substrate
    # measurement is paired with a biomass value, even when OD and substrate
    # were not sampled at the same time points.
    biomass_sim = biomass_init * np.exp(growth_rate * substrate_times)

    # actual consumption = S0 - S
    sub_consumed = substrate_init - substrate_values

    # new biomass = X - X0
    biomass_sim_growth = biomass_sim - biomass_init

    # yield is the amount of biomass that can be made from a mmol of substrate
    yield_coeff = linregress(sub_consumed, biomass_sim_growth).slope

    # S = S0 - (1/yield)*(X - X0)
    substrate_sim = substrate_init - (1 / yield_coeff) * biomass_sim_growth

    # units work out to mmol substrate consumed / (g biomass * hr)
    substrate_consumption_rate = (1 / yield_coeff) * growth_rate

    # NOTE: the substrate panel is labelled "Phenol" regardless of which
    # substrate the caller is analysing.
    fig, (biomass_ax, substrate_ax) = plt.subplots(nrows=1, ncols=2, figsize=(8, 5))
    biomass_ax.plot(biomass_times, biomass_values, 'o', color='black')
    biomass_ax.plot(substrate_times, biomass_sim, '-', color='black')
    substrate_ax.plot(substrate_times, substrate_values, 'o', color='blue')
    substrate_ax.plot(substrate_times, substrate_sim, '-', color='blue')
    biomass_ax.set_title('Biomass growth')
    substrate_ax.set_title('Phenol consumption')
    biomass_ax.set_xlabel('Time (hr)')
    substrate_ax.set_xlabel('Time (hr)')
    biomass_ax.set_ylabel('Biomass (g/L)')
    substrate_ax.set_ylabel('Phenol (mmol/L)')
    fig.tight_layout()

    if display:
        print(f'growth_rate = {growth_rate:.3f} hr-1')
        print(f'yield coefficient = {yield_coeff:.3f} g biomass / mmol substrate')
        print(f'substrate consumption rate = {substrate_consumption_rate:.3f} mmol substrate/gram biomass * hr')
    else:
        return growth_rate, yield_coeff, substrate_consumption_rate

### Define function to calculate growth_rate, yield coefficient, and substrate consumption for a given condition
Takes three trial names and the substrate's molar mass, and prints the mean ± standard deviation of the growth rate, yield coefficient, and substrate consumption rate across the three trials.

In [None]:
def stats_for_condtion(od_df, sub_df, trial_1, trial_2, trial_3, molar_mass, max_time=0):
    """Print mean ± standard deviation of the fitted statistics over three trials.

    Args:
        od_df: OD table with 'Time', 'Line Name' and the columns that
            `stats_for_trial` reads from its growth data.
        sub_df: substrate table with 'Time', 'Line Name' and the columns that
            `stats_for_trial` reads from its substrate data.
        trial_1, trial_2, trial_3: 'Line Name' values of the three replicates.
        molar_mass: substrate molar mass, forwarded to `stats_for_trial`.
        max_time: if non-zero, only rows with Time < max_time are used;
            0 (the default) keeps every row.

    Returns:
        None. The summary is printed, and `stats_for_trial` is called once per
        trial.
    """
    use_all_times = max_time == 0
    if not use_all_times:
        od_df = od_df[od_df['Time'] < max_time]
        sub_df = sub_df[sub_df['Time'] < max_time]

    # Select each replicate's rows from both tables before fitting anything.
    per_trial_tables = [
        (od_df[od_df['Line Name'] == line_name], sub_df[sub_df['Line Name'] == line_name])
        for line_name in (trial_1, trial_2, trial_3)
    ]

    growth_rates = []
    yield_coeffs = []
    uptake_rates = []
    for trial_od, trial_sub in per_trial_tables:
        growth_rate, yield_coeff, uptake_rate = stats_for_trial(trial_od, trial_sub, molar_mass)
        growth_rates.append(growth_rate)
        yield_coeffs.append(yield_coeff)
        uptake_rates.append(uptake_rate)

    # np.std is called with its default arguments here.
    print(f'growth_rate = {np.average(growth_rates):.3f} ± {np.std(growth_rates):.3f} hr-1')
    print(f'yield coefficient = {np.average(yield_coeffs):.3f} ± {np.std(yield_coeffs):.3f} g biomass / mmol substrate')
    print(f'substrate consumption rate = {np.average(uptake_rates):.3f} ± {np.std(uptake_rates):.3f} mmol substrate/gram biomass * hr')

### Stats for WT phenol condition

In [None]:
# Wild type on phenol: three replicates, 94.11 g/mol is the molar mass of
# phenol. max_time is left at its default, so all time points are used.
stats_for_condtion(od_df, sub_df, 'WT-R1', 'WT-R2', 'WT-R3', 94.11)

### Stats for Evol33 phenol condition

In [None]:
# Evol33 on phenol: three replicates, using only rows with Time < 60.
stats_for_condtion(od_df, sub_df, 'EVOL33-R1', 'EVOL33-R2', 'EVOL33-R3', 94.11, max_time=60)

### Stats for Evol40 phenol condition

In [None]:
# Evol40 on phenol: three replicates, using only rows with Time < 60.
stats_for_condtion(od_df, sub_df, 'EVOL40-R1', 'EVOL40-R2', 'EVOL40-R3', 94.11,  max_time=60)

### Glucose data looks wrong. 
Why is the starting concentration ~0.05 g/L? This is much too low to be reasonable.

### Stats for WT glucose condition

In [None]:
# Single wild-type glucose trial: select its rows from both tables.
od_1 = od_df[od_df['Line Name'] == 'WT-G']
sub_1 = sub_df[sub_df['Line Name'] == 'WT-G']

# 180.16 g/mol is the molar mass of glucose; display=True prints the statistics
# instead of returning them.
stats_for_trial(od_1, sub_1, 180.16, display=True)

### Stats for Evol33 glucose condition

In [None]:
# Single Evol33 glucose trial (rebinds od_1 / sub_1 from the previous cell).
od_1 = od_df[od_df['Line Name'] == 'EVOL33-G']
sub_1 = sub_df[sub_df['Line Name'] == 'EVOL33-G']
# 180.16 g/mol is the molar mass of glucose; statistics are printed, not returned.
stats_for_trial(od_1, sub_1, 180.16, display=True)

### Stats for Evol40 glucose condition

In [None]:
# Single Evol40 glucose trial (rebinds od_1 / sub_1 from the previous cell).
od_1 = od_df[od_df['Line Name'] == 'EVOL40-G']
sub_1 = sub_df[sub_df['Line Name'] == 'EVOL40-G']
# 180.16 g/mol is the molar mass of glucose; statistics are printed, not returned.
stats_for_trial(od_1, sub_1, 180.16, display=True)

### Stats for low N WT glucose condition

In [None]:
# Single wild-type glucose trial under low nitrogen (line 'WT-G-N').
od_1 = od_df[od_df['Line Name'] == 'WT-G-N']
sub_1 = sub_df[sub_df['Line Name'] == 'WT-G-N']
# 180.16 g/mol is the molar mass of glucose; statistics are printed, not returned.
stats_for_trial(od_1, sub_1, 180.16, display=True)

### Stats for low N Evol33 glucose condition

In [None]:
# Single Evol33 glucose trial under low nitrogen (line 'EVOL33-G-N').
od_1 = od_df[od_df['Line Name'] == 'EVOL33-G-N']
sub_1 = sub_df[sub_df['Line Name'] == 'EVOL33-G-N']
# 180.16 g/mol is the molar mass of glucose; statistics are printed, not returned.
stats_for_trial(od_1, sub_1, 180.16, display=True)

### Stats for low N Evol40 glucose condition

In [None]:
# Single Evol40 glucose trial under low nitrogen (line 'EVOL40-G-N').
od_1 = od_df[od_df['Line Name'] == 'EVOL40-G-N']
sub_1 = sub_df[sub_df['Line Name'] == 'EVOL40-G-N']
# 180.16 g/mol is the molar mass of glucose; statistics are printed, not returned.
stats_for_trial(od_1, sub_1, 180.16, display=True)

## Henson Data
Load OD and substrate concentration curves

In [None]:
# Henson data set: optical density (OD) and metabolomics (substrate) time series.
# NOTE: this rebinds od_df and sub_df, replacing the previously loaded tables.
od_filename = '../../EDD_Henson_data/Henson_ODD_data.csv'
substrate_filename = '../../EDD_Henson_data/Henson_metabolomics_data.csv'

od_df = pd.read_csv(od_filename)
sub_df = pd.read_csv(substrate_filename)

# Convert to biomass concentration using the fixed g/L-per-OD factor.
# Assumes the 'Value' column of the OD file holds OD readings -- confirm.
od_df['Biomass Conc'] = GRAMS_BIOMASS_PER_LITER_PER_OD*od_df['Value']

# Row counts, as a quick check that both files loaded.
print(f'substrate data has {len(sub_df)} lines')
print(f'OD data has {len(od_df)} lines')

In [None]:
def stats_for_trial(growth_data, substrate_data, molar_mass, display=False, max_time=0):
    """Fit growth rate, biomass yield and substrate uptake rate for one trial.

    Biomass is modelled as exponential growth, X(t) = X0 * exp(mu * t), where
    mu is the slope of log(biomass) vs. time and X0 is the first biomass
    measurement. Substrate is modelled as S(t) = S0 - (X(t) - X0) / yield.
    The biomass model is evaluated at the substrate sampling times so that
    there is a biomass value for every substrate measurement. A two-panel
    figure comparing the measurements with the fitted curves is always drawn.

    Args:
        growth_data: table with 'Time' and 'Biomass Conc' columns.
        substrate_data: table with 'Time' and 'Value' columns. 'Value' is
            multiplied by 1000 / molar_mass (presumably g/L -> mmol/L;
            confirm against the data source).
        molar_mass: molar mass of the substrate used in that conversion.
        display: if True, print the three statistics and return None;
            otherwise return them without printing.
        max_time: if non-zero, only measurements with Time < max_time are
            used; 0 (the default) keeps every measurement.

    Returns:
        (growth_rate, yield_coeff, substrate_consumption_rate) when `display`
        is False, otherwise None.

    Raises:
        ValueError: if any retained biomass value is zero or negative, because
            the logarithm needed for the growth-rate fit is undefined.
    """
    biomass_times = np.asarray(growth_data['Time'], dtype=float)
    biomass_values = np.asarray(growth_data['Biomass Conc'], dtype=float)
    substrate_times = np.asarray(substrate_data['Time'], dtype=float)
    substrate_values = np.asarray(substrate_data['Value'], dtype=float) * 1000 / molar_mass

    # Apply the optional time cut-off to both series independently, since they
    # may have been sampled at different times.
    if max_time != 0:
        keep_biomass = biomass_times < max_time
        biomass_times = biomass_times[keep_biomass]
        biomass_values = biomass_values[keep_biomass]
        keep_substrate = substrate_times < max_time
        substrate_times = substrate_times[keep_substrate]
        substrate_values = substrate_values[keep_substrate]

    if np.any(biomass_values <= 0):
        raise ValueError('biomass values must be positive to fit log(biomass) vs. time')

    biomass_init = biomass_values[0]
    substrate_init = substrate_values[0]

    # growth is the slope of log(biomass) vs. time
    growth_rate = linregress(biomass_times, np.log(biomass_values)).slope

    # biomass X = X0*e^(μ*t), evaluated at the substrate sampling times
    biomass_sim = biomass_init * np.exp(growth_rate * substrate_times)

    # actual consumption = S0 - S
    sub_consumed = substrate_init - substrate_values

    # new biomass = X - X0
    biomass_sim_growth = biomass_sim - biomass_init

    # yield is the amount of biomass that can be made from a mmol of substrate
    yield_coeff = linregress(sub_consumed, biomass_sim_growth).slope

    # S = S0 - (1/yield)*(X - X0)
    substrate_sim = substrate_init - (1 / yield_coeff) * biomass_sim_growth

    # units work out to mmol substrate consumed / (g biomass * hr)
    substrate_consumption_rate = (1 / yield_coeff) * growth_rate

    # NOTE: the substrate panel is labelled "Phenol" regardless of which
    # substrate the caller is analysing.
    fig, (biomass_ax, substrate_ax) = plt.subplots(nrows=1, ncols=2, figsize=(8, 5))
    biomass_ax.plot(biomass_times, biomass_values, 'o', color='black')
    biomass_ax.plot(substrate_times, biomass_sim, '-', color='black')
    substrate_ax.plot(substrate_times, substrate_values, 'o', color='blue')
    substrate_ax.plot(substrate_times, substrate_sim, '-', color='blue')
    biomass_ax.set_title('Biomass growth')
    substrate_ax.set_title('Phenol consumption')
    biomass_ax.set_xlabel('Time (hr)')
    substrate_ax.set_xlabel('Time (hr)')
    biomass_ax.set_ylabel('Biomass (g/L)')
    substrate_ax.set_ylabel('Phenol (mmol/L)')
    fig.tight_layout()

    if display:
        print(f'growth_rate = {growth_rate:.3f} hr-1')
        print(f'yield coefficient = {yield_coeff:.3f} g biomass / mmol substrate')
        print(f'substrate consumption rate = {substrate_consumption_rate:.3f} mmol substrate/gram biomass * hr')
    else:
        return growth_rate, yield_coeff, substrate_consumption_rate

In [None]:
# Henson wild type on phenol: three replicates, 94.11 g/mol is the molar mass
# of phenol, using only rows with Time < 40.
stats_for_condtion(od_df, sub_df,'WT-P-R1', 'WT-P-R2', 'WT-P-R3', 94.11, max_time = 40)

In [None]:
# Henson PVHG6 on phenol: three replicates, 94.11 g/mol is the molar mass of
# phenol, using only rows with Time < 40.
stats_for_condtion(od_df, sub_df, 'PVHG6-P-R1', 'PVHG6-P-R2', 'PVHG6-P-R3', 94.11, max_time=40)