# Notebook A: Data Processsing
This notebook takes experimental data and converts it to a form that is usable for machine learning algorithm training.
This involves smoothing the data using the Savitzky–Golay filter, and adding slope values by comparing consecutive concentration values, and dividing by time elapsed. 


### Setup Imports

In [1]:
import pandas as pd
import numpy as np
import math
from scipy.signal import savgol_filter 

### Import experimental data

In [2]:
exp_df = pd.read_csv('../data/experimental_data.csv')
exp_df.head(10)

Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM),N2,CO,CO2,H2,flow rate (mL/min)
0,1,1,0.0,0.01,0.0,0.23,0.0,0.06,60,20,15.0,5.0,20
1,1,1,0.57,0.45,12.81,10.89,0.03,0.09,60,20,15.0,5.0,20
2,1,1,0.67,0.44,15.59,16.42,0.03,0.13,60,20,15.0,5.0,20
3,1,1,0.9,0.41,17.11,17.75,0.03,0.07,60,20,15.0,5.0,20
4,1,1,1.58,0.41,10.93,21.59,0.04,0.06,60,20,15.0,5.0,20
5,1,1,1.65,0.39,15.86,44.28,0.05,0.07,60,20,15.0,5.0,20
6,1,1,2.02,0.46,8.13,46.16,0.21,0.6,60,20,15.0,5.0,20
7,1,1,2.67,0.49,10.79,46.13,1.18,3.41,60,20,15.0,5.0,20
8,1,1,4.7,0.64,20.31,34.37,8.44,9.16,60,20,15.0,5.0,20
9,1,1,6.06,0.67,28.88,27.06,14.09,8.03,60,20,15.0,5.0,20


### Define function to smooth a single trial of experimental data using a second order Savitzky-Golay filter

In [3]:
# set up savgol filter to have a window length of 15, polynomial order of 2, 
# and to output the smoothed values.
savgol = lambda x: savgol_filter(x, 15, 2, deriv=0)

def smooth_single_trial(trial_df, delta):
    # get list of time values
    times = list(trial_df.time)

    # get smoothed time values
    max_time = times[-1]
    new_times = np.arange(0,max_time, delta)  

    # temporarily set time as the index of the dataframe
    trial_df.set_index('time', inplace=True, drop=True)

    # set the index of the df to the new times
    trial_df = trial_df.reindex(trial_df.index.union(new_times))

    # find the linear interpolation for all the new data points
    trial_df = trial_df.interpolate()

    # remove experimental time points, unless they are an exact multiple of delta
    times_to_remove = set(times) - (set(times) & set(new_times))
    trial_df = trial_df.loc[~trial_df.index.isin(times_to_remove)]

    # convert linear interpolation to smoothed polynomial fit
    trial_df = trial_df.apply(savgol)
    
    # round negative smoothed values to 0
    trial_df.clip(lower=0, inplace=True)
    
    # Prevent floating point errors later
    trial_df.composition = round(trial_df.composition, 0)
    trial_df.trial = round(trial_df.trial, 0)

    # remove time's status as index column
    trial_df.insert(2, 'time',trial_df.index)
    trial_df.reset_index(inplace=True, drop=True)
    
    return trial_df

### Apply smoothing function to each trial of experimental data

In [4]:
# This parameter tunes the distance between
delta = 0.1

smoothed_df = pd.DataFrame()

# Loop through compositions
for composition in set(exp_df.composition):
    
    # For each composition, loop over trials
    composition_df = exp_df[exp_df.composition == composition]
    for trial in set(composition_df.trial):
        # isolate the data for a single trial
        trial_df = exp_df[(exp_df.composition == composition) & (exp_df.trial == trial)]
        
        # get the smooth data for that trial
        smooth_trial_df = smooth_single_trial(trial_df, delta)
        
        # add the trial to smoothed_df
        smoothed_df = pd.concat([smoothed_df, smooth_trial_df])

smoothed_df.head(10)

Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM),N2,CO,CO2,H2,flow rate (mL/min)
0,1.0,1.0,0.0,0.013426,0.0,0.0,0.000492,0.058332,60.0,20.0,15.0,5.0,20.0
1,1.0,1.0,0.1,0.097327,1.917572,1.660755,0.005883,0.066873,60.0,20.0,15.0,5.0,20.0
2,1.0,1.0,0.2,0.172225,4.989251,4.31316,0.010819,0.074034,60.0,20.0,15.0,5.0,20.0
3,1.0,1.0,0.3,0.238122,7.685937,6.765444,0.015301,0.079814,60.0,20.0,15.0,5.0,20.0
4,1.0,1.0,0.4,0.295016,10.007629,9.017607,0.019327,0.084213,60.0,20.0,15.0,5.0,20.0
5,1.0,1.0,0.5,0.342908,11.954328,11.069648,0.022898,0.087231,60.0,20.0,15.0,5.0,20.0
6,1.0,1.0,0.6,0.381798,13.526034,12.921569,0.026014,0.088869,60.0,20.0,15.0,5.0,20.0
7,1.0,1.0,0.7,0.411686,14.722747,14.573368,0.028676,0.089125,60.0,20.0,15.0,5.0,20.0
8,1.0,1.0,0.8,0.426219,15.684459,16.189576,0.030462,0.086824,60.0,20.0,15.0,5.0,20.0
9,1.0,1.0,0.9,0.432018,16.00152,16.742923,0.031439,0.082842,60.0,20.0,15.0,5.0,20.0


### Define function to add production rates to a single trial

In [5]:
def add_rates_to_trial(trial_df, delta):
    
    trial_rates_df = trial_df.copy()

    biomass_rates = []
    ethanol_rates = []
    acetate_rates = []
    butanol_rates = []
    butyrate_rates = []
    
    
    # loop over rows of dataframe
    for index, row in trial_df.iterrows():
        # if it is the first row, don't calculate slope
        if row.time == 0:
            biomass_rates.append(0)
            ethanol_rates.append(0)
            acetate_rates.append(0)
            butanol_rates.append(0)
            butyrate_rates.append(0)
        # find diffence between current and previous value divide by the delta value to get production rate
        else:
            previous_row = trial_df[trial_df.index == index-1]
            biomass_rates.append(float((row['biomass (g/L)'] - previous_row['biomass (g/L)'])/delta))
            ethanol_rates.append(float((row['ethanol (mM)'] - previous_row['ethanol (mM)'])/delta))
            acetate_rates.append(float((row['acetate (mM)'] - previous_row['acetate (mM)'])/delta))
            butanol_rates.append(float((row['butanol (mM)'] - previous_row['butanol (mM)'])/delta))
            butyrate_rates.append(float((row['butyrate (mM)'] - previous_row['butyrate (mM)'])/delta))

    # add production rates to the data frame
    trial_rates_df['biomass rate'] = biomass_rates
    trial_rates_df['ethanol rate'] = ethanol_rates
    trial_rates_df['acetate rate'] = acetate_rates
    trial_rates_df['butanol rate'] = butanol_rates
    trial_rates_df['butyrate rate'] = butyrate_rates

    return trial_rates_df

### Apply function to every trial

In [6]:
rates_df = pd.DataFrame()

# Loop through compositions
for composition in set(smoothed_df.composition):
    
    # For each composition, loop over trials
    composition_df = smoothed_df[smoothed_df.composition == composition]
    for trial in set(composition_df.trial):
        
        # isolate the data for a single trial
        trial_df = smoothed_df[(smoothed_df.composition == composition) & (smoothed_df.trial == trial)]
        
        # get the rates data for that trial
        trial_rates_df = add_rates_to_trial(trial_df, delta)
        
        # add the trial to rates_df
        rates_df = pd.concat([rates_df, trial_rates_df])

rates_df.head(10)

Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM),N2,CO,CO2,H2,flow rate (mL/min),biomass rate,ethanol rate,acetate rate,butanol rate,butyrate rate
0,1.0,1.0,0.0,0.013426,0.0,0.0,0.000492,0.058332,60.0,20.0,15.0,5.0,20.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.1,0.097327,1.917572,1.660755,0.005883,0.066873,60.0,20.0,15.0,5.0,20.0,0.839006,19.175716,16.60755,0.053914,0.085414
2,1.0,1.0,0.2,0.172225,4.989251,4.31316,0.010819,0.074034,60.0,20.0,15.0,5.0,20.0,0.748985,30.716792,26.524049,0.049363,0.071606
3,1.0,1.0,0.3,0.238122,7.685937,6.765444,0.015301,0.079814,60.0,20.0,15.0,5.0,20.0,0.658964,26.966859,24.522838,0.044813,0.057799
4,1.0,1.0,0.4,0.295016,10.007629,9.017607,0.019327,0.084213,60.0,20.0,15.0,5.0,20.0,0.568943,23.216926,22.521627,0.040263,0.043991
5,1.0,1.0,0.5,0.342908,11.954328,11.069648,0.022898,0.087231,60.0,20.0,15.0,5.0,20.0,0.478922,19.466992,20.520417,0.035712,0.030183
6,1.0,1.0,0.6,0.381798,13.526034,12.921569,0.026014,0.088869,60.0,20.0,15.0,5.0,20.0,0.388901,15.717059,18.519206,0.031162,0.016375
7,1.0,1.0,0.7,0.411686,14.722747,14.573368,0.028676,0.089125,60.0,20.0,15.0,5.0,20.0,0.29888,11.967126,16.517995,0.026612,0.002567
8,1.0,1.0,0.8,0.426219,15.684459,16.189576,0.030462,0.086824,60.0,20.0,15.0,5.0,20.0,0.145324,9.617115,16.162079,0.01786,-0.023019
9,1.0,1.0,0.9,0.432018,16.00152,16.742923,0.031439,0.082842,60.0,20.0,15.0,5.0,20.0,0.057994,3.170618,5.533469,0.009774,-0.039819


### Remove data from the first 24 hrs of fermentation (glucose was present)

In [7]:
# remove all time points before one day has passed
rates_df = rates_df[rates_df['time'] >= 1]

# subtract one day from all times
rates_df['time'] = rates_df['time'] - 1

rates_df.head(10)

Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM),N2,CO,CO2,H2,flow rate (mL/min),biomass rate,ethanol rate,acetate rate,butanol rate,butyrate rate
10,1.0,1.0,0.0,0.429529,15.871137,16.89354,0.029863,0.070178,60.0,20.0,15.0,5.0,20.0,-0.02489,-1.303834,1.506172,-0.015756,-0.126633
11,1.0,1.0,0.1,0.421056,15.583697,17.988494,0.028287,0.0561,60.0,20.0,15.0,5.0,20.0,-0.084727,-2.874395,10.949538,-0.015765,-0.140778
12,1.0,1.0,0.2,0.410826,15.121595,19.816681,0.028325,0.046269,60.0,20.0,15.0,5.0,20.0,-0.102302,-4.62102,18.281864,0.00038,-0.098317
13,1.0,1.0,0.3,0.403602,14.500537,22.179671,0.031348,0.04543,60.0,20.0,15.0,5.0,20.0,-0.072244,-6.210585,23.629906,0.030235,-0.008389
14,1.0,1.0,0.4,0.403688,13.935178,25.274862,0.028703,0.031849,60.0,20.0,15.0,5.0,20.0,0.000866,-5.65359,30.951909,-0.026458,-0.135805
15,1.0,1.0,0.5,0.40624,13.302534,28.983542,0.030988,0.037619,60.0,20.0,15.0,5.0,20.0,0.02552,-6.32644,37.086802,0.022851,0.057692
16,1.0,1.0,0.6,0.410874,12.629887,32.875284,0.043472,0.075515,60.0,20.0,15.0,5.0,20.0,0.046335,-6.726471,38.917421,0.124842,0.378959
17,1.0,1.0,0.7,0.416723,12.043783,36.718888,0.070169,0.155912,60.0,20.0,15.0,5.0,20.0,0.058487,-5.861041,38.436037,0.266975,0.803975
18,1.0,1.0,0.8,0.424176,11.479518,40.301776,0.114038,0.287944,60.0,20.0,15.0,5.0,20.0,0.074531,-5.642644,35.828875,0.438688,1.320317
19,1.0,1.0,0.9,0.432878,10.959046,43.408944,0.176694,0.475785,60.0,20.0,15.0,5.0,20.0,0.08702,-5.204725,31.071681,0.626561,1.878416


### Save rates data as csv

In [8]:
rates_df.to_csv(f'../data/rates_data.csv', index=False)