# Calculating mu_max values from Biolector Runs

- All Biolector data must be saved, converted to EDD-compatible file using S. Tan's Collab notebook, then linked to a given EDD study

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_datareader as pdr
import math
import csv

# Import EDD utils to import study as pandas dataframe
from edd_utils import login, export_study

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from datetime import datetime
import seaborn as sns; sns.set(color_codes=True)
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import HBox, Label, Layout ,Button,AppLayout, jslink, IntText, IntSlider
import matplotlib.colors as mcolors
import plotly.offline as pyo
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly
%matplotlib inline

#### Import the study from EDD using credentials

In [3]:
# Create a function which is for entering the URL of EDD study; Taken from EDD Tools
def edd_study_url():
     
#     colors:
    class bcolors:
        HEADER = '\033[95m'
        FAIL = '\033[91m'
        BOLD = '\033[1m'
        
    while True:
        try:
            user_edd_study_url = input("Please enter EDD STUDY URL: ").lower()
            user_edd_study_url = user_edd_study_url.replace(" ","")
            if "/s/" not in user_edd_study_url:
                raise ValueError()
            else:
                break
        except ValueError:
            print(f"{bcolors.HEADER}{bcolors.BOLD}PLEASE TRY AGAIN")
            
            
    url_parts = user_edd_study_url.split("/")    
    s_index = url_parts.index("s")
    edd_server = url_parts[s_index-1]
    study_slug = url_parts[s_index+1]
    final_url_parts = [edd_server, study_slug]

    return final_url_parts

In [4]:
# Create a function which is for entering the URL of EDD study; Taken from EDD Tools
def edd_user_name(
    )->str:

    while True:
        try:
            user_name = input("Please enter EDD User Name: ").lower()
            user_name = user_name.replace(" ","")
            break
        except ValueError:
            print(f"{bcolors.HEADER}{bcolors.BOLD}PLEASE TRY AGAIN")
        
    return user_name

In [5]:
#User input requested
final_url_parts = edd_study_url()

#Media DBTL0: https://edd.jbei.org/s/p-putida-crispri-media-optimization-dbtl0-2ea4/
#KOs P1+P2: https://edd.jbei.org/s/crispri-automation-for-enhanced-isoprenol-pro-e817/
#DBTL0 https://edd.jbei.org/s/crispri-automation-for-enhanced-isoprenol-pro-096d/


Please enter EDD STUDY URL:  https://edd.jbei.org/s/crispri-automation-for-enhanced-isoprenol-pro-096d/


In [6]:
# Parse EDD STUDY URL for edd_utils
edd_server = final_url_parts[0]
study_slug = final_url_parts[1]

user_name = edd_user_name()

Please enter EDD User Name:  carruthers


In [7]:
# Create EDD session
session = login(edd_server=edd_server, user=user_name)

Password for carruthers:  ········


In [8]:
df = export_study(session, study_slug, edd_server=edd_server)

  0%|          | 0/1033140 [00:00<?, ?it/s]

In [9]:
study_name = df["Study Name"][0]
study_name = re.sub('\W+','_', study_name )
study_name
df2=df.copy()

In [10]:
# Including 'Protocol' and 'Units'
excluded_columns = ['Study ID', 'Study Name', 'Study Description', 'Study Contact',
       'Line ID', 'Control', 'Strain(s)',
       'Carbon Source(s)', 'Line Experimenter', 'Line Contact', 'Media',
       'Protocol ID', 'Protocol Name', 'Assay ID', 
       'Measurement Updated','Compartment','Replicate Key','Formal Type']

#'Line Description'

In [11]:
def parse_edd_study(df):
    df = df.loc[:,~df.columns.isin(excluded_columns)]
    return df

df = parse_edd_study(df)
df.tail()

Unnamed: 0,Line Name,Line Description,Protocol,Assay Name,Measurement Type,Units,Value,Hours
1033135,PP_5416-R2,BL10B5_IY2022,Global Proteomics,PP_5416-R2,Probable cysteine desulfurase,counts,1051686.97,48.0
1033136,PP_5416-R3,BL10C5_IY2022,Global Proteomics,PP_5416-R3,Probable cysteine desulfurase,counts,992141.1,48.0
1033137,PP_5420-R1,BL10A4_IY2019,Global Proteomics,PP_5420-R1,Probable cysteine desulfurase,counts,594914.52,48.0
1033138,PP_5420-R2,BL10B4_IY2019,Global Proteomics,PP_5420-R2,Probable cysteine desulfurase,counts,386573.66,48.0
1033139,PP_5420-R3,BL10C4_IY2019,Global Proteomics,PP_5420-R3,Probable cysteine desulfurase,counts,1419732.2,48.0


In [16]:
protocol_type = 'OD600'  # replace with the actual assay name you're filtering for

# Filter and reorder the columns
df_filtered = df.loc[df['Protocol'] == protocol_type, ['Line Name', 'Measurement Type', 'Hours','Value', 'Units']]
df_renamed = df_filtered.rename(columns={
    "Hours":"Time"})
df_renamed.reset_index(drop=True, inplace=True)
df_renamed.head()

Unnamed: 0,Line Name,Measurement Type,Time,Value,Units


### Perform analysis with a rolling window of 60 time points

In [13]:
FILE='DBTL0/DBTL0.1.csv'
df1=df_renamed.copy()

Line_Name = {}
data = []
strains = df1['Line Name'].unique()
size=len(strains)

fig, axs = plt.subplots(int(size/12), 12, figsize=(21, 14),sharey=True, sharex=True) 

nrow = math.ceil(size/4)
ncol = 4

for strain in strains:
    Line_Name[strain]=df1[df1['Line Name']==strain]
    
for k, strain in enumerate(strains,start=1):
    nlog=np.log(Line_Name[strain]['Value'])
    time=Line_Name[strain]['Time']

    df1 = pd.concat([time, nlog], axis=1)
    time_range2=np.arange(0,len(df1['Time']))*5
    
    slopes=[]
    r2s=[]
    pvals=[]
    intercepts=[]
    rolling_window = 60
    
    # Perform rolling linear regression
    for i in range(len(df1) - rolling_window + 1):
        subset = df1.iloc[i:i+rolling_window]
        X = sm.add_constant(subset['Time'])  # Add a constant term for the intercept
        y = subset['Value']
        model = sm.OLS(y, X).fit()
        
        slope = model.params['Time']  # Get the slope coefficient
        slopes.append(slope)
        intercept = model.params['const'] # Get the intercept
        intercepts.append(intercept)
        r2=model.rsquared
        r2s.append(r2)
        pval=model.pvalues
        pvals.append(pval)

    max1=max(slopes)
    umax_pos=(slopes.index(max(slopes)))
    pval=pvals[umax_pos]
    r2=r2s[umax_pos]
    intercept=intercepts[umax_pos]
    y_pred = max1*time+intercept
    
    result=(strain, max1, r2)
    print(result)

    data.append(result)
    
    ax=axs.flatten()[k-1]
    ax.set_title(strain, size=10)
    ax.plot(time_range2, nlog)
    ax.plot(time_range2, y_pred)
    ax.set_ylim([0, 4])
    plt.tight_layout()

ValueError: Number of rows must be a positive integer, not 0

<Figure size 2100x1400 with 0 Axes>

In [None]:
df_output = pd.DataFrame(data)

# Filename for the output CSV file
filename = "240903_KOs_P1+P2.csv"

# Save DataFrame to CSV
df_output.to_csv(filename, index=False)

print(f"Data has been written to {filename}")

In [None]:
df1=df_renamed.copy()
Line_Name = {}
data = []
strains = df1['Line Name'].unique()
size=len(strains)

fig, axs = plt.subplots(int(size/12), 12, figsize=(21, 14),sharey=True, sharex=True) 

nrow = math.ceil(size/4)
ncol = 4
for strain in strains:
    Line_Name[strain]=df1[df1['Line Name']==strain]

for k, strain in enumerate(strains,start=1):
    nlog=np.log(Line_Name[strain]['Value'])
    time=Line_Name[strain]['Time']

    df1 = pd.concat([time, nlog], axis=1)
    time_range2=np.arange(0,len(df1['Time']))
    
    slopes=[]
    r2s=[]
    pvals=[]
    intercepts=[]
    rolling_window = 40
    
    # Perform rolling linear regression
    for i in range(len(df1) - rolling_window + 1):
        subset = df1.iloc[i:i+rolling_window]
        X = sm.add_constant(subset['Time'])  # Add a constant term for the intercept
        y = subset['Value']
        model = sm.OLS(y, X).fit()
        
        slope = model.params['Time']  # Get the slope coefficient
        slopes.append(slope)
        intercept = model.params['const'] # Get the intercept
        intercepts.append(intercept)
        r2=model.rsquared
        r2s.append(r2)
        pval=model.pvalues
        pvals.append(pval)

    max1=max(slopes)
    umax_pos=(slopes.index(max(slopes)))
    pval=pvals[umax_pos]
    r2=r2s[umax_pos]
    intercept=intercepts[umax_pos]
    y_pred = max1*time+intercept

    ax=axs.flatten()[k-1]
    ax.set_title(strain, size=10)
    ax.plot(r2s)
    plt.tight_layout()

In [None]:
df1=df_renamed.copy()
Line_Name = {}
data = []
strains = df1['Line Name'].unique()
size=len(strains)

fig, axs = plt.subplots(int(size/12), 12, figsize=(21, 14),sharey=True, sharex=True) 

for strain in strains:
    Line_Name[strain]=df1[df1['Line Name']==strain]

for k, strain in enumerate(strains,start=1):
    nlog=np.log(Line_Name[strain]['Value'])
    time=Line_Name[strain]['Time']

    df1 = pd.concat([time, nlog], axis=1)
    time_range2=np.arange(0,len(df1['Time']))
    
    slopes=[]
    r2s=[]
    pvals=[]
    intercepts=[]
    rolling_window = 40
    
    # Perform rolling linear regression
    for i in range(len(df1) - rolling_window + 1):
        subset = df1.iloc[i:i+rolling_window]
        X = sm.add_constant(subset['Time'])  # Add a constant term for the intercept
        y = subset['Value']
        model = sm.OLS(y, X).fit()
        
        slope = model.params['Time']  # Get the slope coefficient
        slopes.append(slope)
        intercept = model.params['const'] # Get the intercept
        intercepts.append(intercept)
        r2=model.rsquared
        r2s.append(r2)
        pval=model.pvalues
        pvals.append(pval)

    max1=max(slopes)
    umax_pos=(slopes.index(max(slopes)))
    pval=pvals[umax_pos]
    r2=r2s[umax_pos]
    intercept=intercepts[umax_pos]
    y_pred = max1*time+intercept

    ax=axs.flatten()[k-1]
    ax.set_title(strain, size=10)
    ax.plot(slopes)
    plt.tight_layout()