# Documentation
<br>
This notebook is the first step in a workflow that deals with Bradford Protein Quantification 96x well plate reader data:

1. Exploratory data analysis.
2. Cleaning the data to remove absorbance values that are outside the linear range of the instrument
3. Parsing and exporting both the calibrant and sample data into the processed_data_files directory preparatory to downstream analysis

Alex Perkins 16th November 2021
a.j.p.perkins@sms.ed.ac.uk

### Environment Preparation ---------------------------------------------------------------

### Import Packages

In [1]:
import pandas as pd
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import pymc3 as pm
from pymc3 import HalfCauchy, Model, Normal, glm, plot_posterior_predictive_glm, sample

print(f"Running on PyMC3 v{pm.__version__}")

import arviz as az
from scipy.stats import norm

# Import curve fitting package from scipy
from sklearn.linear_model import LinearRegression

# import os
import os, sys, shutil

from experiment_specific_config import *

%matplotlib inline



Running on PyMC3 v3.11.5


In [2]:
num_experiment_replicates = len(dilutions_dict[list(dilutions_dict.keys())[0]])
num_calibrant_replicates = len(calibrants_dict[list(calibrants_dict.keys())[0]])
num_experiment_replicates

3

### This cell just deals with reading the data file in the directory

In [3]:
##################################################################

move_file = False

##################################################################
# define error handler
class UnAcceptedValueError(Exception):   
    def __init__(self, data):    
        self.data = data
    def __str__(self):
        return repr(self.data)

####################################################################
# gets all items in directory
items = os.listdir(".")

# lists all .csv
csv_list = []

try:
    for names in items:
        if names.endswith(".CSV") | names.endswith(".csv"):
            csv_list.append(names)

except:
    print("Couldn't find any csv files")

try:
    if(len(csv_list) > 1):
        raise UnAcceptedValueError("More than 1x .CSV file in the directory");
except UnAcceptedValueError as e:
    print ("Received error:", e.data)
    # kills the process
    quit()
##########################################################################################
print(csv_list)



experiment_name = csv_list[0]


#import dataset as dataframe
raw_data = pd.read_csv(csv_list[0], header=None, error_bad_lines=False)

['134709_221009_OT2_BRADFORD.csv']




  raw_data = pd.read_csv(csv_list[0], header=None, error_bad_lines=False)


In [4]:
bradford_index_list =[]

# iterate over the rows
for i,row in enumerate(raw_data.iloc[:,0]):
    
    #if string
    if isinstance(row, str):
        
        #if first 7 characters spell bradford
        if row[:8] == "Bradford":
            # append the row index to the list
            bradford_index_list.append(i)

In [5]:
bradford_index_list

[0, 11, 22, 33]

In [6]:
parsed_data = pd.DataFrame(columns=["Well", "Absorbance", "Measurement"])

for first_index in bradford_index_list:
    
    last_index = first_index + 10
    
    individual_slice = raw_data.iloc[first_index:last_index, :].reset_index(drop=True)
    

    for i, row in individual_slice.iterrows():
        
        
        if i >= 2 and i < 10:
            
            selected_row = individual_slice.iloc[i, :]
            
            plate_letter_row = selected_row[0]
            
            data = selected_row[1:-1].reset_index()
            
            data["index"] = plate_letter_row + data["index"].astype(str)
            data["measurement"] = selected_row.iloc[-1]
            data = data.set_axis(["Well", "Absorbance", "Measurement"], axis=1, copy=False)
            
            parsed_data = pd.concat([parsed_data, data], ignore_index=True)
  

In [7]:
parsed_data

Unnamed: 0,Well,Absorbance,Measurement
0,A1,0.193,Bradford:977 [Test]
1,A2,0.195,Bradford:977 [Test]
2,A3,0.204,Bradford:977 [Test]
3,A4,,Bradford:977 [Test]
4,A5,,Bradford:977 [Test]
...,...,...,...
379,H8,0.593,Bradford:595
380,H9,0.598,Bradford:595
381,H10,0.599,Bradford:595
382,H11,0.644,Bradford:595


In [8]:
          
#######################################################################

print(os.getcwd())
path = "/src/processed_data_files/"
# make directory for sticking the output in
if os.path.isdir(path) == False:
    os.mkdir(path, mode=0o777)
    
    
# now that the dataset has been read in, move it into the processed_data_files file for neatness.
if 1 == move_file:
    shutil.move(csv_list[0], path)

# navigate into the directory for future processed_data_files storage
os.chdir(path)


parsed_data.to_csv("unfiltered_parsed_data.csv", index=False)

/src


In [9]:
#######################################################################

print(os.getcwd())
path = "/src/output/"
# make directory for sticking the output in
if os.path.isdir(path) == False:
    os.mkdir(path, mode=0o777)
    
    
# now that the dataset has been read in, move it into the output file for neatness.
if 1 == move_file:
    shutil.move(csv_list[0], path)

# navigate into the directory for future plot storage
os.chdir(path)

/src/processed_data_files


### End of Enviroment Setup ---------------------------------------------------------------------------

# Looking at the raw calibrant data

## Step 1: Looking at the raw values

In [10]:
# real concs in wells. Stocks in ug/ml diluted by 20x
calibrant_range = list(calibrants_dict.keys())


calibrant_well_list = []
conc_list = []

for conc, wells in calibrants_dict.items():
    calibrant_well_list.extend(wells)
    conc_list.extend([conc]*num_calibrant_replicates)

calibrant_df = parsed_data[parsed_data["Well"].isin(calibrant_well_list) & parsed_data["Measurement"].isin(["Bradford:595"])]


calibrant_df.loc[:, "Concentration"] = conc_list
calibrant_df = calibrant_df.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calibrant_df.loc[:, "Concentration"] = conc_list


In [11]:
#######################################################################

print(os.getcwd())
path = "/src/processed_data_files/"
# make directory for sticking the output in
if os.path.isdir(path) == False:
    os.mkdir(path, mode=0o777)
    

# navigate into the directory for future processed_data_files storage
os.chdir(path)

calibrant_df.to_csv("parsed_calibrant_data.csv", index=False)

/src/output


In [12]:
replicate_wise = pd.DataFrame()

for conc in calibrant_df["Concentration"].unique():
    
    df_s = calibrant_df[calibrant_df["Concentration"] == conc]
    abs_ = df_s["Absorbance"]
    abs_.index = ["Rep1", "Rep2", "Rep3", "Rep4", "Rep5"]
    replicate_wise = pd.concat([replicate_wise, abs_],axis =1)


replicate_wise = replicate_wise.T
replicate_wise["Concentration"] = calibrant_range

replicate_wise = replicate_wise.reset_index(drop=True)
calibrant_df_reps = replicate_wise

## Step 2: Reorganising Calibrant Data

## Step 3: Filter the calibrants and only keep those within the linear range  (0.75 - 0.45)

## Step: 4 Defining the absorbance of each concentration as a gaussian.

This allows us to store the mean concentration and it's error as a function and calculate the probabilities on demand.

Lets assume that the technical error of the absorbance measurements are normally distributed

1. We calculate the mean and the standard deviation for each absorbance triplicate.
2. We define a gaussian sampling function so we can easily sample and return a granular array.

In [13]:

# calculate the mean of each triplicate
calibrant_df_reps["Mean"] = calibrant_df_reps[["Rep1", "Rep2", "Rep3", "Rep4", "Rep5"]].mean(axis=1)
# calculate the variance of each triplicate
calibrant_df_reps["σ"] = calibrant_df_reps[["Rep1", "Rep2", "Rep3", "Rep4", "Rep5"]].iloc[:,:3].std(axis=1)

print(calibrant_df_reps)

calibrants_df_avg = calibrant_df_reps[["Mean", "σ"]]


def sample_gaussian(mu, sigma):
    
    # define the x range: mean - 4*sigma and mean + 4*sigma. increments = mean/1000
    x = np.arange((mu-(4*sigma)),(mu+(4*sigma)), mu/1000)
    
    # use the norm.pdf (probability density function) to sample and return the array.
    return norm.pdf(x, mu, sigma)

# do it for each calibrant. Not currently stored.
for idx, row in calibrants_df_avg.iterrows():
    sample_gaussian(calibrant_df_reps.loc[idx]["Mean"], calibrant_df_reps.loc[idx]["σ"])


    Rep1   Rep2   Rep3   Rep4   Rep5 Concentration    Mean         σ
0  0.459  0.446  0.447  0.466  0.456             0  0.4548  0.007234
1  0.473  0.468  0.475  0.486  0.484            50  0.4772  0.003606
2  0.511  0.506  0.506  0.522  0.514            75  0.5118  0.002887
3  0.521  0.514  0.508  0.534   0.53           100  0.5214  0.006506
4  0.536  0.529  0.515  0.531  0.535           125  0.5292  0.010693
5  0.553  0.556   0.56  0.577  0.561           150  0.5614  0.003512
6  0.568  0.565  0.564  0.576  0.578           175  0.5702  0.002082
7  0.593  0.598  0.599  0.644   0.64           200  0.6148  0.003215


Having got our function, lets plot all the gaussians together and have a wee look.

# Sample Data

# Step 1: Looking at the raw sample data

In [14]:
# real concs in wells. Stocks in ug/ml diluted by 20x
dilutions_range = list(dilutions_dict[list(dilutions_dict.keys())[0]].keys())
print(dilutions_range)

# extract nested dict for first protein mix

for i, name in enumerate(list(dilutions_dict.keys())):
    
    protein_mix = dilutions_dict[list(dilutions_dict.keys())[i]]
    protein_name = list(dilutions_dict.keys())[i]
    print(name)

    dilutions_well_list = []
    dilutions_list = []

    for dilution, wells in protein_mix.items():
        dilutions_well_list.extend(wells)
        dilutions_list.extend([dilution]*num_experiment_replicates)

    dilutions_df = parsed_data[parsed_data["Well"].isin(dilutions_well_list) & parsed_data["Measurement"].isin(["Bradford:595"])]


    dilutions_df.loc[:, "DilutionX"] = dilutions_list
    dilutions_df.loc[:, "ProteinMix"] = protein_name

    dilutions_df = dilutions_df.reset_index(drop=True)
dilutions_df

['20', '40', '60']
OPP


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dilutions_df.loc[:, "DilutionX"] = dilutions_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dilutions_df.loc[:, "ProteinMix"] = protein_name


Unnamed: 0,Well,Absorbance,Measurement,DilutionX,ProteinMix
0,A1,0.426,Bradford:595,20,OPP
1,A2,0.435,Bradford:595,20,OPP
2,A3,0.447,Bradford:595,20,OPP
3,B1,0.447,Bradford:595,40,OPP
4,B2,0.453,Bradford:595,40,OPP
5,B3,0.442,Bradford:595,40,OPP
6,C1,0.45,Bradford:595,60,OPP
7,C2,0.459,Bradford:595,60,OPP
8,C3,0.452,Bradford:595,60,OPP


In [15]:
#######################################################################

print(os.getcwd())
path = "/src/processed_data_files/"
# make directory for sticking the output in
if os.path.isdir(path) == False:
    os.mkdir(path, mode=0o777)
    

# navigate into the directory for future processed_data_files storage
os.chdir(path)

dilutions_df.to_csv("tidy_sample_data.csv", index=False)

/src/processed_data_files


In [16]:
replicate_wise = pd.DataFrame()

for conc in dilutions_df["DilutionX"].unique():
    
    df_s = dilutions_df[dilutions_df["DilutionX"] == conc]
    abs_ = df_s["Absorbance"]
    abs_.index = ["Rep1", "Rep2", "Rep3"]
    replicate_wise = pd.concat([replicate_wise, abs_],axis =1)

replicate_wise = replicate_wise.T
print(replicate_wise)
print(dilutions_range)
replicate_wise["DilutionX"] = dilutions_range

replicate_wise = replicate_wise.reset_index(drop=True)
dilutions_df_reps = replicate_wise

             Rep1   Rep2   Rep3
Absorbance  0.426  0.435  0.447
Absorbance  0.447  0.453  0.442
Absorbance   0.45  0.459  0.452
['20', '40', '60']


In [17]:
dilutions_df_reps

Unnamed: 0,Rep1,Rep2,Rep3,DilutionX
0,0.426,0.435,0.447,20
1,0.447,0.453,0.442,40
2,0.45,0.459,0.452,60


In [18]:

# calculate the mean of each triplicate
dilutions_df_reps["Mean"] = dilutions_df_reps[["Rep1", "Rep2", "Rep3"]].mean(axis=1)
# calculate the variance of each triplicate
dilutions_df_reps["σ"] = dilutions_df_reps[["Rep1", "Rep2", "Rep3"]].iloc[:,:3].std(axis=1)

print(dilutions_df_reps)

dilutions_df_avg = dilutions_df_reps[["Mean", "σ"]]


def sample_gaussian(mu, sigma):
    
    # define the x range: mean - 4*sigma and mean + 4*sigma. increments = mean/1000
    x = np.arange((mu-(4*sigma)),(mu+(4*sigma)), mu/1000)
    
    # use the norm.pdf (probability density function) to sample and return the array.
    return norm.pdf(x, mu, sigma)

# do it for each calibrant. Not currently stored.
for idx, row in dilutions_df_avg.iterrows():
    sample_gaussian(dilutions_df_reps.loc[idx]["Mean"], dilutions_df_reps.loc[idx]["σ"])


    Rep1   Rep2   Rep3 DilutionX      Mean         σ
0  0.426  0.435  0.447        20  0.436000  0.010536
1  0.447  0.453  0.442        40  0.447333  0.005508
2   0.45  0.459  0.452        60  0.453667  0.004726


# Make directory for sticking the heat maps into

In [19]:
print(os.getcwd())

path = "/src/output/sampleheatmaps/"

# make directory for sticking the output in
if os.path.isdir(path) == False:
    os.mkdir(path, mode=0o777)
    
   

/src/processed_data_files


# Exporting the parsed calibrant and sample data

## Step 1: Check if '/processed_data_files' exists. If not, create it and navigate inside.

In [20]:
print(os.getcwd())
path_processed_data = "/src/processed_data_files/"
# make directory for sticking the processed data in
if os.path.isdir(path_processed_data) == False:
    os.mkdir(path_processed_data, mode=0o777)

# navigate into the path_processed_data directory for data storage
os.chdir(path_processed_data)
print(os.getcwd())


/src/processed_data_files
/src/processed_data_files


## Step 2: Export both calibrant and sample data into processed_data_files as CSV files.

In [21]:
# navigate back into the root directory for neatness
os.chdir("/src/")

## 