# ETL: Converting Parameters in Ratelaw file to standalone Table

Parameters in the Ratelaws file are mishandled and because of invalid table structures (according to Relational Table Normalization Rules), we have to use a REGEX parsing mechanism to find and set-values for parameters. 

To alleviate these issues, placing parameters, bound to a reaction Id, in PEtab format, would greatly alleviate this issue. 

### First: Package import and file loading

In [1]:
import os
import re
import pandas as pd

ratelaws_df = pd.read_csv('../data/Ratelaws_updated_names.tsv', sep='\t', header=0)

### Second: Construct New Table By Index Scanning Ratelaws table, update params_table

Formatted with PEtab V1 headers. 

In [2]:
params_table = pd.DataFrame()

for i, row in ratelaws_df.iterrows():

    reactionId = row.iloc[0]
    
    paramids = [f'parameter{i}' for i in range(1, 9)]

    # Only get parameter names that are not NaN
    valid_params = [p for p in paramids if pd.notna(row[p])]

    try: 
        float(row.loc['ratelaw'])
        
        new_param_name = "k" + str(reactionId)[1:] + '_' + str(1)
        value = row.loc['ratelaw']
        new_params_row = {
        'reactionId': str(reactionId), 
        'parameterId': new_param_name, 
        'parameterScale': 'lin', 
        'lowerBound': float(value), 
        'upperBound': float(value), 
        'nominalValue': float(value), 
        'estimate': int(0)
        }

        params_table = pd.concat([params_table, pd.DataFrame([new_params_row])])
        continue

    except:
        for j, column_name in enumerate(valid_params):
        
            # All SPARCED reactions are labeled with 'v' as the initial character, removed via [1:]
            new_param_name = "k" + str(reactionId)[1:] + '_' + str(j+1)

            value = row[column_name]
            
            new_params_row = {
            'reactionId': str(reactionId), 
            'parameterId': new_param_name, 
            'parameterScale': 'lin', 
            'lowerBound': float(value), 
            'upperBound': float(value), 
            'nominalValue': float(value), 
            'estimate': int(0)
            }

            params_table = pd.concat([params_table, pd.DataFrame([new_params_row])])


In [3]:
params_table

Unnamed: 0,reactionId,parameterId,parameterScale,lowerBound,upperBound,nominalValue,estimate
0,vbR,kbR_1,lin,0.004200,0.004200,0.004200,0
0,vbR,kbR_2,lin,0.040000,0.040000,0.040000,0
0,vbR,kbR_3,lin,5.000000,5.000000,5.000000,0
0,vbR,kbR_4,lin,4.860000,4.860000,4.860000,0
0,vdR,kdR_1,lin,0.000002,0.000002,0.000002,0
...,...,...,...,...,...,...,...
0,vTCd145,kTCd145_1,lin,0.000074,0.000074,0.000074,0
0,vTCd146,kTCd146_1,lin,0.000074,0.000074,0.000074,0
0,vTCd147,kTCd147_1,lin,0.000074,0.000074,0.000074,0
0,vTCd148,kTCd148_1,lin,0.000074,0.000074,0.000074,0


### Third: Inspect for duplicates and formatting errors:

In [4]:
params_table.drop_duplicates()

Unnamed: 0,reactionId,parameterId,parameterScale,lowerBound,upperBound,nominalValue,estimate
0,vbR,kbR_1,lin,0.004200,0.004200,0.004200,0
0,vbR,kbR_2,lin,0.040000,0.040000,0.040000,0
0,vbR,kbR_3,lin,5.000000,5.000000,5.000000,0
0,vbR,kbR_4,lin,4.860000,4.860000,4.860000,0
0,vdR,kdR_1,lin,0.000002,0.000002,0.000002,0
...,...,...,...,...,...,...,...
0,vTCd145,kTCd145_1,lin,0.000074,0.000074,0.000074,0
0,vTCd146,kTCd146_1,lin,0.000074,0.000074,0.000074,0
0,vTCd147,kTCd147_1,lin,0.000074,0.000074,0.000074,0
0,vTCd148,kTCd148_1,lin,0.000074,0.000074,0.000074,0


### Fourth: Save to new file

In [5]:
params_table.to_csv('../data/Parameters.tsv', index=False, sep='\t')

### Fifth: Remove parameters columns from Ratelaws and save

In [6]:
ratelaws_df = ratelaws_df.drop(columns=['parameter1', 'parameter2', 'parameter3', 'parameter4',
                                        'parameter5', 'parameter6', 'parameter7', 'parameter8'])


### Sixth: Add ratelaws for first order reactions:

In [None]:
for i, row in ratelaws_df.iterrows():

    reactionId = row.iloc[0]
    # print('reactionId: ', reactionId)

    ratelaw = row.loc['ratelaw']

    try: 
        
        rand_var = float(ratelaw)
            
        # print('ratelaw is pure float:', ratelaw)

        formula = "k" + str(reactionId)[1:] + '_' + str(1)

        rxn = row.get('r ; p', '')

        # print('Reaction Parts: ', rxn)

        reactants_str, products_str = (rxn.split(';') + [''])[:2]

        reactants = [r.strip() for r in reactants_str.split('+') if r.strip()]

        products = [p.strip() for p in products_str.split('+') if p.strip()]

        if reactants != []:
            for reactant in reactants:
                formula = formula + str(f'*{reactant}')

        ratelaws_df.at[i, 'ratelaw'] = formula

    except ValueError:
        # print('cell has string formula:', ratelaw)
        continue

reactionId                         vdR
solver                   Deterministic
compartment                  Cytoplasm
r ; p              cyt_imp_Ribosome_ ;
ratelaw        kdR_1*cyt_imp_Ribosome_
Name: 1, dtype: object
reactionId                        vTLCd1
solver                     Deterministic
compartment                      Nucleus
r ; p                 nuc_prot_i_TP53_ ;
ratelaw        kTLCd1_1*nuc_prot_i_TP53_
Name: 143, dtype: object
reactionId                      vTLCd2
solver                   Deterministic
compartment                    Nucleus
r ; p                 nuc_prot_MDM2_ ;
ratelaw        kTLCd2_1*nuc_prot_MDM2_
Name: 144, dtype: object
reactionId                       vTLCd3
solver                    Deterministic
compartment                     Nucleus
r ; p                 nuc_prot_PPM1D_ ;
ratelaw        kTLCd3_1*nuc_prot_PPM1D_
Name: 145, dtype: object
reactionId                       vTLCd4
solver                    Deterministic
compartment                

In [9]:
ratelaws_df.to_csv('../data/Ratelaws.tsv', index = False, sep = '\t')