<a href="https://colab.research.google.com/github/justcme/PerformanceSpecs/blob/main/E_NewCriteriaDataWrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lipid Panel Performance Recommendations

### Data Manipulation - New Criteria

Added proportional bias for each scenario (Scenario 1: max LDL-C; Scenario 2: min LDL-C) and created new columns holding the biased lipid values.

Added columns for the Pooled Cohort Equations (PCE) coefficients, calculated the fixed terms (those that do not involve a lipid value), and summed them.

# SETUP

OS

In [None]:
import os
from google.colab import drive
# Mount Google Drive, then switch into the project's Base folder so the
# relative CSV file names used by the later cells resolve against it.
DRIVE_MOUNT_POINT = '/content/drive/'
BASE_DIR = "/content/drive/My Drive/Colab Notebooks/LipidPerf/from_Pandas/Base"

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(BASE_DIR)

Mounted at /content/drive/


Imports

In [None]:
import numpy as np
import pandas as pd
from scipy.stats.morestats import sqrt
from scipy.stats import norm
import random

# DATA

In [None]:
# Lipid data; the later cells read HDL/TC/TG plus the PCE inputs from it.
df8000 = pd.read_csv("df8000.csv")

# Coefficient lookup table for the PCE calculation.
PCE_coeffs = pd.read_csv("PCE_coeffs.csv")

# New allowable-error and BV lookup table. The first CSV column becomes the
# row index, which the later cells address by 'HDL', 'TC' and 'TG'.
TAE_df = pd.read_csv("TAEnew_df.csv", index_col=0)

# FIXED DATA

Create New Dataframe

In [None]:
# Work on an independent copy so the columns added below never touch df8000.
df_base = df8000.copy(deep=True)

Create Global Variables

In [None]:
# Baseline lipid columns (the "_0" suffix marks the values before any bias is applied).
HDL_0, TC_0, TG_0 = (df_base[lipid] for lipid in ('HDL', 'TC', 'TG'))

# Non-lipid inputs to the PCE calculation, one Series per column of df_base.
age, SBP, BPRx, BPRxn, smoker, DM = (
    df_base[column] for column in ('age', 'SBP', 'BPRx', 'BPRxn', 'smoker', 'DM')
)

# Error terms looked up in TAE_df by row (lipid) and column (error component).
# Only HDL has an SD entry read here; TC and TG use BV, CV and PB only.
# NOTE(review): BV/CV/SD/PB presumably stand for biological variation, coefficient
# of variation, standard deviation and proportional bias — confirm against TAEnew_df.csv.
HDL_BV, HDL_CV, HDL_SD, HDL_PB = (
    TAE_df.loc['HDL', component] for component in ('BV', 'CV', 'SD', 'PB')
)
TC_BV, TC_CV, TC_PB = (TAE_df.loc['TC', component] for component in ('BV', 'CV', 'PB'))
TG_BV, TG_CV, TG_PB = (TAE_df.loc['TG', component] for component in ('BV', 'CV', 'PB'))

Create HDL_CV and PB Columns for each Scenario

In [None]:
                                                                    #### Scenario 0: No Bias

# Helpers shared by every scenario in this cell.

# HDL cut-off (mg/dL per the original note): at or above it the imprecision is
# the constant HDL_CV, below it the fixed HDL_SD is converted to a relative CV.
HDL_CV_CUTOFF = 57


def _hdl_cv(hdl):
    """Return the per-row HDL imprecision as a CV.

    Rows with ``hdl >= HDL_CV_CUTOFF`` get ``HDL_CV``; all other rows get
    ``HDL_SD / hdl``. The result is a NumPy array aligned with ``hdl``.
    """
    return np.where(hdl >= HDL_CV_CUTOFF, HDL_CV, HDL_SD / hdl)


def _with_bias(values, fraction):
    """Scale ``values`` by ``(1 + fraction)`` and cast the result to int.

    NOTE(review): ``astype('int')`` truncates toward zero rather than rounding,
    so e.g. 40.95 becomes 40. Kept as-is to preserve existing results — confirm
    that truncation is intended.
    """
    return ((1 + fraction) * values).astype('int')


# Scenario 0 (no bias): the SD-or-CV choice is based on the unbiased HDL value.
df_base.loc[:, 'HDL_CV0'] = _hdl_cv(HDL_0)



#### Scenario 1: Max LDL-C

# Proportional bias: HDL and TG are biased downward, TC upward.
df_base.loc[:, 'HDL_PB1'] = _with_bias(HDL_0, -HDL_PB)
df_base.loc[:, 'TC_PB1'] = _with_bias(TC_0, TC_PB)
df_base.loc[:, 'TG_PB1'] = _with_bias(TG_0, -TG_PB)
HDL_PB1 = df_base['HDL_PB1']

# The biased HDL value (not the original) decides whether SD or CV applies.
df_base.loc[:, 'HDL_CV1'] = _hdl_cv(HDL_PB1)



#### Scenario 2: Min LDL-C

# Proportional bias: HDL and TG are biased upward, TC downward.
df_base.loc[:, 'HDL_PB2'] = _with_bias(HDL_0, HDL_PB)
df_base.loc[:, 'TC_PB2'] = _with_bias(TC_0, -TC_PB)
df_base.loc[:, 'TG_PB2'] = _with_bias(TG_0, TG_PB)
HDL_PB2 = df_base['HDL_PB2']

# The biased HDL value (not the original) decides whether SD or CV applies.
df_base.loc[:, 'HDL_CV2'] = _hdl_cv(HDL_PB2)


Calculate fixed terms for PCE calculation

In [None]:
### Create PCE variable and coefficient columns

# Every coefficient column copied from PCE_coeffs into df_base.
coeff_names = ['CAge', 'CSqAge', 'CTotalChol', 'CAgeTotalChol','CHDLChol', 'CAgeHDLChol', 'COnHypertensionMeds', 'CAgeOnHypertensionMeds', 'COffHypertensionMeds', 'CAgeOffHypertensionMeds',
          'CSmoker', 'CAgeSmoker', 'CDiabetes', 'S10','MeanTerms']

# Coefficients that do not multiply a lipid value. Only the first nine are paired
# with a term below; 'S10' and 'MeanTerms' are carried as columns but not multiplied here.
fixedcoeffs = ['CAge', 'CSqAge', 'COnHypertensionMeds', 'CAgeOnHypertensionMeds', 'COffHypertensionMeds',
               'CAgeOffHypertensionMeds', 'CSmoker', 'CAgeSmoker', 'CDiabetes', 'S10','MeanTerms']

# Output column names for the fixed terms, in the same order as `variables` below.
fixedterms = ['Age', 'SqAge',  'OnHypertensionMeds', 'AgeOnHypertensionMeds', 'OffHypertensionMeds',
              'AgeOffHypertensionMeds', 'Smoker', 'AgeSmoker', 'Diabetes']

# (race, sex) code pair -> row label of the matching coefficient set in PCE_coeffs.
cohort_rows = {(0, 0): 2, (0, 1): 3, (1, 0): 0, (1, 1): 1}

# The masks depend only on race/sex, so build them once instead of once per coefficient.
cohort_masks = [(df_base['race'] == race_code) & (df_base['sex'] == sex_code)
                for race_code, sex_code in cohort_rows]

# Coefficient columns: each row receives the value for its cohort; rows that
# match no cohort get NaN.
for coeff in coeff_names:
  df_base.loc[:, coeff] = np.select(condlist=cohort_masks,
                                    choicelist=[PCE_coeffs.loc[row, coeff] for row in cohort_rows.values()],
                                    default=np.nan)


# Per-row inputs that the fixed coefficients multiply (natural logs throughout).
variables = [np.log(age), np.log(age)**2, np.log(SBP) * BPRx, np.log(age) * np.log(SBP) * BPRx,
             np.log(SBP) * BPRxn, np.log(age) * np.log(SBP) * BPRxn, smoker, np.log(age) * smoker, DM]

# zip() stops at the shortest input, so guard against a silently dropped term.
assert len(fixedterms) == len(variables), "fixedterms and variables must line up one-to-one"

# Fixed term columns: input value times its cohort-specific coefficient.
for term_name, coeff_name, term_input in zip(fixedterms, fixedcoeffs, variables):
  df_base.loc[:, term_name] = term_input * df_base[coeff_name]

# Sum of the fixed terms. min_count=1 keeps a row whose terms are all NaN
# (e.g. race/sex matched no cohort) as NaN instead of reporting a sum of 0.0.
df_base.loc[:, 'interum_sum'] = df_base.loc[:, fixedterms].sum(axis=1, min_count=1)


df_base.head()

Unnamed: 0,ID,age,race,sex,SBP,BPRx,BPRxn,smoker,DM,HDL,...,Age,SqAge,OnHypertensionMeds,AgeOnHypertensionMeds,OffHypertensionMeds,AgeOffHypertensionMeds,Smoker,AgeSmoker,Diabetes,interum_sum
0,31131,44,1,1,139,1,0,0,0,39,...,64.762621,0.0,144.535676,-120.10464,0.0,-0.0,0.0,0.0,0.0,89.193657
1,31153,44,0,1,121,1,0,0,0,50,...,-112.765067,69.939325,9.682701,0.0,0.0,0.0,0.0,-0.0,0.0,-33.14304
2,31158,71,0,0,146,0,1,1,0,71,...,52.61852,0.0,0.0,0.0,8.791082,0.0,7.837,-7.65151,0.0,61.595092
3,31193,51,0,0,149,0,1,1,0,40,...,48.534456,0.0,0.0,0.0,8.826961,0.0,7.837,-7.057627,0.0,58.14079
4,31205,43,0,1,137,0,1,0,0,43,...,-112.080002,69.092123,0.0,0.0,9.628403,0.0,0.0,-0.0,0.0,-33.359477


# Save base dataframe to .CSV file

In [None]:
# Write the enriched frame to the working directory; the row index is left out of the file.
df_base.to_csv(path_or_buf='df_base_new.csv', index=False)