# **Lab Results Notebook**

# **Azure notebook Setup**

In [1]:
# Workspace is the handle to an Azure ML workspace. Dataset.Tabular gives access to the
# TabularDatasetFactory methods used to create TabularDataset objects,
# e.g. Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

# Identifiers of the target workspace.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'  # subscription ID (shown in the launch studio)
resource_group = 'VChamp-Team3'                           # resource group name
workspace_name = 'vchamp-team3'                           # workspace name

# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
# Build the workspace handle from the three identifiers above.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Look up the datastore registered on the workspace under the name 'data_team3_synthetic_test'.
datastore = workspace.datastores['data_team3_synthetic_test'] 

In [3]:
# from_delimited_files creates a TabularDataset that represents tabular data in
# delimited files (e.g. CSV and TSV); here it points at 'lab_results_test.csv' in the datastore.

dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'lab_results_test.csv')])

# NOTE(review): dataset.to_pandas_dataframe() converts the whole dataset, not a 3-row preview;
# for a small preview something like dataset.take(3).to_pandas_dataframe() is presumably intended.

In [4]:
# Convert the Azure dataset into a pandas DataFrame, the format the rest of the notebook works with.
# NOTE(review): the variable name says "train" but the file loaded is 'lab_results_test.csv' - confirm the naming is intended.
lab_train_data = dataset.to_pandas_dataframe()

In [5]:
# Display the loaded frame (rendered as a truncated head/tail view).
lab_train_data

Unnamed: 0,Column1,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State
0,0,51658,85.044326,2012-02-26 16:49:42,ly#_k/ul_blood,LY#,1.316318,,1.0 - 3.7,K/uL,blood,Florida
1,1,51658,85.044326,2012-02-26 16:49:42,eo%_%_blood,EO%,1.992369,,0.4 - 7.6,%,blood,Florida
2,2,51658,85.044326,2012-02-26 16:49:42,mch_pg_blood,MCH,30.505044,,25.5 - 33.6,pg,blood,Florida
3,3,51658,85.044326,2012-02-26 16:49:42,mchc_g/dl_no specimen,Mchc,33.261005,,30.9 - 35.1,g/dL,,Florida
4,4,51658,85.044326,2012-02-26 16:49:42,wbc_k/ul_blood,WBC,7.170170,,4.0 - 10.6,K/uL,blood,Florida
...,...,...,...,...,...,...,...,...,...,...,...,...
68356045,328853762,51661,55.489201,2021-09-27 07:16:26,urea nitrogen_mg/dl_plasma,UREA NITROGEN,42.000000,,7.0 - 21,mg/dL,plasma,Michigan
68356046,328853763,51661,55.489201,2021-09-27 07:16:26,potassium_mmol/l_plasma,POTASSIUM,5.136107,,3.5 - 4.7,mmol/L,plasma,Michigan
68356047,328853764,51661,55.489201,2021-09-27 07:16:26,creatinine_mg/dl_plasma,CREATININE,2.339546,,.67 - 1.17,mg/dL,plasma,Michigan
68356048,328853765,51661,55.489201,2021-09-27 07:16:26,anion gap_null_no specimen,Anion gap,9.000000,,10 - 20,,,Michigan


# Importing the libraries

In [7]:
import numpy as np                # Multi-dimensional array object
import pandas as pd               # Data manipulation
import matplotlib.pyplot as plt   # Data visualization
import seaborn as sns             # Data visualization

# Allows the use of display() for DataFrames
from IPython.display import display 

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. SettingWithCopyWarning).
import warnings
warnings.filterwarnings('ignore')

# Write 'max_buffer_size' = 7294967296 into the 'notebook' config section.
# NOTE(review): presumably meant to raise the notebook server's buffer limit - confirm when it takes effect.
# `cm` receives the return value of update(), not the ConfigManager instance.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Set the maximum number of rows pandas will display to 1163
pd.set_option('display.max_rows', 1163)

# Dropping the unneeded `Column1` column

In [8]:
# 'Column1' is not needed for the analysis; drop it from the frame in place.
lab_train_data.drop(columns=['Column1'], inplace=True)

In [9]:
# Summarise the frame: row count, column dtypes and memory usage.
lab_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68356050 entries, 0 to 68356049
Data columns (total 11 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Internalpatientid     int64         
 1   Age at lab test       float64       
 2   Lab test date         datetime64[ns]
 3   Lab test              object        
 4   Lab test description  object        
 5   Result numeric        float64       
 6   Result textual        object        
 7   Result range          object        
 8   Result units          object        
 9   Specimen source       object        
 10  State                 object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(7)
memory usage: 5.6+ GB


# Sorting by patient ID and age

In [10]:
# Order the rows by patient ID first, then by age at the lab test (both ascending), in place.
lab_train_data.sort_values(
    by=["Internalpatientid", "Age at lab test"],
    ascending=True,
    inplace=True,
)

In [11]:
# Inspect the frame after sorting.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State
6739711,6,74.84225,2001-07-10 20:13:05,baso #_k/cumm_no specimen,BASO #,0.097043,,0 - .5,k/cumm,,Texas
6739712,6,74.84225,2001-07-10 20:13:05,hct_%_blood,Hct,39.104958,,42 - 52,%,blood,Texas
6739713,6,74.84225,2001-07-10 20:13:05,rbc_m/cmm_blood,RBC,4.538197,,4.7 - 6.1,M/cmm,blood,Texas
6739714,6,74.84225,2001-07-10 20:13:05,eos #_k/cumm_no specimen,EOS #,0.196141,,0 - 7,K/cumm,,Texas
6739715,6,74.84225,2001-07-10 20:13:05,neut %_%_no specimen,Neut %,60.180934,,37 - 73,%,,Texas
...,...,...,...,...,...,...,...,...,...,...,...
51731557,169065,53.31840,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.638117,,70.0 - 90.0,mmHg,,Arizona
51731558,169065,53.31840,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.000000,,60 - 105,mg/dL,,Arizona
51731559,169065,53.31840,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.413492,,7.35 - 7.45,,,Arizona
51731560,169065,53.31840,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.000000,,,%,,Arizona


# Rounding the "Result numeric" column to 2 decimal places

In [12]:
# Round 'Result numeric' to 2 decimal places with the vectorised Series.round instead of
# calling Python's round() once per row (the frame has tens of millions of rows).
# NaN entries stay NaN. Exact halfway values may differ from Python's round() in the
# last digit because the vectorised rounding works by scaling.
lab_train_data['Result numeric'] = lab_train_data['Result numeric'].round(2)

In [13]:
# Inspect the frame after rounding 'Result numeric'.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State
6739711,6,74.84225,2001-07-10 20:13:05,baso #_k/cumm_no specimen,BASO #,0.10,,0 - .5,k/cumm,,Texas
6739712,6,74.84225,2001-07-10 20:13:05,hct_%_blood,Hct,39.10,,42 - 52,%,blood,Texas
6739713,6,74.84225,2001-07-10 20:13:05,rbc_m/cmm_blood,RBC,4.54,,4.7 - 6.1,M/cmm,blood,Texas
6739714,6,74.84225,2001-07-10 20:13:05,eos #_k/cumm_no specimen,EOS #,0.20,,0 - 7,K/cumm,,Texas
6739715,6,74.84225,2001-07-10 20:13:05,neut %_%_no specimen,Neut %,60.18,,37 - 73,%,,Texas
...,...,...,...,...,...,...,...,...,...,...,...
51731557,169065,53.31840,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona
51731558,169065,53.31840,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona
51731559,169065,53.31840,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona
51731560,169065,53.31840,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona


In [14]:
# Count the missing (NaN) entries in every column and print the per-column totals.
missing_per_column = lab_train_data.isna().sum()
print("Training set missing values:\n", missing_per_column)

Training set missing values:
 Internalpatientid              0
Age at lab test                0
Lab test date                  0
Lab test                       0
Lab test description           0
Result numeric           8529467
Result textual          59854531
Result range            12856109
Result units             8265710
Specimen source         29142239
State                          0
dtype: int64


# Build a "Result" column: "Result numeric", falling back to "Result textual" where it is NaN

In [15]:
# Create 'Result': take 'Result numeric' and, where it is NaN, use the value
# from 'Result textual' on the same row.
numeric_result = lab_train_data['Result numeric']
textual_result = lab_train_data['Result textual']
lab_train_data['Result'] = numeric_result.fillna(value=textual_result)

# Display the updated DataFrame
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
6739711,6,74.84225,2001-07-10 20:13:05,baso #_k/cumm_no specimen,BASO #,0.10,,0 - .5,k/cumm,,Texas,0.1
6739712,6,74.84225,2001-07-10 20:13:05,hct_%_blood,Hct,39.10,,42 - 52,%,blood,Texas,39.1
6739713,6,74.84225,2001-07-10 20:13:05,rbc_m/cmm_blood,RBC,4.54,,4.7 - 6.1,M/cmm,blood,Texas,4.54
6739714,6,74.84225,2001-07-10 20:13:05,eos #_k/cumm_no specimen,EOS #,0.20,,0 - 7,K/cumm,,Texas,0.2
6739715,6,74.84225,2001-07-10 20:13:05,neut %_%_no specimen,Neut %,60.18,,37 - 73,%,,Texas,60.18
...,...,...,...,...,...,...,...,...,...,...,...,...
51731557,169065,53.31840,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona,106.64
51731558,169065,53.31840,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona,109
51731559,169065,53.31840,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona,7.41
51731560,169065,53.31840,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona,50


In [16]:
# Count the rows whose 'Result' is still empty (before removing them)
missing_data = lab_train_data['Result'].isna().sum()


# The count is never negative, so "== 0" is the exact complement of "> 0".
if missing_data == 0:
    print("No missing or NaN values found in the 'Result' column.")
else:
    print(f"There are {missing_data} missing or NaN values in the 'Result' column.")

There are 27948 missing or NaN values in the 'Result' column.


* **Where both "Result numeric" and "Result textual" are missing, the "Result" value is genuinely empty** (the actual test result is not available).

In [17]:
# Discard, in place, every row that has no value in the "Result" column
lab_train_data.dropna(axis="index", how="any", subset=["Result"], inplace=True)

In [18]:
# NOTE(review): self-assignment is a no-op; this cell can be deleted.
lab_train_data = lab_train_data

In [19]:
# Inspect the frame after dropping the rows without a 'Result'.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
6739711,6,74.84225,2001-07-10 20:13:05,baso #_k/cumm_no specimen,BASO #,0.10,,0 - .5,k/cumm,,Texas,0.1
6739712,6,74.84225,2001-07-10 20:13:05,hct_%_blood,Hct,39.10,,42 - 52,%,blood,Texas,39.1
6739713,6,74.84225,2001-07-10 20:13:05,rbc_m/cmm_blood,RBC,4.54,,4.7 - 6.1,M/cmm,blood,Texas,4.54
6739714,6,74.84225,2001-07-10 20:13:05,eos #_k/cumm_no specimen,EOS #,0.20,,0 - 7,K/cumm,,Texas,0.2
6739715,6,74.84225,2001-07-10 20:13:05,neut %_%_no specimen,Neut %,60.18,,37 - 73,%,,Texas,60.18
...,...,...,...,...,...,...,...,...,...,...,...,...
51731557,169065,53.31840,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona,106.64
51731558,169065,53.31840,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona,109
51731559,169065,53.31840,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona,7.41
51731560,169065,53.31840,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona,50


In [20]:
# Re-count the missing 'Result' values (after removing the empty rows)
missing_data = lab_train_data['Result'].isnull().sum()


# Report either the remaining count or confirm that none are left.
print(
    f"There are {missing_data} missing or NaN values in the 'Result' column."
    if missing_data > 0
    else "No missing or NaN values found in the 'Result' column."
)

No missing or NaN values found in the 'Result' column.


* **Missing values have been removed from the "Result" column**

# Keeping three decimal places in the age column

In [21]:
# Round 'Age at lab test' to 3 decimal places while keeping it numeric.
# Formatting the ages as strings here would make the per-patient max() taken afterwards
# compare text lexicographically (e.g. "99.500" > "100.250"), selecting the wrong record
# for ages of 100 or more.
lab_train_data["Age at lab test"] = lab_train_data["Age at lab test"].round(3)

In [22]:
# Inspect the frame after reducing the age to three decimal places.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
6739711,6,74.842,2001-07-10 20:13:05,baso #_k/cumm_no specimen,BASO #,0.10,,0 - .5,k/cumm,,Texas,0.1
6739712,6,74.842,2001-07-10 20:13:05,hct_%_blood,Hct,39.10,,42 - 52,%,blood,Texas,39.1
6739713,6,74.842,2001-07-10 20:13:05,rbc_m/cmm_blood,RBC,4.54,,4.7 - 6.1,M/cmm,blood,Texas,4.54
6739714,6,74.842,2001-07-10 20:13:05,eos #_k/cumm_no specimen,EOS #,0.20,,0 - 7,K/cumm,,Texas,0.2
6739715,6,74.842,2001-07-10 20:13:05,neut %_%_no specimen,Neut %,60.18,,37 - 73,%,,Texas,60.18
...,...,...,...,...,...,...,...,...,...,...,...,...
51731557,169065,53.318,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona,106.64
51731558,169065,53.318,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona,109
51731559,169065,53.318,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona,7.41
51731560,169065,53.318,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona,50


# Keeping only each patient's records at their maximum age

In [23]:
# Keep, for each patient, only the rows recorded at that patient's highest age.
# The comparison is done on numeric values: pd.to_numeric makes this correct whether the
# column currently holds floats or formatted strings (a string max() is lexicographic,
# so "99.500" would beat "100.250").
age_numeric = pd.to_numeric(lab_train_data['Age at lab test'])
age_by_patient = age_numeric.groupby(lab_train_data['Internalpatientid'])

# One row per patient with the (numeric) maximum age.
max_ages = age_by_patient.max().reset_index()

# Select the rows at the per-patient maximum. Row order is preserved and the index is
# renumbered from 0, matching what an inner merge on (patient, max age) produces.
is_latest = age_numeric == age_by_patient.transform('max')
lab_train_data = lab_train_data[is_latest].reset_index(drop=True)

lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,6,87.815,2014-07-04 12:23:03,fingerstick inr (coaguchek)_inr_blood,FINGERSTICK INR (COAGUCHEK),2.57,,0.9 - 4.4,INR,blood,Texas,2.57
1,6,87.815,2014-07-04 12:23:03,fingerstick protime (coaguchek)_sec_blood,FINGERSTICK PROTIME (COAGUCHEK),29.00,,,SEC,blood,Texas,29
2,7,74.156,2020-09-04 21:15:42,rbc_m/ul_no specimen,RBC,4.79,,4.0 - 6.0,M/uL,,Nebraska,4.79
3,7,74.156,2020-09-04 21:15:42,abs monocyte_k/ul_no specimen,ABS MONOCYTE,0.71,,0.2 - 1.1,K/uL,,Nebraska,0.71
4,7,74.156,2020-09-04 21:15:42,basophil %_%_no specimen,BASOPHIL %,0.29,,0.0 - 1.0,%,,Nebraska,0.29
...,...,...,...,...,...,...,...,...,...,...,...,...
900143,169065,53.318,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona,106.64
900144,169065,53.318,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona,109
900145,169065,53.318,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona,7.41
900146,169065,53.318,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona,50


In [24]:
# Make sure 'Age at lab test' is stored as float64 before the numeric rounding that follows
age_column = lab_train_data["Age at lab test"]
lab_train_data["Age at lab test"] = age_column.astype("float64")

In [25]:
# Inspect the frame after the age dtype conversion.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,6,87.815,2014-07-04 12:23:03,fingerstick inr (coaguchek)_inr_blood,FINGERSTICK INR (COAGUCHEK),2.57,,0.9 - 4.4,INR,blood,Texas,2.57
1,6,87.815,2014-07-04 12:23:03,fingerstick protime (coaguchek)_sec_blood,FINGERSTICK PROTIME (COAGUCHEK),29.00,,,SEC,blood,Texas,29
2,7,74.156,2020-09-04 21:15:42,rbc_m/ul_no specimen,RBC,4.79,,4.0 - 6.0,M/uL,,Nebraska,4.79
3,7,74.156,2020-09-04 21:15:42,abs monocyte_k/ul_no specimen,ABS MONOCYTE,0.71,,0.2 - 1.1,K/uL,,Nebraska,0.71
4,7,74.156,2020-09-04 21:15:42,basophil %_%_no specimen,BASOPHIL %,0.29,,0.0 - 1.0,%,,Nebraska,0.29
...,...,...,...,...,...,...,...,...,...,...,...,...
900143,169065,53.318,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona,106.64
900144,169065,53.318,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona,109
900145,169065,53.318,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona,7.41
900146,169065,53.318,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona,50


# Rounding the age column to whole years

In [26]:
# Round each age to the nearest whole year (ties go to the even integer, as with
# Python's round()) and store the result as integers.
whole_years = lab_train_data['Age at lab test'].round()
lab_train_data['Age at lab test'] = whole_years.astype('int64')

In [27]:
# Inspect the frame after rounding the age to whole years.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,6,88,2014-07-04 12:23:03,fingerstick inr (coaguchek)_inr_blood,FINGERSTICK INR (COAGUCHEK),2.57,,0.9 - 4.4,INR,blood,Texas,2.57
1,6,88,2014-07-04 12:23:03,fingerstick protime (coaguchek)_sec_blood,FINGERSTICK PROTIME (COAGUCHEK),29.00,,,SEC,blood,Texas,29
2,7,74,2020-09-04 21:15:42,rbc_m/ul_no specimen,RBC,4.79,,4.0 - 6.0,M/uL,,Nebraska,4.79
3,7,74,2020-09-04 21:15:42,abs monocyte_k/ul_no specimen,ABS MONOCYTE,0.71,,0.2 - 1.1,K/uL,,Nebraska,0.71
4,7,74,2020-09-04 21:15:42,basophil %_%_no specimen,BASOPHIL %,0.29,,0.0 - 1.0,%,,Nebraska,0.29
...,...,...,...,...,...,...,...,...,...,...,...,...
900143,169065,53,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona,106.64
900144,169065,53,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona,109
900145,169065,53,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona,7.41
900146,169065,53,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona,50


In [28]:
# Number of distinct patients remaining in the frame.
lab_train_data["Internalpatientid"].nunique()

34055

In [29]:
# Patients that have more than 80 rows in the current frame.
# df_3 is indexed by patient ID and holds the row count per patient.
rows_per_patient = lab_train_data["Internalpatientid"].value_counts()
df_3 = rows_per_patient.loc[lambda x: x>80].to_frame()

# One-column frame ("Internalpatientid") holding just those patient IDs, in the same
# order as df_3. Built in a single step so the cell can be re-run safely, rather than
# resetting the index, renaming the columns and dropping "count" across separate cells.
df = df_3.index.to_frame(index=False, name="Internalpatientid")

In [35]:
# Number of patient IDs collected in `df`.
len(df)

421

In [36]:
# Exclude every row that belongs to a patient listed in `df`.
# No merge is needed first: `df` carries only the 'Internalpatientid' column, so merging
# it in adds nothing and merely copies the whole frame.
is_excluded = lab_train_data['Internalpatientid'].isin(df['Internalpatientid'])

# Take an explicit copy so that later cells can add and drop columns on the result
# without operating on a slice of another frame.
filtered_df = lab_train_data.loc[~is_excluded].copy()

# Display the remaining rows
filtered_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,6,88,2014-07-04 12:23:03,fingerstick inr (coaguchek)_inr_blood,FINGERSTICK INR (COAGUCHEK),2.57,,0.9 - 4.4,INR,blood,Texas,2.57
1,6,88,2014-07-04 12:23:03,fingerstick protime (coaguchek)_sec_blood,FINGERSTICK PROTIME (COAGUCHEK),29.00,,,SEC,blood,Texas,29
2,7,74,2020-09-04 21:15:42,rbc_m/ul_no specimen,RBC,4.79,,4.0 - 6.0,M/uL,,Nebraska,4.79
3,7,74,2020-09-04 21:15:42,abs monocyte_k/ul_no specimen,ABS MONOCYTE,0.71,,0.2 - 1.1,K/uL,,Nebraska,0.71
4,7,74,2020-09-04 21:15:42,basophil %_%_no specimen,BASOPHIL %,0.29,,0.0 - 1.0,%,,Nebraska,0.29
...,...,...,...,...,...,...,...,...,...,...,...,...
900143,169065,53,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona,106.64
900144,169065,53,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona,109
900145,169065,53,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona,7.41
900146,169065,53,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona,50


In [37]:
# Continue the pipeline with the filtered frame under the usual name.
lab_train_data = filtered_df 

In [38]:
# Inspect the frame after the patient filter.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,6,88,2014-07-04 12:23:03,fingerstick inr (coaguchek)_inr_blood,FINGERSTICK INR (COAGUCHEK),2.57,,0.9 - 4.4,INR,blood,Texas,2.57
1,6,88,2014-07-04 12:23:03,fingerstick protime (coaguchek)_sec_blood,FINGERSTICK PROTIME (COAGUCHEK),29.00,,,SEC,blood,Texas,29
2,7,74,2020-09-04 21:15:42,rbc_m/ul_no specimen,RBC,4.79,,4.0 - 6.0,M/uL,,Nebraska,4.79
3,7,74,2020-09-04 21:15:42,abs monocyte_k/ul_no specimen,ABS MONOCYTE,0.71,,0.2 - 1.1,K/uL,,Nebraska,0.71
4,7,74,2020-09-04 21:15:42,basophil %_%_no specimen,BASOPHIL %,0.29,,0.0 - 1.0,%,,Nebraska,0.29
...,...,...,...,...,...,...,...,...,...,...,...,...
900143,169065,53,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona,106.64
900144,169065,53,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona,109
900145,169065,53,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona,7.41
900146,169065,53,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona,50


# Combining the "Age at lab test", "Lab test description" and "Result" (numeric or textual) columns, joined with '_'

In [39]:
# Build one token per row of the form "<age>_<lab test description>_<result>" and store it
# in the new column 'Lab_test_description_Result'. Age and result are cast to str first so
# that they can be concatenated with the description text.
age_part = lab_train_data['Age at lab test'].astype(str)
description_part = lab_train_data['Lab test description']
result_part = lab_train_data['Result'].astype(str)
lab_train_data['Lab_test_description_Result'] = age_part + '_' + description_part + '_' + result_part

In [40]:
# Inspect the frame with the new 'Lab_test_description_Result' column.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result,Lab_test_description_Result
0,6,88,2014-07-04 12:23:03,fingerstick inr (coaguchek)_inr_blood,FINGERSTICK INR (COAGUCHEK),2.57,,0.9 - 4.4,INR,blood,Texas,2.57,88_FINGERSTICK INR (COAGUCHEK)_2.57
1,6,88,2014-07-04 12:23:03,fingerstick protime (coaguchek)_sec_blood,FINGERSTICK PROTIME (COAGUCHEK),29.00,,,SEC,blood,Texas,29,88_FINGERSTICK PROTIME (COAGUCHEK)_29.0
2,7,74,2020-09-04 21:15:42,rbc_m/ul_no specimen,RBC,4.79,,4.0 - 6.0,M/uL,,Nebraska,4.79,74_RBC_4.79
3,7,74,2020-09-04 21:15:42,abs monocyte_k/ul_no specimen,ABS MONOCYTE,0.71,,0.2 - 1.1,K/uL,,Nebraska,0.71,74_ABS MONOCYTE_0.71
4,7,74,2020-09-04 21:15:42,basophil %_%_no specimen,BASOPHIL %,0.29,,0.0 - 1.0,%,,Nebraska,0.29,74_BASOPHIL %_0.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...
900143,169065,53,2011-06-10 21:00:09,po2(t) (rapidlab)_mmhg_no specimen,pO2(T) (RAPIDLAB),106.64,,70.0 - 90.0,mmHg,,Arizona,106.64,53_pO2(T) (RAPIDLAB)_106.64
900144,169065,53,2011-06-10 21:00:09,glucose (rapidlab)_mg/dl_no specimen,GLUCOSE (RAPIDLAB),109.00,,60 - 105,mg/dL,,Arizona,109,53_GLUCOSE (RAPIDLAB)_109.0
900145,169065,53,2011-06-10 21:00:09,ph (rapidlab)_null_no specimen,pH (RAPIDLAB),7.41,,7.35 - 7.45,,,Arizona,7.41,53_pH (RAPIDLAB)_7.41
900146,169065,53,2011-06-10 21:00:09,fio2 (rapidlab)_%_no specimen,FiO2 (RAPIDLAB),50.00,,,%,,Arizona,50,53_FiO2 (RAPIDLAB)_50.0


# Dropping all unnecessary columns

In [42]:
# Every column except 'Internalpatientid' and 'Lab_test_description_Result' is dropped in place.
columns_to_drop = [
    "Age at lab test",
    "Lab test date",
    "Lab test",
    "Lab test description",
    "Result numeric",
    "Result textual",
    "Result range",
    "Result units",
    "Specimen source",
    "State",
    "Result",
]
lab_train_data.drop(columns=columns_to_drop, inplace=True)

In [43]:
# NOTE(review): self-assignment is a no-op; this cell can be deleted.
lab_train_data = lab_train_data

In [44]:
lab_train_data # After removing the unwanted columns

Unnamed: 0,Internalpatientid,Lab_test_description_Result
0,6,88_FINGERSTICK INR (COAGUCHEK)_2.57
1,6,88_FINGERSTICK PROTIME (COAGUCHEK)_29.0
2,7,74_RBC_4.79
3,7,74_ABS MONOCYTE_0.71
4,7,74_BASOPHIL %_0.29
...,...,...
900143,169065,53_pO2(T) (RAPIDLAB)_106.64
900144,169065,53_GLUCOSE (RAPIDLAB)_109.0
900145,169065,53_pH (RAPIDLAB)_7.41
900146,169065,53_FiO2 (RAPIDLAB)_50.0


In [45]:
def _join_non_null(values):
    """Join the non-null entries of a column into one comma-separated string."""
    # dropna() removes missing entries first so only real values end up in the string.
    return ','.join(values.dropna())


# Collapse the frame to one row per patient: all of a patient's
# 'Lab_test_description_Result' tokens are concatenated with ','.
# NOTE(review): some lab test descriptions contain commas themselves
# (e.g. "BILIRUBIN, TOTAL"), so the joined string cannot be split back on ','
# unambiguously - confirm that downstream consumers do not rely on that.
df_grouped = lab_train_data.groupby('Internalpatientid').agg(_join_non_null)

# Turn the patient ID from the group index back into a regular column
df_grouped_lab = df_grouped.reset_index()
df_grouped_lab

Unnamed: 0,Internalpatientid,Lab_test_description_Result
0,6,"88_FINGERSTICK INR (COAGUCHEK)_2.57,88_FINGERS..."
1,7,"74_RBC_4.79,74_ABS MONOCYTE_0.71,74_BASOPHIL %..."
2,9,"52_Bands_3.0,52_RDW_21.2*,52_ZZZAUTO ANISO_++,..."
3,12,"74_LDL-CHOL CALCULATION_69.74,74_SODIUM (blood..."
4,17,"82_HCO3-_12.0,82_ICTERUS INDEX_2.0,82_SODIUM_1..."
...,...,...
33629,169037,"88_Mch_30.38,88_RBC_4.2,88_WBC_6.91,88_RDW-CV_..."
33630,169045,"86_UREA NITROGEN_28.0,86_NON HDL CHOLESTEROL_1..."
33631,169058,"79_GLUCOSE,BLOOD-poct (STL)_151.0,79_GLUCOSE,..."
33632,169059,"91_SODIUM_133.0,91_LDL CALCULATION_106.0,91_AL..."


In [48]:
# Spot-check the full joined string of a single row (positional row 33632).
df_grouped_lab["Lab_test_description_Result"].values[33632] # checking the one row

'91_SODIUM_133.0,91_LDL CALCULATION_106.0,91_ALBUMIN_3.73,91_POTASSIUM_4.04,91_GLUCOSE_106.0,91_CREATININE_1.0,91_TRIGLYCERIDE_235.0,91_CHOLESTEROL_214.0,91_BILIRUBIN, TOTAL_0.68,91_ALANINE AMINOTRANSFERASE_12.0,91_CARBON DIOXIDE_20.0,91_Anion gap_11.0,91_TSH_1.94,91_CHLORIDE_106.0,91_DIRECT HDL_53.0,91_eGFR_52.0,91_CALCIUM_9.44,91_UREA NITROGEN_26.0,91_TOTAL PROTEIN_7.79,91_ALKALINE PHOSPHATASE_52.0,91_ASPARTATE AMINOTRANSFERASE_16.0,91_NEUT %_57.85,91_Lymph %_29.3,91_LYMPH #_1.5,91_RDW_15.98,91_MCHC_32.34,91_RBC_4.09,91_NEUT #_3.0,91_BASO #_0.0,91_MONO #_0.3,91_MPV_10.08,91_EOS #_0.29,91_WBC_5.0,91_Eos %_5.29,91_Mono %_5.27,91_Hct_41.75,91_MCH_32.18,91_Baso %_0.52,91_PLATELET_191.0,91_MCV_98.7,91_Hgb_13.9'

# Saving the CSV file

In [49]:
# Save the per-patient frame as a CSV file.
# General form: df.to_csv('<folder for the exported file>/<file name>.csv')
# NOTE(review): to_csv writes the row index as an extra unnamed first column by default;
# pass index=False if consumers of this file should not receive it - confirm before changing.
output_csv_path = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_test/df_lab_results_test_with_age_v1.csv'
df_grouped_lab.to_csv(output_csv_path)