# **Azure notebook Setup**

In [1]:
# `Dataset.Tabular` is the class attribute that exposes the
# TabularDatasetFactory methods for creating TabularDataset objects
# (used further down as Dataset.Tabular.from_delimited_files()).
from azureml.core import Workspace, Dataset

# Identifiers of the target workspace. The original author notes that the
# subscription id can be looked up from the studio launch page.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'
resource_group = 'VChamp-Team3'    # resource group name
workspace_name = 'vchamp-team3'    # workspace name

# Original note: storage account "Algorithmia", resource group VChamp-Team3,
# workspace vchamp-team3.
# Build the workspace handle, passing every identifier by keyword.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Look up the registered datastore used by this notebook by name.
# Alternative key left by the original author: ['data_team3_synthetic_train']
# (presumably the training-set datastore; confirm before switching).
datastore = workspace.datastores['data_team3_synthetic_quality_check'] 

In [3]:
# Dataset.Tabular.from_delimited_files creates a TabularDataset that represents
# tabular data held in delimited files (e.g. CSV and TSV). Here it points at a
# single CSV file inside the datastore.
lab_results_path = [(datastore, 'lab_results_qual.csv')]
dataset = Dataset.Tabular.from_delimited_files(path=lab_results_path)

# Optional: preview the first 3 rows of the dataset
# dataset.take(3).to_pandas_dataframe()

In [4]:
# Convert the Azure TabularDataset into a pandas DataFrame, which is the
# format every later cell in this notebook works with.
lab_train_data = dataset.to_pandas_dataframe()

In [5]:
# Show a truncated preview (first and last rows) of the freshly loaded frame.
lab_train_data

Unnamed: 0,Column1,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State
0,10813,100012,53.085902,2002-08-01 21:31:49,potassium_mmol/l_plasma,POTASSIUM,3.442590,,3.5 - 4.8,mmol/L,plasma,New Mexico
1,10814,100012,53.085902,2002-08-01 21:31:49,urea nitrogen_mg/dl_plasma,UREA NITROGEN,24.000000,,7 - 20,mg/dL,plasma,New Mexico
2,10815,100012,53.085902,2002-08-01 21:31:49,creatinine_mg/dl_plasma,CREATININE,1.141091,,.8 - 1.6,mg/dL,plasma,New Mexico
3,10816,100012,53.085902,2002-08-01 21:31:49,magnesium_mg/dl_plasma,MAGNESIUM,2.263068,,1.6 - 2.4,mg/dL,plasma,New Mexico
4,10817,100012,53.085902,2002-08-01 21:31:49,albumin_g/dl_plasma,ALBUMIN,3.896010,,3.5 - 4.8,g/dL,plasma,New Mexico
...,...,...,...,...,...,...,...,...,...,...,...,...
1970526,328841152,51551,60.910378,2015-07-30 17:50:06,absolute lymphocyte count_k/cmm_blood,ABSOLUTE LYMPHOCYTE COUNT,1.246384,,.8 - 5.6,K/cmm,blood,Ohio
1970527,328841153,51551,60.910378,2015-07-30 17:50:06,absolute basophil count_k/cmm_blood,ABSOLUTE BASOPHIL COUNT,0.090993,,0 - .3,K/cmm,blood,Ohio
1970528,328841154,51551,60.910378,2015-07-30 17:50:06,hgb_g/dl_blood,Hgb,13.851840,,13.6 - 17.4,g/dL,blood,Ohio
1970529,328841155,51551,60.914041,2015-08-01 01:58:13,ptt_sec_plasma,PTT,65.000000,,26 - 36,Sec,plasma,Ohio


In [6]:
import numpy as np                # Multi-dimensional array object
import pandas as pd               # Data manipulation
import matplotlib.pyplot as plt   # Data visualization
import seaborn as sns             # Data visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Silence ALL warnings for the rest of the session. Note that this also hides
# pandas warnings such as SettingWithCopyWarning.
import warnings
warnings.filterwarnings('ignore')

# Update the notebook server's 'max_buffer_size' setting to 7294967296
# (presumably bytes, i.e. roughly 6.8 GiB - confirm). update() returns the
# resulting config, which is kept in `cm`.
# NOTE(review): presumably only takes effect once the server re-reads its
# config - confirm.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Set the maximum number of rows pandas displays to 1163
pd.set_option('display.max_rows', 1163)

# Elimination of the unnamed columns

In [7]:
# Remove the 'Column1' column, which is not used anywhere else in the notebook.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: executing it a second time no longer raises
# KeyError because the column is already gone.
lab_train_data = lab_train_data.drop(columns='Column1', errors='ignore')

In [8]:
# Summarise the index range, column dtypes and memory usage of the frame.
lab_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1970531 entries, 0 to 1970530
Data columns (total 11 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Internalpatientid     int64         
 1   Age at lab test       float64       
 2   Lab test date         datetime64[ns]
 3   Lab test              object        
 4   Lab test description  object        
 5   Result numeric        float64       
 6   Result textual        object        
 7   Result range          object        
 8   Result units          object        
 9   Specimen source       object        
 10  State                 object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(7)
memory usage: 165.4+ MB


# Sorting by patient id and age

In [9]:
# Order the rows by patient id first and by age at the lab test second,
# both ascending; the frame is sorted in place.
sort_keys = ["Internalpatientid", "Age at lab test"]
lab_train_data.sort_values(by=sort_keys, ascending=True, inplace=True)

In [10]:
# Preview the frame after sorting by patient id and age.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State
428845,67,48.278794,2010-08-07 21:34:40,creatinine idms_mg/dl_serum,CREATININE idms,1.942674,,.66 - 1.28,mg/dL,serum,California
428846,67,48.278794,2010-08-07 21:34:40,triglyceride_mg/dl_serum,TRIGLYCERIDE,64.000000,,40 - 160,mg/dL,serum,California
428847,67,48.278794,2010-08-07 21:34:40,carbon dioxide_mmol/l_serum,CARBON DIOXIDE,33.000000,,24.0 - 31.0,mmol/L,serum,California
428848,67,48.278794,2010-08-07 21:34:40,alanine aminotransferase_u/l_serum,ALANINE AMINOTRANSFERASE,27.000000,,7.0 - 45.0,U/L,serum,California
428849,67,48.278794,2010-08-07 21:34:40,hdl_mg/dl_serum,HDL,41.000000,,,mg/dL,serum,California
...,...,...,...,...,...,...,...,...,...,...,...
1480528,168899,93.715799,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.000000,,10 - 42,IU/L,plasma,Pennsylvania
1480529,168899,93.715799,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.000000,,,,plasma,Pennsylvania
1480530,168899,93.715799,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.292628,,0.0 - 0.2,mg/dL,plasma,Pennsylvania
1480531,168899,93.715799,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.795907,,0.5 - 1.2,mg/dL,plasma,Pennsylvania


# Round the "Result numeric" column to 2 decimal places

In [11]:
# Round the numeric results to two decimal places with the vectorised
# Series.round instead of calling Python's round() once per row; on a frame
# of ~2 million rows this avoids a slow element-wise loop. Missing values
# stay NaN.
lab_train_data['Result numeric'] = lab_train_data['Result numeric'].round(2)

In [12]:
# Preview the frame to confirm 'Result numeric' is now rounded to 2 decimals.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State
428845,67,48.278794,2010-08-07 21:34:40,creatinine idms_mg/dl_serum,CREATININE idms,1.94,,.66 - 1.28,mg/dL,serum,California
428846,67,48.278794,2010-08-07 21:34:40,triglyceride_mg/dl_serum,TRIGLYCERIDE,64.00,,40 - 160,mg/dL,serum,California
428847,67,48.278794,2010-08-07 21:34:40,carbon dioxide_mmol/l_serum,CARBON DIOXIDE,33.00,,24.0 - 31.0,mmol/L,serum,California
428848,67,48.278794,2010-08-07 21:34:40,alanine aminotransferase_u/l_serum,ALANINE AMINOTRANSFERASE,27.00,,7.0 - 45.0,U/L,serum,California
428849,67,48.278794,2010-08-07 21:34:40,hdl_mg/dl_serum,HDL,41.00,,,mg/dL,serum,California
...,...,...,...,...,...,...,...,...,...,...,...
1480528,168899,93.715799,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania
1480529,168899,93.715799,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania
1480530,168899,93.715799,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania
1480531,168899,93.715799,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania


In [13]:
# Count the missing (NaN) entries in every column and print the summary.
missing_per_column = lab_train_data.isna().sum()
print("Training set missing values:\n", missing_per_column)

Training set missing values:
 Internalpatientid             0
Age at lab test               0
Lab test date                 0
Lab test                      0
Lab test description          0
Result numeric           247101
Result textual          1724172
Result range             373539
Result units             243879
Specimen source          861662
State                         0
dtype: int64


# Replace NaN values in "Result numeric" with corresponding values from "Result textual"

In [14]:
# Build a single 'Result' column: take the numeric result where it exists and
# fall back to the textual result on rows where the numeric one is NaN.
numeric_result = lab_train_data['Result numeric']
textual_result = lab_train_data['Result textual']
lab_train_data['Result'] = numeric_result.fillna(textual_result)

# Show the updated DataFrame
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
428845,67,48.278794,2010-08-07 21:34:40,creatinine idms_mg/dl_serum,CREATININE idms,1.94,,.66 - 1.28,mg/dL,serum,California,1.94
428846,67,48.278794,2010-08-07 21:34:40,triglyceride_mg/dl_serum,TRIGLYCERIDE,64.00,,40 - 160,mg/dL,serum,California,64
428847,67,48.278794,2010-08-07 21:34:40,carbon dioxide_mmol/l_serum,CARBON DIOXIDE,33.00,,24.0 - 31.0,mmol/L,serum,California,33
428848,67,48.278794,2010-08-07 21:34:40,alanine aminotransferase_u/l_serum,ALANINE AMINOTRANSFERASE,27.00,,7.0 - 45.0,U/L,serum,California,27
428849,67,48.278794,2010-08-07 21:34:40,hdl_mg/dl_serum,HDL,41.00,,,mg/dL,serum,California,41
...,...,...,...,...,...,...,...,...,...,...,...,...
1480528,168899,93.715799,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania,26
1480529,168899,93.715799,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania,93
1480530,168899,93.715799,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania,0.29
1480531,168899,93.715799,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania,0.8


In [15]:
# Count the rows whose combined 'Result' is still missing (before removal)
# and report the outcome.
missing_data = lab_train_data['Result'].isna().sum()

print(
    f"There are {missing_data} missing or NaN values in the 'Result' column."
    if missing_data > 0
    else "No missing or NaN values found in the 'Result' column."
)

There are 742 missing or NaN values in the 'Result' column.


* **For these rows both "Result numeric" and "Result textual" are missing, so "Result" is genuinely empty** (no actual test result is available).

In [16]:
# Discard, in place, every row whose combined 'Result' value is missing.
lab_train_data.dropna(axis='index', how='any', subset=['Result'], inplace=True)

In [17]:
# The former self-assignment (lab_train_data = lab_train_data) did nothing.
# Check the invariant the following cells rely on instead: after the dropna
# above, no row may have a missing 'Result'.
assert lab_train_data['Result'].notna().all(), "'Result' still contains missing values"

In [18]:
# Preview the frame after removing rows without any result.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
428845,67,48.278794,2010-08-07 21:34:40,creatinine idms_mg/dl_serum,CREATININE idms,1.94,,.66 - 1.28,mg/dL,serum,California,1.94
428846,67,48.278794,2010-08-07 21:34:40,triglyceride_mg/dl_serum,TRIGLYCERIDE,64.00,,40 - 160,mg/dL,serum,California,64
428847,67,48.278794,2010-08-07 21:34:40,carbon dioxide_mmol/l_serum,CARBON DIOXIDE,33.00,,24.0 - 31.0,mmol/L,serum,California,33
428848,67,48.278794,2010-08-07 21:34:40,alanine aminotransferase_u/l_serum,ALANINE AMINOTRANSFERASE,27.00,,7.0 - 45.0,U/L,serum,California,27
428849,67,48.278794,2010-08-07 21:34:40,hdl_mg/dl_serum,HDL,41.00,,,mg/dL,serum,California,41
...,...,...,...,...,...,...,...,...,...,...,...,...
1480528,168899,93.715799,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania,26
1480529,168899,93.715799,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania,93
1480530,168899,93.715799,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania,0.29
1480531,168899,93.715799,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania,0.8


In [19]:
# Re-count the missing 'Result' values (after removing the empty rows) and
# report the outcome.
missing_data = lab_train_data['Result'].isna().sum()

if missing_data <= 0:
    print("No missing or NaN values found in the 'Result' column.")
else:
    print(f"There are {missing_data} missing or NaN values in the 'Result' column.")

No missing or NaN values found in the 'Result' column.


* **Missing values have been removed from the "Result" column**

# Rounding age to three decimal places

In [20]:
# Keep 'Age at lab test' at three decimal places, but leave it NUMERIC.
# Formatting it with "{:.3f}".format turns every age into a string, and the
# per-patient max() taken afterwards then compares text lexicographically
# (e.g. "99.500" > "100.250"), which can select the wrong "latest" rows.
lab_train_data["Age at lab test"] = lab_train_data["Age at lab test"].round(3)

In [21]:
# Preview the frame to confirm ages are shown with three decimals.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
428845,67,48.279,2010-08-07 21:34:40,creatinine idms_mg/dl_serum,CREATININE idms,1.94,,.66 - 1.28,mg/dL,serum,California,1.94
428846,67,48.279,2010-08-07 21:34:40,triglyceride_mg/dl_serum,TRIGLYCERIDE,64.00,,40 - 160,mg/dL,serum,California,64
428847,67,48.279,2010-08-07 21:34:40,carbon dioxide_mmol/l_serum,CARBON DIOXIDE,33.00,,24.0 - 31.0,mmol/L,serum,California,33
428848,67,48.279,2010-08-07 21:34:40,alanine aminotransferase_u/l_serum,ALANINE AMINOTRANSFERASE,27.00,,7.0 - 45.0,U/L,serum,California,27
428849,67,48.279,2010-08-07 21:34:40,hdl_mg/dl_serum,HDL,41.00,,,mg/dL,serum,California,41
...,...,...,...,...,...,...,...,...,...,...,...,...
1480528,168899,93.716,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania,26
1480529,168899,93.716,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania,93
1480530,168899,93.716,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania,0.29
1480531,168899,93.716,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania,0.8


# Keeping only the records at each patient's maximum age

In [22]:
# Keep only each patient's most recent lab records, i.e. the rows whose age
# equals that patient's maximum age.
# The comparison is made on numeric values: when the column holds formatted
# strings, a plain max() is lexicographic ("99.500" > "100.250") and would
# pick the wrong rows for a patient whose ages cross a digit-count boundary.
age_numeric = lab_train_data['Age at lab test'].astype(float)
latest_age = age_numeric.groupby(lab_train_data['Internalpatientid']).transform('max')
is_latest = age_numeric == latest_age

# One row per patient with that patient's maximum age (same shape as before).
max_ages = (
    lab_train_data.loc[is_latest, ['Internalpatientid', 'Age at lab test']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Boolean filtering keeps the original row order and columns; the index is
# renumbered 0..n-1, as the former inner merge did.
lab_train_data = lab_train_data[is_latest].reset_index(drop=True)

lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,67,58.499,2020-10-29 08:48:01,fit_null_no specimen,FIT,,Negative,,,,California,Negative
1,200,87.520,2022-11-02 14:39:56,mchc_g/dl_no specimen,Mchc,33.94,,32.9 - 37,g/dL,,Utah,33.94
2,200,87.520,2022-11-02 14:39:56,mch_pg_no specimen,Mch,31.07,,27 - 36,pg,,Utah,31.07
3,200,87.520,2022-11-02 14:39:56,rdw-cv_%_no specimen,RDW-CV,13.70,,11.2 - 14.5,%,,Utah,13.7
4,200,87.520,2022-11-02 14:39:56,hct_%_no specimen,Hct,47.00,,37.1 - 52.5,%,,Utah,47
...,...,...,...,...,...,...,...,...,...,...,...,...
26717,168899,93.716,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania,26
26718,168899,93.716,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania,93
26719,168899,93.716,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania,0.29
26720,168899,93.716,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania,0.8


In [23]:
# Make sure 'Age at lab test' is stored as float (it may hold formatted
# strings at this point); all other columns keep their dtypes.
lab_train_data = lab_train_data.astype({"Age at lab test": float})

In [24]:
# Preview the frame after converting the age column to float.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,67,58.499,2020-10-29 08:48:01,fit_null_no specimen,FIT,,Negative,,,,California,Negative
1,200,87.520,2022-11-02 14:39:56,mchc_g/dl_no specimen,Mchc,33.94,,32.9 - 37,g/dL,,Utah,33.94
2,200,87.520,2022-11-02 14:39:56,mch_pg_no specimen,Mch,31.07,,27 - 36,pg,,Utah,31.07
3,200,87.520,2022-11-02 14:39:56,rdw-cv_%_no specimen,RDW-CV,13.70,,11.2 - 14.5,%,,Utah,13.7
4,200,87.520,2022-11-02 14:39:56,hct_%_no specimen,Hct,47.00,,37.1 - 52.5,%,,Utah,47
...,...,...,...,...,...,...,...,...,...,...,...,...
26717,168899,93.716,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania,26
26718,168899,93.716,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania,93
26719,168899,93.716,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania,0.29
26720,168899,93.716,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania,0.8


# Round age to the nearest whole year

In [25]:
# Round each age to the nearest whole number (ties go to the even integer,
# exactly like Python's round()) and store the result as integers.
rounded_age = lab_train_data['Age at lab test'].round()
lab_train_data['Age at lab test'] = rounded_age.astype('int64')

In [26]:
# Preview the frame to confirm ages are now whole numbers.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,67,58,2020-10-29 08:48:01,fit_null_no specimen,FIT,,Negative,,,,California,Negative
1,200,88,2022-11-02 14:39:56,mchc_g/dl_no specimen,Mchc,33.94,,32.9 - 37,g/dL,,Utah,33.94
2,200,88,2022-11-02 14:39:56,mch_pg_no specimen,Mch,31.07,,27 - 36,pg,,Utah,31.07
3,200,88,2022-11-02 14:39:56,rdw-cv_%_no specimen,RDW-CV,13.70,,11.2 - 14.5,%,,Utah,13.7
4,200,88,2022-11-02 14:39:56,hct_%_no specimen,Hct,47.00,,37.1 - 52.5,%,,Utah,47
...,...,...,...,...,...,...,...,...,...,...,...,...
26717,168899,94,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania,26
26718,168899,94,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania,93
26719,168899,94,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania,0.29
26720,168899,94,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania,0.8


In [27]:
# Number of distinct patients remaining after keeping only the latest records.
lab_train_data["Internalpatientid"].nunique()

983

In [28]:
# Count the rows per patient and keep, as a one-column frame, only the
# patients that have more than 80 rows.
rows_per_patient = lab_train_data["Internalpatientid"].value_counts()
df_3 = rows_per_patient[rows_per_patient > 80].to_frame()

In [29]:
# Move the patient ids out of the index into a regular column.
df = df_3.reset_index()

In [30]:
# Give the two columns explicit names: the patient id and its row count.
df.columns = ["Internalpatientid","count"]

In [31]:
# Only the patient ids are needed from here on; drop the count in place.
df.drop(columns=["count"], inplace=True)

In [32]:
# The former self-assignment (df = df) did nothing. Check the property the
# exclusion step relies on instead: each patient id is listed exactly once.
assert df["Internalpatientid"].is_unique, "duplicate patient ids in exclusion list"

In [33]:
# Number of patients that have more than 80 rows.
len(df)

13

In [34]:
# EXCLUDE every patient listed in `df` (the ids with more than 80 rows) and
# keep all remaining rows.
# `df` carries only the 'Internalpatientid' column, so the former left merge
# added nothing; filtering `lab_train_data` directly yields the same rows.
excluded_ids = df['Internalpatientid']
keep_row = ~lab_train_data['Internalpatientid'].isin(excluded_ids)

# .copy() makes the result an independent frame, so the column assignment and
# in-place drop performed later act on real data rather than on a slice.
filtered_df = lab_train_data[keep_row].copy()

# Show the remaining rows
filtered_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,67,58,2020-10-29 08:48:01,fit_null_no specimen,FIT,,Negative,,,,California,Negative
1,200,88,2022-11-02 14:39:56,mchc_g/dl_no specimen,Mchc,33.94,,32.9 - 37,g/dL,,Utah,33.94
2,200,88,2022-11-02 14:39:56,mch_pg_no specimen,Mch,31.07,,27 - 36,pg,,Utah,31.07
3,200,88,2022-11-02 14:39:56,rdw-cv_%_no specimen,RDW-CV,13.70,,11.2 - 14.5,%,,Utah,13.7
4,200,88,2022-11-02 14:39:56,hct_%_no specimen,Hct,47.00,,37.1 - 52.5,%,,Utah,47
...,...,...,...,...,...,...,...,...,...,...,...,...
26717,168899,94,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania,26
26718,168899,94,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania,93
26719,168899,94,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania,0.29
26720,168899,94,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania,0.8


In [35]:
# Continue with the filtered rows under the usual working name.
lab_train_data = filtered_df 

In [36]:
# Preview the frame after excluding the patients with more than 80 rows.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,67,58,2020-10-29 08:48:01,fit_null_no specimen,FIT,,Negative,,,,California,Negative
1,200,88,2022-11-02 14:39:56,mchc_g/dl_no specimen,Mchc,33.94,,32.9 - 37,g/dL,,Utah,33.94
2,200,88,2022-11-02 14:39:56,mch_pg_no specimen,Mch,31.07,,27 - 36,pg,,Utah,31.07
3,200,88,2022-11-02 14:39:56,rdw-cv_%_no specimen,RDW-CV,13.70,,11.2 - 14.5,%,,Utah,13.7
4,200,88,2022-11-02 14:39:56,hct_%_no specimen,Hct,47.00,,37.1 - 52.5,%,,Utah,47
...,...,...,...,...,...,...,...,...,...,...,...,...
26717,168899,94,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania,26
26718,168899,94,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania,93
26719,168899,94,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania,0.29
26720,168899,94,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania,0.8


In [37]:
# Summarise row count, non-null counts and dtypes of the filtered frame.
lab_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25447 entries, 0 to 26721
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Internalpatientid     25447 non-null  int64         
 1   Age at lab test       25447 non-null  int64         
 2   Lab test date         25447 non-null  datetime64[ns]
 3   Lab test              25447 non-null  object        
 4   Lab test description  25447 non-null  object        
 5   Result numeric        22164 non-null  float64       
 6   Result textual        3283 non-null   object        
 7   Result range          20245 non-null  object        
 8   Result units          22462 non-null  object        
 9   Specimen source       15022 non-null  object        
 10  State                 25447 non-null  object        
 11  Result                25447 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 2.5+ M

# Combining "Age at lab test", "Lab test description" and "Result" (from "Result numeric" / "Result textual") into one column, joined with '_'

In [38]:
# Create the 'Lab_test_description_Result' column by joining, with '_', the
# age, the lab test description and the result. Age and result are converted
# to text with astype(str) first, because values of different dtypes cannot be
# concatenated directly.
age_text = lab_train_data['Age at lab test'].astype(str)
description_text = lab_train_data['Lab test description']
result_text = lab_train_data['Result'].astype(str)
lab_train_data['Lab_test_description_Result'] = age_text + '_' + description_text + '_' + result_text

In [39]:
# Preview the frame with the new combined column.
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result,Lab_test_description_Result
0,67,58,2020-10-29 08:48:01,fit_null_no specimen,FIT,,Negative,,,,California,Negative,58_FIT_Negative
1,200,88,2022-11-02 14:39:56,mchc_g/dl_no specimen,Mchc,33.94,,32.9 - 37,g/dL,,Utah,33.94,88_Mchc_33.94
2,200,88,2022-11-02 14:39:56,mch_pg_no specimen,Mch,31.07,,27 - 36,pg,,Utah,31.07,88_Mch_31.07
3,200,88,2022-11-02 14:39:56,rdw-cv_%_no specimen,RDW-CV,13.70,,11.2 - 14.5,%,,Utah,13.7,88_RDW-CV_13.7
4,200,88,2022-11-02 14:39:56,hct_%_no specimen,Hct,47.00,,37.1 - 52.5,%,,Utah,47,88_Hct_47.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26717,168899,94,2016-07-16 02:59:33,sgot (cx)_iu/l_plasma,SGOT (CX),26.00,,10 - 42,IU/L,plasma,Pennsylvania,26,94_SGOT (CX)_26.0
26718,168899,94,2016-07-16 02:59:33,egfr_null_plasma,eGFR,93.00,,,,plasma,Pennsylvania,93,94_eGFR_93.0
26719,168899,94,2016-07-16 02:59:33,direct bilirubin (cx)_mg/dl_plasma,DIRECT BILIRUBIN (CX),0.29,,0.0 - 0.2,mg/dL,plasma,Pennsylvania,0.29,94_DIRECT BILIRUBIN (CX)_0.29
26720,168899,94,2016-07-16 02:59:33,creatinine (cx)_mg/dl_plasma,CREATININE (CX),0.80,,0.5 - 1.2,mg/dL,plasma,Pennsylvania,0.8,94_CREATININE (CX)_0.8


In [40]:
# Spot-check the combined text of the first row.
lab_train_data["Lab_test_description_Result"].iloc[0]

'58_FIT_Negative'

# Dropping all unnecessary columns

In [41]:
# Only the patient id and the combined text column are needed from here on;
# remove every other column in place.
unused_columns = [
    "Age at lab test", "Lab test date", "Lab test", "Lab test description",
    "Result numeric", "Result textual", "Result range", "Result units",
    "Specimen source", "State", "Result",
]
lab_train_data.drop(columns=unused_columns, inplace=True)

In [42]:
# The former self-assignment (lab_train_data = lab_train_data) did nothing.
# Check instead that exactly the two columns used by the grouping step remain.
assert list(lab_train_data.columns) == ['Internalpatientid', 'Lab_test_description_Result']

In [43]:
# Preview the frame after removing the unwanted columns: only the patient id
# and the combined text column remain.
lab_train_data # After removing the unwanted column

Unnamed: 0,Internalpatientid,Lab_test_description_Result
0,67,58_FIT_Negative
1,200,88_Mchc_33.94
2,200,88_Mch_31.07
3,200,88_RDW-CV_13.7
4,200,88_Hct_47.0
...,...,...
26717,168899,94_SGOT (CX)_26.0
26718,168899,94_eGFR_93.0
26719,168899,94_DIRECT BILIRUBIN (CX)_0.29
26720,168899,94_CREATININE (CX)_0.8


In [45]:
def _join_non_null(values):
    """Join the non-missing entries of one column into a comma-separated string."""
    # dropna() removes missing values first, so only non-null entries end up
    # in the concatenated string.
    return ','.join(values.dropna())


# Collapse the frame to one row per patient, concatenating that patient's
# combined lab-test strings.
# NOTE(review): some descriptions contain commas themselves (e.g.
# "BILIRUBIN, TOTAL"), so the joined string cannot be split back
# unambiguously on ','.
df_grouped = lab_train_data.groupby('Internalpatientid').agg(_join_non_null)

# Turn the 'Internalpatientid' group index back into a regular column.
df_grouped_lab = df_grouped.reset_index()
df_grouped_lab.head()

Unnamed: 0,Internalpatientid,Lab_test_description_Result
0,67,58_FIT_Negative
1,200,"88_Mchc_33.94,88_Mch_31.07,88_RDW-CV_13.7,88_H..."
2,291,83_FINGERSTICK GLUCOSE_75.0
3,330,"75_SODIUM_140.0,75_CHLORIDE_104.0,75_POTASSIUM..."
4,351,"86_MCHC_33.93,86_Mpv_7.11,86_RDW_18.65,86_Hgb_..."


In [48]:
# Spot-check the grouped row of a single patient (id 67).
is_patient_67 = df_grouped_lab["Internalpatientid"].eq(67)
df_grouped_lab.loc[is_patient_67]

Unnamed: 0,Internalpatientid,Lab_test_description_Result
0,67,58_FIT_Negative


In [48]:
# Disabled spot check of a single grouped row.
# NOTE(review): position 33632 is out of range for this data set (983 patients
# before the exclusion step), so the output shown beneath this cell is
# presumably left over from a run on a larger data set - confirm or clear it.
# df_grouped_lab["Lab_test_description_Result"].values[33632] # checking the one row

'91_SODIUM_133.0,91_LDL CALCULATION_106.0,91_ALBUMIN_3.73,91_POTASSIUM_4.04,91_GLUCOSE_106.0,91_CREATININE_1.0,91_TRIGLYCERIDE_235.0,91_CHOLESTEROL_214.0,91_BILIRUBIN, TOTAL_0.68,91_ALANINE AMINOTRANSFERASE_12.0,91_CARBON DIOXIDE_20.0,91_Anion gap_11.0,91_TSH_1.94,91_CHLORIDE_106.0,91_DIRECT HDL_53.0,91_eGFR_52.0,91_CALCIUM_9.44,91_UREA NITROGEN_26.0,91_TOTAL PROTEIN_7.79,91_ALKALINE PHOSPHATASE_52.0,91_ASPARTATE AMINOTRANSFERASE_16.0,91_NEUT %_57.85,91_Lymph %_29.3,91_LYMPH #_1.5,91_RDW_15.98,91_MCHC_32.34,91_RBC_4.09,91_NEUT #_3.0,91_BASO #_0.0,91_MONO #_0.3,91_MPV_10.08,91_EOS #_0.29,91_WBC_5.0,91_Eos %_5.29,91_Mono %_5.27,91_Hct_41.75,91_MCH_32.18,91_Baso %_0.52,91_PLATELET_191.0,91_MCV_98.7,91_Hgb_13.9'

# Saving the CSV file

In [49]:
# Save the per-patient table as a CSV file.
# Template: df.to_csv('<folder for the exported CSV>/<file name>.csv')
# NOTE: to_csv is called with its defaults, so the row index is written as an
# unnamed first column.
output_csv_path = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_quality/df_lab_results_qual_with_age_v1.csv'
df_grouped_lab.to_csv(output_csv_path)

In [53]:
# Show the kernel's current working directory. Note that it differs from the
# user folder in the absolute path used when saving the CSV.
import os
cwd = os.getcwd()
cwd

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/2211575'

In [55]:
# Disabled read-back of an exported CSV. NOTE(review): the path points at
# Output_files_train/df_lab_results_train.csv, not at the file this notebook
# writes - confirm which file was intended.
# df1 = pd.read_csv('/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_train/df_lab_results_train.csv')