# **Azure notebook Setup**

In [1]:
#A class attribute that provides access to the TabularDatasetFactory methods for creating new TabularDataset objects. 
#Usage: Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

# Identifiers of the Azure ML workspace to connect to.
# NOTE(review): these are hardcoded in the notebook; consider loading them from a
# workspace config file or environment variables instead.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'  # subscription id (can be looked up in the Azure ML studio)
resource_group = 'VChamp-Team3'   # resource group name
workspace_name = 'vchamp-team3'   # workspace name

# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
# Build the Workspace handle used by the following cells.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Look up the datastore registered in the workspace under the name
# 'data_team3_synthetic_train'.
datastore = workspace.datastores['data_team3_synthetic_train'] 

In [3]:
# Dataset.Tabular.from_delimited_files creates a TabularDataset that represents tabular
# data stored in delimited files (e.g. CSV and TSV).

dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'lab_results_train.csv')])

# To preview only the first 3 rows instead of loading the whole file:
# dataset.take(3).to_pandas_dataframe()

In [52]:
# Convert the Azure ML dataset into a pandas DataFrame, the format needed by the
# processing steps below.
lab_train_data = dataset.to_pandas_dataframe()

In [53]:
lab_train_data

Unnamed: 0,Column1,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State
0,5,5166,79.946327,2015-12-10 09:11:04,troponin i (d/c'd 02/02/2014)_ng/ml_plasma,TROPONIN I (d/c'd 02/02/2014),0.020761,,0.01 - 0.04,ng/ml,plasma,Texas
1,6,5166,81.160335,2017-02-26 02:27:18,calcium_mg/dl_plasma,calcium,9.759563,,8.7 - 10.5,mg/dl,plasma,Texas
2,7,5166,81.160335,2017-02-26 02:27:18,sodium_mmol/l_plasma,SODIUM,130.000000,,136 - 145,mmol/L,plasma,Texas
3,8,5166,81.160335,2017-02-26 02:27:18,creatinine_mg/dl_plasma,CREATININE,1.397398,,0.6 - 1.3,mg/dl,plasma,Texas
4,9,5166,81.160335,2017-02-26 02:27:18,"protein,total_g/dl_plasma","PROTEIN,TOTAL",7.232604,,6.2 - 8.0,g/dl,plasma,Texas
...,...,...,...,...,...,...,...,...,...,...,...,...
258527310,328853891,51662,53.534373,2012-05-07 19:31:20,mono #_k/cu mm_no specimen,MONO #,0.479895,,.1 - 1,K/cu mm,,Texas
258527311,328853892,51662,53.534373,2012-05-07 19:31:20,wbc_k/cu mm_blood,WBC,8.645974,,4 - 11,K/cu mm,blood,Texas
258527312,328853893,51662,53.534373,2012-05-07 19:31:20,mch_pg_blood,MCH,29.000000,,27 - 34,pg,blood,Texas
258527313,328853894,51662,53.534373,2012-05-07 19:31:20,rdw_%_blood,RDW,12.491520,,11 - 16,%,blood,Texas


In [54]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Silence all warnings raised during code execution.
# NOTE(review): this hides every warning category, including any pandas warnings —
# consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Write a 'max_buffer_size' entry (7294967296) into the 'notebook' config section.
# NOTE(review): presumably intended to raise a buffer/memory limit; confirm that this
# setting actually takes effect for the running notebook server / kernel.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Show up to 1163 rows when displaying a DataFrame
pd.set_option('display.max_rows', 1163)

# Elimination of the unnamed columns

In [55]:
# Discard the leftover 'Column1' column; it is not needed for the analysis
lab_train_data.drop(columns=['Column1'], inplace=True)

In [57]:
lab_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258527315 entries, 0 to 258527314
Data columns (total 11 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Internalpatientid     int64         
 1   Age at lab test       float64       
 2   Lab test date         datetime64[ns]
 3   Lab test              object        
 4   Lab test description  object        
 5   Result numeric        float64       
 6   Result textual        object        
 7   Result range          object        
 8   Result units          object        
 9   Specimen source       object        
 10  State                 object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(7)
memory usage: 21.2+ GB


# Sorting out patients id and age column

In [58]:
# Order the rows by patient id first and, within each patient, by age at the lab test
# (both ascending)
sort_keys = ["Internalpatientid", "Age at lab test"]
lab_train_data.sort_values(by=sort_keys, inplace=True)

In [59]:
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State
22564,1,57.839843,2002-02-05 11:06:21,zzhemocult series_null_no specimen,ZZHEMOCULT SERIES,,0/3,,,,Indiana
33601,1,57.912086,2002-03-03 20:49:01,ldl cholesterol - calc_mg/dl_no specimen,LDL CHOLESTEROL - CALC,130.000000,,,mg/dL,,Indiana
33602,1,57.912086,2002-03-03 20:49:01,sgpt_iu/l_serum,SGPT,46.000000,,0 - 35,Iu/L,serum,Indiana
33603,1,57.912086,2002-03-03 20:49:01,hdl cholesterol_mg/dl_no specimen,HDL Cholesterol,53.000000,,35 - 55,mg/dL,,Indiana
33604,1,57.912086,2002-03-03 20:49:01,"cholesterol,total_mg/dl_serum","CHOLESTEROL,TOTAL",213.000000,,,mg/dL,serum,Indiana
...,...,...,...,...,...,...,...,...,...,...,...
118272447,169064,87.867074,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.671123,,,%,,Wyoming
118272448,169064,87.867074,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.314524,,,%,,Wyoming
118272449,169064,87.867074,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming
118272450,169064,87.867074,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.269399,,,%,,Wyoming


# Round off the "Result numeric" columns with 2 digits

In [60]:
# Round "Result numeric" to 2 decimal places.
# Series.round is vectorized, which avoids calling a Python lambda once per row on a
# frame of this size; NaN entries stay NaN.
lab_train_data['Result numeric'] = lab_train_data['Result numeric'].round(2)

In [61]:
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State
22564,1,57.839843,2002-02-05 11:06:21,zzhemocult series_null_no specimen,ZZHEMOCULT SERIES,,0/3,,,,Indiana
33601,1,57.912086,2002-03-03 20:49:01,ldl cholesterol - calc_mg/dl_no specimen,LDL CHOLESTEROL - CALC,130.00,,,mg/dL,,Indiana
33602,1,57.912086,2002-03-03 20:49:01,sgpt_iu/l_serum,SGPT,46.00,,0 - 35,Iu/L,serum,Indiana
33603,1,57.912086,2002-03-03 20:49:01,hdl cholesterol_mg/dl_no specimen,HDL Cholesterol,53.00,,35 - 55,mg/dL,,Indiana
33604,1,57.912086,2002-03-03 20:49:01,"cholesterol,total_mg/dl_serum","CHOLESTEROL,TOTAL",213.00,,,mg/dL,serum,Indiana
...,...,...,...,...,...,...,...,...,...,...,...
118272447,169064,87.867074,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming
118272448,169064,87.867074,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming
118272449,169064,87.867074,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming
118272450,169064,87.867074,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming


In [62]:
# Count the missing values in every column and print the per-column totals
missing_per_column = lab_train_data.isna().sum()
print("Training set missing values:\n", missing_per_column)

Training set missing values:
 Internalpatientid               0
Age at lab test                 0
Lab test date                   0
Lab test                        0
Lab test description            0
Result numeric           32490288
Result textual          226143923
Result range             49021868
Result units             31460687
Specimen source         109735139
State                           0
dtype: int64


* **Missing values are present in the columns "Result numeric" (32490288), "Result textual" (226143923), "Result range" (49021868), "Result units" (31460687) and "Specimen source" (109735139).**

* **Note: I cross-checked the missing values of "Result numeric" and "Result textual". Some lab results are recorded in numeric form and others in textual form, so where the numeric result is missing the value may be present in the textual result. The two columns can therefore be combined into a single "Result" column.**

# Replace NaN values in "Result numeric" with corresponding values from "Result textual"

In [63]:
# Build a combined 'Result' column: take the numeric result where it exists and fall
# back to the textual result wherever the numeric one is NaN
numeric_result = lab_train_data['Result numeric']
textual_result = lab_train_data['Result textual']
lab_train_data['Result'] = numeric_result.fillna(textual_result)

# Show the updated DataFrame
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
22564,1,57.839843,2002-02-05 11:06:21,zzhemocult series_null_no specimen,ZZHEMOCULT SERIES,,0/3,,,,Indiana,0/3
33601,1,57.912086,2002-03-03 20:49:01,ldl cholesterol - calc_mg/dl_no specimen,LDL CHOLESTEROL - CALC,130.00,,,mg/dL,,Indiana,130
33602,1,57.912086,2002-03-03 20:49:01,sgpt_iu/l_serum,SGPT,46.00,,0 - 35,Iu/L,serum,Indiana,46
33603,1,57.912086,2002-03-03 20:49:01,hdl cholesterol_mg/dl_no specimen,HDL Cholesterol,53.00,,35 - 55,mg/dL,,Indiana,53
33604,1,57.912086,2002-03-03 20:49:01,"cholesterol,total_mg/dl_serum","CHOLESTEROL,TOTAL",213.00,,,mg/dL,serum,Indiana,213
...,...,...,...,...,...,...,...,...,...,...,...,...
118272447,169064,87.867074,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming,1.67
118272448,169064,87.867074,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming,34.31
118272449,169064,87.867074,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming,NO
118272450,169064,87.867074,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming,10.27


In [64]:
# Count the rows that still have no value in 'Result' (before dropping them)
missing_data = lab_train_data['Result'].isnull().sum()


if missing_data <= 0:
    print("No missing or NaN values found in the 'Result' column.")
else:
    print(f"There are {missing_data} missing or NaN values in the 'Result' column.")

There are 106896 missing or NaN values in the 'Result' column.


* **Where both "Result numeric" and "Result textual" are missing, the combined "Result" value is genuinely empty** (the actual test result is not available).

In [65]:
# Drop every row that has no value in the "Result" column
required_columns = ["Result"]
lab_train_data.dropna(axis=0, subset=required_columns, inplace=True)

In [66]:
# No-op: the name is rebound to the object it already refers to.
lab_train_data = lab_train_data

In [67]:
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
22564,1,57.839843,2002-02-05 11:06:21,zzhemocult series_null_no specimen,ZZHEMOCULT SERIES,,0/3,,,,Indiana,0/3
33601,1,57.912086,2002-03-03 20:49:01,ldl cholesterol - calc_mg/dl_no specimen,LDL CHOLESTEROL - CALC,130.00,,,mg/dL,,Indiana,130
33602,1,57.912086,2002-03-03 20:49:01,sgpt_iu/l_serum,SGPT,46.00,,0 - 35,Iu/L,serum,Indiana,46
33603,1,57.912086,2002-03-03 20:49:01,hdl cholesterol_mg/dl_no specimen,HDL Cholesterol,53.00,,35 - 55,mg/dL,,Indiana,53
33604,1,57.912086,2002-03-03 20:49:01,"cholesterol,total_mg/dl_serum","CHOLESTEROL,TOTAL",213.00,,,mg/dL,serum,Indiana,213
...,...,...,...,...,...,...,...,...,...,...,...,...
118272447,169064,87.867074,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming,1.67
118272448,169064,87.867074,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming,34.31
118272449,169064,87.867074,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming,NO
118272450,169064,87.867074,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming,10.27


In [68]:
# Re-count the missing values in 'Result' to confirm the rows were removed
missing_data = lab_train_data['Result'].isnull().sum()

message = (
    f"There are {missing_data} missing or NaN values in the 'Result' column."
    if missing_data > 0
    else "No missing or NaN values found in the 'Result' column."
)
print(message)

No missing or NaN values found in the 'Result' column.


* **Missing values have been removed from the "Result" column**

# Taking the digits in age 

In [69]:
# Keep three decimal places of 'Age at lab test' while leaving the column numeric.
# Formatting the ages as strings would make later comparisons (such as the per-patient
# max()) lexicographic, e.g. "9.500" > "80.208" and "100.000" < "99.000".
lab_train_data["Age at lab test"] = lab_train_data["Age at lab test"].round(3)

In [70]:
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
22564,1,57.840,2002-02-05 11:06:21,zzhemocult series_null_no specimen,ZZHEMOCULT SERIES,,0/3,,,,Indiana,0/3
33601,1,57.912,2002-03-03 20:49:01,ldl cholesterol - calc_mg/dl_no specimen,LDL CHOLESTEROL - CALC,130.00,,,mg/dL,,Indiana,130
33602,1,57.912,2002-03-03 20:49:01,sgpt_iu/l_serum,SGPT,46.00,,0 - 35,Iu/L,serum,Indiana,46
33603,1,57.912,2002-03-03 20:49:01,hdl cholesterol_mg/dl_no specimen,HDL Cholesterol,53.00,,35 - 55,mg/dL,,Indiana,53
33604,1,57.912,2002-03-03 20:49:01,"cholesterol,total_mg/dl_serum","CHOLESTEROL,TOTAL",213.00,,,mg/dL,serum,Indiana,213
...,...,...,...,...,...,...,...,...,...,...,...,...
118272447,169064,87.867,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming,1.67
118272448,169064,87.867,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming,34.31
118272449,169064,87.867,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming,NO
118272450,169064,87.867,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming,10.27


# max age

In [71]:
# Keep only each patient's most recent lab records, i.e. the rows whose
# 'Age at lab test' equals that patient's maximum age.
# The ages are compared as floats: if the column holds formatted strings, a plain
# max() would be lexicographic ("9.500" > "80.208") and select the wrong rows.
age_numeric = lab_train_data['Age at lab test'].astype(float)
max_age_per_row = age_numeric.groupby(lab_train_data['Internalpatientid']).transform('max')

# Selects the rows an inner merge against the per-patient maxima would keep, in their
# existing order; reset_index gives the fresh 0..n-1 index such a merge produces.
lab_train_data = lab_train_data[age_numeric == max_age_per_row].reset_index(drop=True)

lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,1,80.208,2024-06-25 01:47:33,alpha fetoprotein_ng/ml_no specimen,Alpha Fetoprotein,3.00,,,ng/mL,,Indiana,3
1,1,80.208,2024-06-25 01:47:34,inr -top_inr_plasma,INR -TOP,1.68,,,INR,plasma,Indiana,1.68
2,1,80.208,2024-06-25 01:47:34,prothrombin time -top_sec._no specimen,PROTHROMBIN TIME -TOP,21.83,,10.3 - 14.7,sec.,,Indiana,21.83
3,1,80.208,2024-06-25 01:47:35,hgb_g/dl_no specimen,HGB,13.77,,13.0 - 17.1,g/dL,,Indiana,13.77
4,1,80.208,2024-06-25 01:47:35,mpv_fl_no specimen,Mpv,10.68,,8.7 - 12.3,fL,,Indiana,10.68
...,...,...,...,...,...,...,...,...,...,...,...,...
3465932,169064,87.867,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming,1.67
3465933,169064,87.867,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming,34.31
3465934,169064,87.867,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming,NO
3465935,169064,87.867,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming,10.27


In [72]:
# Make sure the 'Age at lab test' column is stored as float
lab_train_data = lab_train_data.astype({"Age at lab test": float})

In [73]:
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,1,80.208,2024-06-25 01:47:33,alpha fetoprotein_ng/ml_no specimen,Alpha Fetoprotein,3.00,,,ng/mL,,Indiana,3
1,1,80.208,2024-06-25 01:47:34,inr -top_inr_plasma,INR -TOP,1.68,,,INR,plasma,Indiana,1.68
2,1,80.208,2024-06-25 01:47:34,prothrombin time -top_sec._no specimen,PROTHROMBIN TIME -TOP,21.83,,10.3 - 14.7,sec.,,Indiana,21.83
3,1,80.208,2024-06-25 01:47:35,hgb_g/dl_no specimen,HGB,13.77,,13.0 - 17.1,g/dL,,Indiana,13.77
4,1,80.208,2024-06-25 01:47:35,mpv_fl_no specimen,Mpv,10.68,,8.7 - 12.3,fL,,Indiana,10.68
...,...,...,...,...,...,...,...,...,...,...,...,...
3465932,169064,87.867,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming,1.67
3465933,169064,87.867,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming,34.31
3465934,169064,87.867,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming,NO
3465935,169064,87.867,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming,10.27


# Round off

In [74]:
# Round the age to the nearest whole year and store it as an integer.
# Vectorized replacement for applying Python's round() to each row; both round
# halves to the nearest even number.
lab_train_data['Age at lab test'] = lab_train_data['Age at lab test'].round().astype('int64')

In [75]:
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,1,80,2024-06-25 01:47:33,alpha fetoprotein_ng/ml_no specimen,Alpha Fetoprotein,3.00,,,ng/mL,,Indiana,3
1,1,80,2024-06-25 01:47:34,inr -top_inr_plasma,INR -TOP,1.68,,,INR,plasma,Indiana,1.68
2,1,80,2024-06-25 01:47:34,prothrombin time -top_sec._no specimen,PROTHROMBIN TIME -TOP,21.83,,10.3 - 14.7,sec.,,Indiana,21.83
3,1,80,2024-06-25 01:47:35,hgb_g/dl_no specimen,HGB,13.77,,13.0 - 17.1,g/dL,,Indiana,13.77
4,1,80,2024-06-25 01:47:35,mpv_fl_no specimen,Mpv,10.68,,8.7 - 12.3,fL,,Indiana,10.68
...,...,...,...,...,...,...,...,...,...,...,...,...
3465932,169064,88,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming,1.67
3465933,169064,88,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming,34.31
3465934,169064,88,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming,NO
3465935,169064,88,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming,10.27


In [76]:
lab_train_data["Internalpatientid"].nunique()

130445

In [88]:
# Number of rows per patient; keep only the patients that have more than 80 rows
rows_per_patient = lab_train_data["Internalpatientid"].value_counts()
df_3 = rows_per_patient[rows_per_patient > 80].to_frame()

In [89]:
# Move the patient ids out of the index into a regular column
df = df_3.reset_index()

In [90]:
# Name the two columns explicitly: the patient id and its row count
df.columns = ["Internalpatientid","count"]

In [91]:
# Only the patient ids are needed from here on, so drop the count column
df.drop(columns=["count"], inplace=True)

In [92]:
# No-op: the name is rebound to the object it already refers to.
df = df

In [93]:
df

Unnamed: 0,Internalpatientid
0,44379
1,81438
2,92012
3,52677
4,36510
...,...
1602,10022
1603,138147
1604,65172
1605,78684


In [94]:
# Exclude every patient listed in `df` (the patients with more than 80 rows).
# `df` holds only the 'Internalpatientid' column, so merging it into the lab data adds
# no information; a boolean mask on the id column is sufficient.
is_flagged_patient = lab_train_data['Internalpatientid'].isin(df['Internalpatientid'])

# Keep the rows of the remaining patients. The .copy() makes the result an independent
# frame, so columns can be added to it later without writing to a slice of another frame.
filtered_df = lab_train_data[~is_flagged_patient].copy()

# Show the filtered rows
filtered_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,1,80,2024-06-25 01:47:33,alpha fetoprotein_ng/ml_no specimen,Alpha Fetoprotein,3.00,,,ng/mL,,Indiana,3
1,1,80,2024-06-25 01:47:34,inr -top_inr_plasma,INR -TOP,1.68,,,INR,plasma,Indiana,1.68
2,1,80,2024-06-25 01:47:34,prothrombin time -top_sec._no specimen,PROTHROMBIN TIME -TOP,21.83,,10.3 - 14.7,sec.,,Indiana,21.83
3,1,80,2024-06-25 01:47:35,hgb_g/dl_no specimen,HGB,13.77,,13.0 - 17.1,g/dL,,Indiana,13.77
4,1,80,2024-06-25 01:47:35,mpv_fl_no specimen,Mpv,10.68,,8.7 - 12.3,fL,,Indiana,10.68
...,...,...,...,...,...,...,...,...,...,...,...,...
3465932,169064,88,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming,1.67
3465933,169064,88,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming,34.31
3465934,169064,88,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming,NO
3465935,169064,88,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming,10.27


In [95]:
# Continue with the filtered rows; both names now refer to the same DataFrame object.
lab_train_data = filtered_df 

In [96]:
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result
0,1,80,2024-06-25 01:47:33,alpha fetoprotein_ng/ml_no specimen,Alpha Fetoprotein,3.00,,,ng/mL,,Indiana,3
1,1,80,2024-06-25 01:47:34,inr -top_inr_plasma,INR -TOP,1.68,,,INR,plasma,Indiana,1.68
2,1,80,2024-06-25 01:47:34,prothrombin time -top_sec._no specimen,PROTHROMBIN TIME -TOP,21.83,,10.3 - 14.7,sec.,,Indiana,21.83
3,1,80,2024-06-25 01:47:35,hgb_g/dl_no specimen,HGB,13.77,,13.0 - 17.1,g/dL,,Indiana,13.77
4,1,80,2024-06-25 01:47:35,mpv_fl_no specimen,Mpv,10.68,,8.7 - 12.3,fL,,Indiana,10.68
...,...,...,...,...,...,...,...,...,...,...,...,...
3465932,169064,88,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming,1.67
3465933,169064,88,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming,34.31
3465934,169064,88,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming,NO
3465935,169064,88,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming,10.27


In [97]:
lab_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3314277 entries, 0 to 3465936
Data columns (total 12 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Internalpatientid     int64         
 1   Age at lab test       int64         
 2   Lab test date         datetime64[ns]
 3   Lab test              object        
 4   Lab test description  object        
 5   Result numeric        float64       
 6   Result textual        object        
 7   Result range          object        
 8   Result units          object        
 9   Specimen source       object        
 10  State                 object        
 11  Result                object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 328.7+ MB


In [41]:
# Attempted cast of 'Result' to float, left disabled: the column mixes numeric values
# with free-text results (e.g. 'DONE'), so the cast raises the ValueError shown below.
# lab_train_data['Result'] = lab_train_data['Result'].astype(float)

ValueError: could not convert string to float: 'DONE'

# Adding  "Age at lab test",'Lab test description' & 'Result (Result numeric & Result textual)' columns with '_'

In [98]:
# Create the 'Lab_test_description_Result' column in the form
# '<age>_<lab test description>_<result>'.
# The age and the result are converted with astype(str) first, because values of
# different dtypes cannot be concatenated with string values directly.
age_text = lab_train_data['Age at lab test'].astype(str)
result_text = lab_train_data['Result'].astype(str)
lab_train_data['Lab_test_description_Result'] = (
    age_text + '_' + lab_train_data['Lab test description'] + '_' + result_text
)

In [99]:
lab_train_data

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Lab test,Lab test description,Result numeric,Result textual,Result range,Result units,Specimen source,State,Result,Lab_test_description_Result
0,1,80,2024-06-25 01:47:33,alpha fetoprotein_ng/ml_no specimen,Alpha Fetoprotein,3.00,,,ng/mL,,Indiana,3,80_Alpha Fetoprotein_3.0
1,1,80,2024-06-25 01:47:34,inr -top_inr_plasma,INR -TOP,1.68,,,INR,plasma,Indiana,1.68,80_INR -TOP_1.68
2,1,80,2024-06-25 01:47:34,prothrombin time -top_sec._no specimen,PROTHROMBIN TIME -TOP,21.83,,10.3 - 14.7,sec.,,Indiana,21.83,80_PROTHROMBIN TIME -TOP_21.83
3,1,80,2024-06-25 01:47:35,hgb_g/dl_no specimen,HGB,13.77,,13.0 - 17.1,g/dL,,Indiana,13.77,80_HGB_13.77
4,1,80,2024-06-25 01:47:35,mpv_fl_no specimen,Mpv,10.68,,8.7 - 12.3,fL,,Indiana,10.68,80_Mpv_10.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3465932,169064,88,2014-10-31 09:58:19,zalpha-1 globulin_%_no specimen,zALPHA-1 GLOBULIN,1.67,,,%,,Wyoming,1.67,88_zALPHA-1 GLOBULIN_1.67
3465933,169064,88,2014-10-31 09:58:19,zalbumin (ep)_%_no specimen,zALBUMIN (EP),34.31,,,%,,Wyoming,34.31,88_zALBUMIN (EP)_34.31
3465934,169064,88,2014-10-31 09:58:19,zm-spike_null_no specimen,zM-SPIKE,,NO,,,,Wyoming,NO,88_zM-SPIKE_NO
3465935,169064,88,2014-10-31 09:58:19,zgamma globulin_%_no specimen,zGAMMA GLOBULIN,10.27,,,%,,Wyoming,10.27,88_zGAMMA GLOBULIN_10.27


In [100]:
lab_train_data["Lab_test_description_Result"].values[0] # checking the one row

'80_Alpha Fetoprotein_3.0'

# Dropping all unnecessary columns

In [101]:
# Drop the columns that are no longer needed now that their content has been folded
# into 'Lab_test_description_Result'
columns_to_drop = [
    "Age at lab test",
    "Lab test date",
    "Lab test",
    "Lab test description",
    "Result numeric",
    "Result textual",
    "Result range",
    "Result units",
    "Specimen source",
    "State",
    "Result",
]
lab_train_data.drop(columns=columns_to_drop, inplace=True)

In [102]:
# No-op: the name is rebound to the object it already refers to.
lab_train_data = lab_train_data

In [103]:
lab_train_data # After removing the unwanted column

Unnamed: 0,Internalpatientid,Lab_test_description_Result
0,1,80_Alpha Fetoprotein_3.0
1,1,80_INR -TOP_1.68
2,1,80_PROTHROMBIN TIME -TOP_21.83
3,1,80_HGB_13.77
4,1,80_Mpv_10.68
...,...,...
3465932,169064,88_zALPHA-1 GLOBULIN_1.67
3465933,169064,88_zALBUMIN (EP)_34.31
3465934,169064,88_zM-SPIKE_NO
3465935,169064,88_zGAMMA GLOBULIN_10.27


In [104]:
# Collapse the frame to one row per patient: the values of each remaining column are
# joined into a single comma-separated string per 'Internalpatientid'.
# Null entries are dropped first so that only real values take part in the join.
# NOTE(review): some lab descriptions contain commas themselves (e.g. 'PROTEIN,TOTAL'),
# so the joined string cannot be split back unambiguously on ','.
def _join_non_null(values):
    """Join the non-null entries of a Series into one comma-separated string."""
    return ','.join(values.dropna())

df_grouped = lab_train_data.groupby('Internalpatientid').agg(_join_non_null)

# Turn the 'Internalpatientid' group index back into a regular column
df_grouped_lab = df_grouped.reset_index()
df_grouped_lab

Unnamed: 0,Internalpatientid,Lab_test_description_Result
0,1,"80_Alpha Fetoprotein_3.0,80_INR -TOP_1.68,80_P..."
1,2,"70_GLOBULIN, CALC._3.0,70_OSMOLALITY, CALC._30..."
2,3,82_WHOLE BLOOD GLUCOSE_123.0
3,4,"84_MRSA SURVL NARES AGAR_NEGATIVE,84_ANCILLARY..."
4,5,"77_PROTEIN, TOTAL_5.13,77_eGFR_49.0,77_POTASSI..."
...,...,...
128833,169057,"86_POTASSIUM_3.77,86_UREA NITROGEN_32.0,86_SOD..."
128834,169060,"72_BILIRUBIN, DIRECT_0.7,72_SGPT_13.0,72_SODIU..."
128835,169062,"74_URINE WBC/HPF_36.0,74_URINE BLOOD_TR,74_URI..."
128836,169063,"76_PLT (V2)_140.0,76_HGB (V2)_10.31,76_MPV (V2..."


In [105]:
df_grouped_lab[df_grouped_lab["Internalpatientid"]==1]

Unnamed: 0,Internalpatientid,Lab_test_description_Result
0,1,"80_Alpha Fetoprotein_3.0,80_INR -TOP_1.68,80_P..."


In [106]:
# Inspect the full aggregated string of the last row; .iloc[-1] keeps working when the
# number of patients changes, unlike a hard-coded positional index.
df_grouped_lab["Lab_test_description_Result"].iloc[-1]

'88_zBETA GLOBULIN_39.0,88_PROTEIN,TOTAL(SENDOUT)_225.25,88_zALPHA-1 GLOBULIN_1.67,88_zALBUMIN (EP)_34.31,88_zM-SPIKE_NO,88_zGAMMA GLOBULIN_10.27,88_zALPHA-2 GLOBULIN_13.78'

# Saving the CSV file

In [107]:
# Save the per-patient aggregated lab results to a CSV file.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is written as
# an extra unnamed first column; pass index=False if downstream readers should not see it.
# NOTE(review): the destination is a hardcoded absolute path under Users/900379; confirm
# it exists and is writable on the compute target.
#df.to_csv('Path where you want to store the exported CSV file/File Name.csv')
df_grouped_lab.to_csv('/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_train/df_lab_results_train_with_age_v1.csv')

In [53]:
import os
cwd = os.getcwd()
cwd

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/2211575'

In [55]:
# df1 = pd.read_csv('/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_train/df_lab_results_train.csv')