In [1]:
# Connect to the Azure ML workspace.
# `Dataset` is imported alongside `Workspace`; its `Dataset.Tabular` attribute exposes the
# TabularDatasetFactory methods (usage: Dataset.Tabular.from_delimited_files()).
from azureml.core import Workspace, Dataset

# Workspace coordinates. The subscription id can be looked up in Azure ML studio.
# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'
resource_group = 'VChamp-Team3'
workspace_name = 'vchamp-team3'

# Build the workspace handle from the three identifiers above.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Look up the datastore by name in the workspace.
# Alternative datastore name noted by the author: 'data_team3_synthetic_train'.
datastore = workspace.datastores['data_team3_synthetic_quality_check'] 

In [3]:
# Create a TabularDataset that represents tabular data in a delimited file (e.g. CSV or TSV)
# located in the datastore.

dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'conditions_qual.csv')])

# Optional check (disabled): dataset.to_pandas_dataframe()
# Note that this call loads the whole dataset, not only the first 3 rows.

In [4]:
# Convert the Azure TabularDataset into a pandas DataFrame so it can be processed with pandas.
conditions_train_data= dataset.to_pandas_dataframe()

In [5]:
# Display the loaded frame (pandas renders a truncated head/tail view for large frames).
conditions_train_data

Unnamed: 0,Column1,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,587,100012,52.022196,2001-07-09 02:42:14,Diagnosis,Encounter for attention to tracheostomy,P,Texas
1,588,100012,52.100619,2001-08-06 18:37:42,Diagnosis,Essential (primary) hypertension,P,Texas
2,589,100012,52.100619,2001-08-06 18:37:42,Diagnosis,Benign prostatic hyperplasia,S,Texas
3,590,100012,52.337453,2001-11-01 08:08:08,Diagnosis,Encounter for examination and observation for ...,P,Texas
4,591,100012,52.594739,2002-02-03 09:02:55,Diagnosis,"Encounter for administrative examinations, uns...",P,New Mexico
...,...,...,...,...,...,...,...,...
655303,112203098,99944,72.099955,2004-08-12 12:48:10,Diagnosis,Essential (primary) hypertension,P,Massachusetts
655304,112203099,99944,77.935663,2010-06-15 11:38:02,Problem,Calculus of kidney,,Massachusetts
655305,112203100,99944,78.210942,2010-09-24 02:23:05,Diagnosis,Not specified,P,Massachusetts
655306,112203101,99944,80.438034,2012-12-16 02:26:00,Diagnosis,"Heart failure, unspecified",P,Massachusetts


In [6]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) - consider a narrower filter.
import warnings
warnings.filterwarnings('ignore')

# Update the 'notebook' config section with a larger 'max_buffer_size' (7294967296, presumably bytes).
# NOTE(review): `cm` holds the return value of update(), not the ConfigManager instance - confirm it is unused.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Set the maximum number of rows to be displayed to 1163
pd.set_option('display.max_rows', 1163)

# **Elimination of the redundant `Column1` column**

In [7]:
# Discard the 'Column1' column; the frame is modified in place
conditions_train_data.drop(columns='Column1', inplace=True)

In [8]:
# NOTE(review): self-assignment - this statement has no effect and the cell can be removed.
conditions_train_data =  conditions_train_data

# Sorting by patient ID and age

In [9]:
# Order the rows by patient id first and by age second (both ascending), in place
conditions_train_data.sort_values(
    by=["Internalpatientid", "Age at condition documentation"],
    inplace=True,
)

# Checking for missing values

In [10]:
# Print the number of missing (NaN) values in each column
print("Training set missing values:\n", conditions_train_data.isna().sum())

Training set missing values:
 Internalpatientid                        0
Age at condition documentation           0
Condition documented date                0
Condition type                           0
Condition code icd10 subcategory         0
Diagnosis sequence number or rank    27666
State                                    0
dtype: int64


# Formatting the age to two decimal places

In [11]:
# Render every 'Age at condition documentation' value as text with exactly two decimals
# (format 00.00). The column holds strings after this step.
conditions_train_data["Age at condition documentation"] = (
    conditions_train_data["Age at condition documentation"]
    .map(lambda age: f"{age:.2f}")
)

In [12]:
# Inspect the frame after sorting and after formatting the ages to two decimals
conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
459824,67,48.28,2010-08-09 14:59:14,Problem,Essential (primary) hypertension,,California
459825,67,48.28,2010-08-09 14:59:14,Problem,"Counseling, unspecified",,California
459826,67,48.28,2010-08-09 14:59:14,Problem,Personal history of diseases of the circulator...,,California
459827,67,48.28,2010-08-09 14:59:14,Problem,Essential (primary) hypertension,,California
459828,67,48.28,2010-08-09 14:59:14,Problem,"Chronic kidney disease, stage 2 (mild)",,California
...,...,...,...,...,...,...,...
525272,168899,96.87,2019-09-13 03:48:33,Diagnosis,Long term (current) use of anticoagulants and ...,S,Pennsylvania
525273,168899,96.87,2019-09-13 03:48:33,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Pennsylvania
255148,168899,96.91,2019-09-24 22:46:28,Diagnosis,Long term (current) use of anticoagulants and ...,S,Pennsylvania
255149,168899,96.91,2019-09-24 22:46:28,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Pennsylvania


In [13]:
# Convert the 'Age at condition documentation' column from object (formatted strings) back to float
conditions_train_data["Age at condition documentation"] = conditions_train_data["Age at condition documentation"].astype(float)

In [14]:
# Inspect the frame after converting the age column back to float
conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
459824,67,48.28,2010-08-09 14:59:14,Problem,Essential (primary) hypertension,,California
459825,67,48.28,2010-08-09 14:59:14,Problem,"Counseling, unspecified",,California
459826,67,48.28,2010-08-09 14:59:14,Problem,Personal history of diseases of the circulator...,,California
459827,67,48.28,2010-08-09 14:59:14,Problem,Essential (primary) hypertension,,California
459828,67,48.28,2010-08-09 14:59:14,Problem,"Chronic kidney disease, stage 2 (mild)",,California
...,...,...,...,...,...,...,...
525272,168899,96.87,2019-09-13 03:48:33,Diagnosis,Long term (current) use of anticoagulants and ...,S,Pennsylvania
525273,168899,96.87,2019-09-13 03:48:33,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Pennsylvania
255148,168899,96.91,2019-09-24 22:46:28,Diagnosis,Long term (current) use of anticoagulants and ...,S,Pennsylvania
255149,168899,96.91,2019-09-24 22:46:28,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Pennsylvania


# Max age

In [15]:
# Highest documented age per patient, as a two-column frame (patient id, maximum age)
max_ages = conditions_train_data.groupby('Internalpatientid', as_index=False)['Age at condition documentation'].max()

# Inner-join on (patient id, age) so that only each patient's rows at that maximum age remain
conditions_train_data = conditions_train_data.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at condition documentation'],
)

conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,67,58.48,2020-10-22 11:49:50,Diagnosis,Encounter for immunization,P,California
1,200,87.80,2023-02-13 10:31:19,Diagnosis,Vascular dementia,P,Utah
2,291,83.18,2006-07-08 12:40:03,Diagnosis,Secondary and unspecified malignant neoplasm o...,4,Delaware
3,291,83.18,2006-07-08 12:40:03,Diagnosis,"Pneumonia, unspecified organism",6,Delaware
4,291,83.18,2006-07-08 12:40:03,Diagnosis,Encounter for antineoplastic radiation therapy,1,Delaware
...,...,...,...,...,...,...,...
6507,168496,98.97,2023-02-03 01:11:44,Diagnosis,Not specified,P,California
6508,168496,98.97,2023-02-03 21:37:36,Diagnosis,Not specified,P,California
6509,168899,96.91,2019-09-24 22:46:28,Diagnosis,Long term (current) use of anticoagulants and ...,S,Pennsylvania
6510,168899,96.91,2019-09-24 22:46:28,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Pennsylvania


# Rounding off the Age 

In [16]:
# Round each age to the nearest whole year and store it as an integer.
# Series.round() is vectorised (no Python-level call per row) and, like the built-in round(),
# rounds exact halves to the nearest even number; the cast makes the integer dtype explicit.
conditions_train_data['Age at condition documentation'] = (
    conditions_train_data['Age at condition documentation'].round().astype('int64')
)

In [17]:
# Inspect the frame after rounding the ages to whole years
conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,67,58,2020-10-22 11:49:50,Diagnosis,Encounter for immunization,P,California
1,200,88,2023-02-13 10:31:19,Diagnosis,Vascular dementia,P,Utah
2,291,83,2006-07-08 12:40:03,Diagnosis,Secondary and unspecified malignant neoplasm o...,4,Delaware
3,291,83,2006-07-08 12:40:03,Diagnosis,"Pneumonia, unspecified organism",6,Delaware
4,291,83,2006-07-08 12:40:03,Diagnosis,Encounter for antineoplastic radiation therapy,1,Delaware
...,...,...,...,...,...,...,...
6507,168496,99,2023-02-03 01:11:44,Diagnosis,Not specified,P,California
6508,168496,99,2023-02-03 21:37:36,Diagnosis,Not specified,P,California
6509,168899,97,2019-09-24 22:46:28,Diagnosis,Long term (current) use of anticoagulants and ...,S,Pennsylvania
6510,168899,97,2019-09-24 22:46:28,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Pennsylvania


In [18]:
# Count the rows per patient id and keep only the patients with more than 60 rows
df = (
    conditions_train_data["Internalpatientid"]
    .value_counts()
    .loc[lambda rows_per_patient: rows_per_patient.gt(60)]
)

In [19]:
# Turn the filtered counts into a DataFrame: the patient ids move from the index into a column
df = df.reset_index() 

In [20]:
# Give the two columns explicit names: the patient id and the number of rows for that patient
df.columns = ["Internalpatientid","count_conditions"]

In [21]:
# Only the patient ids are needed from here on, so discard the counts (in place)
df.drop(columns='count_conditions', inplace=True)

In [22]:
# Show the ids of the patients that exceed the row-count threshold
df

Unnamed: 0,Internalpatientid
0,149757
1,88071
2,34287
3,128935
4,100314
5,100229
6,163951
7,26517
8,16325
9,132010


In [23]:
# Exclude every patient listed in `df` (the patients with more than 60 rows) and keep the rest.
# No merge is required for this: `df` only carries the 'Internalpatientid' column, so a
# membership test against it is all that is needed.
# `~...isin(...)` keeps the rows whose patient id is NOT in `df`.
# .copy() makes the result an independent frame, so columns can be added to or dropped from
# it later without operating on a slice of `conditions_train_data`.
filtered_df = conditions_train_data[
    ~conditions_train_data['Internalpatientid'].isin(df['Internalpatientid'])
].copy()

# Show the remaining rows
filtered_df

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,67,58,2020-10-22 11:49:50,Diagnosis,Encounter for immunization,P,California
1,200,88,2023-02-13 10:31:19,Diagnosis,Vascular dementia,P,Utah
2,291,83,2006-07-08 12:40:03,Diagnosis,Secondary and unspecified malignant neoplasm o...,4,Delaware
3,291,83,2006-07-08 12:40:03,Diagnosis,"Pneumonia, unspecified organism",6,Delaware
4,291,83,2006-07-08 12:40:03,Diagnosis,Encounter for antineoplastic radiation therapy,1,Delaware
...,...,...,...,...,...,...,...
6507,168496,99,2023-02-03 01:11:44,Diagnosis,Not specified,P,California
6508,168496,99,2023-02-03 21:37:36,Diagnosis,Not specified,P,California
6509,168899,97,2019-09-24 22:46:28,Diagnosis,Long term (current) use of anticoagulants and ...,S,Pennsylvania
6510,168899,97,2019-09-24 22:46:28,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Pennsylvania


In [24]:
# Bind the filtered frame to a second name (same object, no copy is made)
df_new = filtered_df 

# Concatenating the 'Age at condition documentation' and 'Condition code icd10 subcategory' columns with '_'

In [25]:
# Create a new column 'Condition_code_icd10_subcategory' by concatenating the value of
# 'Age at condition documentation', an underscore ('_') and the value of
# 'Condition code icd10 subcategory', e.g. '58_Encounter for immunization'.
# The age is converted with astype(str) first because a number cannot be concatenated to a string.
df_new['Condition_code_icd10_subcategory'] =  df_new['Age at condition documentation'].astype(str) + '_' +  df_new['Condition code icd10 subcategory']

In [26]:
# Inspect the frame with the new combined age/condition column
df_new

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State,Condition_code_icd10_subcategory
0,67,58,2020-10-22 11:49:50,Diagnosis,Encounter for immunization,P,California,58_Encounter for immunization
1,200,88,2023-02-13 10:31:19,Diagnosis,Vascular dementia,P,Utah,88_Vascular dementia
2,291,83,2006-07-08 12:40:03,Diagnosis,Secondary and unspecified malignant neoplasm o...,4,Delaware,83_Secondary and unspecified malignant neoplas...
3,291,83,2006-07-08 12:40:03,Diagnosis,"Pneumonia, unspecified organism",6,Delaware,"83_Pneumonia, unspecified organism"
4,291,83,2006-07-08 12:40:03,Diagnosis,Encounter for antineoplastic radiation therapy,1,Delaware,83_Encounter for antineoplastic radiation therapy
...,...,...,...,...,...,...,...,...
6507,168496,99,2023-02-03 01:11:44,Diagnosis,Not specified,P,California,99_Not specified
6508,168496,99,2023-02-03 21:37:36,Diagnosis,Not specified,P,California,99_Not specified
6509,168899,97,2019-09-24 22:46:28,Diagnosis,Long term (current) use of anticoagulants and ...,S,Pennsylvania,97_Long term (current) use of anticoagulants a...
6510,168899,97,2019-09-24 22:46:28,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Pennsylvania,97_Unspecified atrial fibrillation and atrial ...


# Dropping all unnecessary columns

In [27]:
# Drop the source columns in place; the patient id and the combined
# 'Condition_code_icd10_subcategory' column are not in this list and therefore stay.
df_new.drop(
    columns=[
        "Age at condition documentation",
        "Condition documented date",
        "Condition type",
        "Condition code icd10 subcategory",
        "Diagnosis sequence number or rank",
        "State",
    ],
    inplace=True,
)

In [28]:
# NOTE(review): self-assignment - this statement has no effect and the cell can be removed.
df_new = df_new

In [29]:
# Inspect the frame after dropping the source columns
df_new

Unnamed: 0,Internalpatientid,Condition_code_icd10_subcategory
0,67,58_Encounter for immunization
1,200,88_Vascular dementia
2,291,83_Secondary and unspecified malignant neoplas...
3,291,"83_Pneumonia, unspecified organism"
4,291,83_Encounter for antineoplastic radiation therapy
...,...,...
6507,168496,99_Not specified
6508,168496,99_Not specified
6509,168899,97_Long term (current) use of anticoagulants a...
6510,168899,97_Unspecified atrial fibrillation and atrial ...


In [31]:
def _join_non_null(values):
    """Return the non-null entries of one group's column joined with ','."""
    # dropna() removes missing values first so that only real strings reach str.join
    return ','.join(values.dropna())


# Collapse the frame to one row per 'Internalpatientid'; every remaining column is
# reduced to a single comma-separated string of that patient's values.
# NOTE(review): some condition descriptions contain commas themselves
# (e.g. 'Pneumonia, unspecified organism'), so the joined string cannot be split
# back unambiguously on ',' - confirm this is acceptable downstream.
df_grouped = df_new.groupby('Internalpatientid').agg(_join_non_null)

# Move the patient id from the index back into a regular column
df_grouped_condition = df_grouped.reset_index()

# Number of rows (one per patient) in the result
len(df_grouped_condition)

989

In [32]:
# Write the per-patient frame to a CSV file at a fixed absolute path.
# NOTE(review): to_csv is called with default options, so the DataFrame index is presumably
# written as an extra unnamed first column - confirm downstream readers expect it.
df_grouped_condition.to_csv('/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_quality/df_conditions_quality.csv')