In [1]:
# `Workspace` is the handle to the Azure ML workspace. `Dataset` exposes `Dataset.Tabular`,
# the factory whose methods (e.g. Dataset.Tabular.from_delimited_files()) create TabularDataset objects.
from azureml.core import Workspace, Dataset

# Identifiers of the target Azure ML workspace.
# NOTE(review): these values are hard-coded; consider loading them from configuration instead.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'  # subscription ID (can be looked up in the studio launch page)
resource_group = 'VChamp-Team3'  # resource group name
workspace_name = 'vchamp-team3'  # workspace name

# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
# Build the workspace handle from the three identifiers above.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Look up the datastore registered under the name 'data_team3_synthetic_train' in the workspace.
datastore = workspace.datastores['data_team3_synthetic_train'] 

In [3]:
# from_delimited_files creates a TabularDataset that represents tabular data in delimited files (e.g. CSV and TSV).
# Here it points at 'conditions_train.csv' inside the datastore selected above.

dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'conditions_train.csv')])

# Optional check (disabled): converts the whole dataset to a pandas DataFrame, not just a preview.
# dataset.to_pandas_dataframe()

In [95]:
# Convert the Azure ML dataset into a pandas DataFrame, the format needed by the steps below.
conditions_train_data= dataset.to_pandas_dataframe()

In [96]:
conditions_train_data

Unnamed: 0,Column1,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,0,1,57.912177,2002-03-03 21:37:12,Diagnosis,"Polyosteoarthritis, unspecified ; Osteoarthrit...",S,Indiana
1,1,1,57.912177,2002-03-03 21:37:12,Diagnosis,Essential (primary) hypertension,P,Indiana
2,2,1,57.912177,2002-03-03 21:37:12,Diagnosis,Mixed hyperlipidemia,S,Indiana
3,3,1,58.636284,2002-11-23 13:29:02,Diagnosis,Mixed hyperlipidemia,S,Indiana
4,4,1,58.636284,2002-11-23 13:29:02,Diagnosis,Essential (primary) hypertension,P,Indiana
...,...,...,...,...,...,...,...,...
88266466,112205241,99999,96.300251,2013-03-19 17:47:55,Diagnosis,Other specified counseling,P,Arizona
88266467,112205242,99999,96.306231,2013-03-21 22:15:17,Diagnosis,End stage renal disease,P,Arizona
88266468,112205243,99999,96.313747,2013-03-24 16:11:04,Diagnosis,End stage renal disease,P,Arizona
88266469,112205244,99999,96.363722,2013-04-11 22:34:16,Diagnosis,End stage renal disease,P,Arizona


In [97]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Handle warning messages that may occur during code execution
import warnings
warnings.filterwarnings('ignore')

# Raise the notebook server's 'max_buffer_size' setting to 7294967296 (presumably bytes — TODO confirm).
# Note: `cm` is bound to the return value of update(), not to the ConfigManager instance itself.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Show up to 1163 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 1163)

# **Elimination of the unnamed columns**

In [98]:
# Remove the unwanted 'Column1' column.
# Re-binding the name (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column is already gone.
conditions_train_data = conditions_train_data.drop(columns='Column1', errors='ignore')

In [99]:
# No-op: the name is re-bound to the same DataFrame object, so nothing changes.
conditions_train_data =  conditions_train_data

# Sorting by patient ID and age

In [100]:
# Order the rows by patient ID first and, within each patient, by age at documentation (both ascending).
conditions_train_data = conditions_train_data.sort_values(
    by=["Internalpatientid", "Age at condition documentation"]
)

# Checking for missing values

In [101]:
# Count the missing (NaN) entries of every column and print the per-column totals.
missing_per_column = conditions_train_data.isna().sum()
print("Training set missing values:\n", missing_per_column)

Training set missing values:
 Internalpatientid                          0
Age at condition documentation             0
Condition documented date                  0
Condition type                             0
Condition code icd10 subcategory           0
Diagnosis sequence number or rank    3648777
State                                      0
dtype: int64


# Keeping two decimal places of the age

In [102]:
# Render every 'Age at condition documentation' value as text with exactly two decimals (e.g. "57.57").
# The column holds strings after this step; a later cell casts it back to float.
conditions_train_data["Age at condition documentation"] = (
    conditions_train_data["Age at condition documentation"].map(lambda age: f"{age:.2f}")
)

In [103]:
conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
36509,1,57.57,2001-10-30 23:22:22,Diagnosis,Encounter for immunization,S,Indiana
36510,1,57.57,2001-10-30 23:22:22,Diagnosis,Dietary counseling and surveillance,S,Indiana
36511,1,57.57,2001-10-30 23:22:22,Diagnosis,Essential (primary) hypertension,P,Indiana
36512,1,57.57,2001-10-30 23:22:22,Diagnosis,"Polyosteoarthritis, unspecified ; Osteoarthrit...",S,Indiana
0,1,57.91,2002-03-03 21:37:12,Diagnosis,"Polyosteoarthritis, unspecified ; Osteoarthrit...",S,Indiana
...,...,...,...,...,...,...,...
72580936,169064,87.87,2014-10-31 11:04:34,Diagnosis,"Osteoarthritis, unspecified site",S,Wyoming
72580937,169064,87.87,2014-10-31 11:04:34,Diagnosis,"Anxiety disorder, unspecified",S,Wyoming
36195444,169064,87.88,2014-11-04 08:16:12,Diagnosis,"Anxiety disorder, unspecified",S,Wyoming
36195445,169064,87.88,2014-11-04 08:16:12,Diagnosis,Disorders of calcium metabolism,P,Wyoming


In [104]:
# Cast the 'Age at condition documentation' column to a floating-point dtype.
conditions_train_data["Age at condition documentation"] = (
    conditions_train_data["Age at condition documentation"].astype("float64")
)

In [105]:
conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
36509,1,57.57,2001-10-30 23:22:22,Diagnosis,Encounter for immunization,S,Indiana
36510,1,57.57,2001-10-30 23:22:22,Diagnosis,Dietary counseling and surveillance,S,Indiana
36511,1,57.57,2001-10-30 23:22:22,Diagnosis,Essential (primary) hypertension,P,Indiana
36512,1,57.57,2001-10-30 23:22:22,Diagnosis,"Polyosteoarthritis, unspecified ; Osteoarthrit...",S,Indiana
0,1,57.91,2002-03-03 21:37:12,Diagnosis,"Polyosteoarthritis, unspecified ; Osteoarthrit...",S,Indiana
...,...,...,...,...,...,...,...
72580936,169064,87.87,2014-10-31 11:04:34,Diagnosis,"Osteoarthritis, unspecified site",S,Wyoming
72580937,169064,87.87,2014-10-31 11:04:34,Diagnosis,"Anxiety disorder, unspecified",S,Wyoming
36195444,169064,87.88,2014-11-04 08:16:12,Diagnosis,"Anxiety disorder, unspecified",S,Wyoming
36195445,169064,87.88,2014-11-04 08:16:12,Diagnosis,Disorders of calcium metabolism,P,Wyoming


# Max age

In [106]:
# Highest documented age per patient, as a two-column frame (patient ID, max age).
max_ages = (
    conditions_train_data
    .groupby('Internalpatientid')['Age at condition documentation']
    .max()
    .reset_index()
)

# Inner-join back on (patient ID, age): only the rows recorded at each patient's
# maximum age survive; all rows sharing that age are kept.
conditions_train_data = conditions_train_data.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at condition documentation'],
)

conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,1,80.21,2024-06-25 02:44:48,Diagnosis,"Hyperlipidemia, unspecified",S,Indiana
1,1,80.21,2024-06-25 02:44:48,Diagnosis,Other and unspecified cirrhosis of liver,S,Indiana
2,1,80.21,2024-06-25 02:44:48,Diagnosis,Other specified inflammatory liver diseases,P,Indiana
3,1,80.21,2024-06-25 02:44:48,Diagnosis,Type 2 diabetes mellitus without complications,S,Indiana
4,1,80.21,2024-06-26 03:20:15,Diagnosis,Unspecified atrial fibrillation and atrial flu...,P,Indiana
...,...,...,...,...,...,...,...
866297,169062,74.18,2005-10-14 22:20:25,Diagnosis,Encounter for palliative care,5,Florida
866298,169062,74.18,2005-10-14 22:20:25,Diagnosis,Melena,6,Florida
866299,169062,74.18,2005-10-14 22:20:25,Diagnosis,Other restrictive cardiomyopathy,3,Florida
866300,169063,78.49,2006-02-04 23:24:49,Diagnosis,Not specified,P,New York


# Rounding off the Age 

In [107]:
# Round the age to the nearest whole number and store it as an integer column.
# Vectorized replacement for the per-row `apply(lambda x: round(x,))`: Series.round() uses
# the same round-half-to-even rule as Python's round(), and astype('int64') yields the
# integer dtype the row-wise version produced.
conditions_train_data['Age at condition documentation'] = (
    conditions_train_data['Age at condition documentation'].round().astype('int64')
)

In [108]:
conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,1,80,2024-06-25 02:44:48,Diagnosis,"Hyperlipidemia, unspecified",S,Indiana
1,1,80,2024-06-25 02:44:48,Diagnosis,Other and unspecified cirrhosis of liver,S,Indiana
2,1,80,2024-06-25 02:44:48,Diagnosis,Other specified inflammatory liver diseases,P,Indiana
3,1,80,2024-06-25 02:44:48,Diagnosis,Type 2 diabetes mellitus without complications,S,Indiana
4,1,80,2024-06-26 03:20:15,Diagnosis,Unspecified atrial fibrillation and atrial flu...,P,Indiana
...,...,...,...,...,...,...,...
866297,169062,74,2005-10-14 22:20:25,Diagnosis,Encounter for palliative care,5,Florida
866298,169062,74,2005-10-14 22:20:25,Diagnosis,Melena,6,Florida
866299,169062,74,2005-10-14 22:20:25,Diagnosis,Other restrictive cardiomyopathy,3,Florida
866300,169063,78,2006-02-04 23:24:49,Diagnosis,Not specified,P,New York


In [109]:
# Count the rows per patient and keep only the patients that have more than 60 of them.
patient_row_counts = conditions_train_data["Internalpatientid"].value_counts()
df = patient_row_counts[patient_row_counts > 60]

In [110]:
# Turn the value_counts Series into a two-column DataFrame (patient ID and its row count).
df = df.reset_index() 

In [111]:
# Give the two columns explicit names: the patient ID and the number of rows counted for it.
df.columns = ["Internalpatientid","count_conditions"]

In [112]:
# Discard the count column; only the patient IDs are needed from here on.
df = df.drop(columns='count_conditions')

In [113]:
df

Unnamed: 0,Internalpatientid
0,92630
1,3217
2,113648
3,46281
4,106998
...,...
1178,157691
1179,154241
1180,501
1181,72913


In [114]:
# Left-merge with `df` on 'Internalpatientid'. With `df` reduced to just that ID column
# (as done in the preceding cells), the merge adds no new columns.
merged_df = conditions_train_data.merge(df, on='Internalpatientid', how='left')

# Keep only the rows whose patient ID does NOT appear in `df`, i.e. exclude the patients
# listed there. `.copy()` makes the result an independent DataFrame, so later in-place
# edits do not operate on a slice of `merged_df`.
filtered_df = merged_df[~merged_df['Internalpatientid'].isin(df['Internalpatientid'])].copy()

# Display the filtered rows
filtered_df

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,1,80,2024-06-25 02:44:48,Diagnosis,"Hyperlipidemia, unspecified",S,Indiana
1,1,80,2024-06-25 02:44:48,Diagnosis,Other and unspecified cirrhosis of liver,S,Indiana
2,1,80,2024-06-25 02:44:48,Diagnosis,Other specified inflammatory liver diseases,P,Indiana
3,1,80,2024-06-25 02:44:48,Diagnosis,Type 2 diabetes mellitus without complications,S,Indiana
4,1,80,2024-06-26 03:20:15,Diagnosis,Unspecified atrial fibrillation and atrial flu...,P,Indiana
...,...,...,...,...,...,...,...
866297,169062,74,2005-10-14 22:20:25,Diagnosis,Encounter for palliative care,5,Florida
866298,169062,74,2005-10-14 22:20:25,Diagnosis,Melena,6,Florida
866299,169062,74,2005-10-14 22:20:25,Diagnosis,Other restrictive cardiomyopathy,3,Florida
866300,169063,78,2006-02-04 23:24:49,Diagnosis,Not specified,P,New York


In [115]:
# `df_new` is a second name for the same object as `filtered_df` (plain assignment, not a copy),
# so in-place changes made through one name are visible through the other.
df_new = filtered_df

In [116]:
df_new

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,1,80,2024-06-25 02:44:48,Diagnosis,"Hyperlipidemia, unspecified",S,Indiana
1,1,80,2024-06-25 02:44:48,Diagnosis,Other and unspecified cirrhosis of liver,S,Indiana
2,1,80,2024-06-25 02:44:48,Diagnosis,Other specified inflammatory liver diseases,P,Indiana
3,1,80,2024-06-25 02:44:48,Diagnosis,Type 2 diabetes mellitus without complications,S,Indiana
4,1,80,2024-06-26 03:20:15,Diagnosis,Unspecified atrial fibrillation and atrial flu...,P,Indiana
...,...,...,...,...,...,...,...
866297,169062,74,2005-10-14 22:20:25,Diagnosis,Encounter for palliative care,5,Florida
866298,169062,74,2005-10-14 22:20:25,Diagnosis,Melena,6,Florida
866299,169062,74,2005-10-14 22:20:25,Diagnosis,Other restrictive cardiomyopathy,3,Florida
866300,169063,78,2006-02-04 23:24:49,Diagnosis,Not specified,P,New York


# Dropping all unnecessary columns

In [117]:
# Drop every column that is not needed for the per-patient condition text.
# Re-binding `df_new` (instead of inplace=True) leaves the aliased `filtered_df` untouched,
# and errors='ignore' lets the cell be re-run after the columns are already gone.
df_new = df_new.drop(
    columns=[
        "Age at condition documentation",
        "Condition documented date",
        "Condition type",
        "Diagnosis sequence number or rank",
        "State",
    ],
    errors='ignore',
)

In [118]:
# No-op: the name is re-bound to the same DataFrame object, so nothing changes.
df_new = df_new

In [119]:
df_new

Unnamed: 0,Internalpatientid,Condition code icd10 subcategory
0,1,"Hyperlipidemia, unspecified"
1,1,Other and unspecified cirrhosis of liver
2,1,Other specified inflammatory liver diseases
3,1,Type 2 diabetes mellitus without complications
4,1,Unspecified atrial fibrillation and atrial flu...
...,...,...
866297,169062,Encounter for palliative care
866298,169062,Melena
866299,169062,Other restrictive cardiomyopathy
866300,169063,Not specified


In [120]:
# Collapse the frame to one row per patient: within each 'Internalpatientid' group the
# remaining column values are concatenated into a single comma-separated string.
# Missing values are removed with dropna() first, so only non-null values reach ','.join.
df_grouped = df_new.groupby('Internalpatientid').agg(
    lambda condition_names: ','.join(condition_names.dropna())
)

# Move 'Internalpatientid' from the group index back into a regular column.
df_grouped_condition = df_grouped.reset_index()
df_grouped_condition

Unnamed: 0,Internalpatientid,Condition code icd10 subcategory
0,1,"Hyperlipidemia, unspecified,Other and unspecif..."
1,2,"Not specified,Fracture of coccyx"
2,3,"Acquired absence of leg below knee,Encounter f..."
3,4,"Unspecified abdominal pain,Systolic (congestiv..."
4,5,Encounter for screening for infectious and par...
...,...,...
132064,169060,Encounter for other specified aftercare
132065,169061,Other specified counseling
132066,169062,"Heart failure, unspecified,Heart failure, unsp..."
132067,169063,Not specified


# Flagging each patient as CV (1) or non-CV (0)

In [121]:
# Heart-related words and phrases; a patient's condition text is later checked for
# any of these (see where `phrases` is used).
phrases = [
    "Systolic",
    "congestive",
    "Heart",
    "Diastolic",
    "Cardiogenic",
    "Hypertensive heart",
    "myocardial",
    "STEMI",
    "NSTEMI",
    "Unstable angina",
    "chronic ischemic heart disease",
    "Atherosclerotic",
    "native coronary artery",
    "Atrial fibrillation",
    "Atrial flutter",
    "Supraventricular tachycardia",
    "Ventricular tachycardia",
]

In [122]:
# Create the 'cv_df' column: 1 when any entry of `phrases` occurs (case-insensitively) in the
# patient's joined condition text, otherwise 0.
# The phrases are lower-cased once up front and each text once per row, instead of
# repeating both conversions for every (row, phrase) pair.
_phrases_lower = [phrase.lower() for phrase in phrases]


def _has_cv_phrase(text):
    """Return 1 if `text` contains any phrase from `_phrases_lower` (case-insensitive), else 0."""
    lowered = text.lower()
    return 1 if any(phrase in lowered for phrase in _phrases_lower) else 0


df_grouped_condition['cv_df'] = df_grouped_condition['Condition code icd10 subcategory'].apply(_has_cv_phrase)
df_grouped_condition

Unnamed: 0,Internalpatientid,Condition code icd10 subcategory,cv_df
0,1,"Hyperlipidemia, unspecified,Other and unspecif...",1
1,2,"Not specified,Fracture of coccyx",0
2,3,"Acquired absence of leg below knee,Encounter f...",1
3,4,"Unspecified abdominal pain,Systolic (congestiv...",1
4,5,Encounter for screening for infectious and par...,0
...,...,...,...
132064,169060,Encounter for other specified aftercare,0
132065,169061,Other specified counseling,0
132066,169062,"Heart failure, unspecified,Heart failure, unsp...",1
132067,169063,Not specified,0


In [123]:
# The concatenated condition text is no longer needed once the flag exists; keep the ID and 'cv_df' only.
df_grouped_condition = df_grouped_condition.drop(columns=['Condition code icd10 subcategory'])

In [125]:
# No-op: the name is re-bound to the same DataFrame object, so nothing changes.
df_grouped_condition = df_grouped_condition

In [126]:
df_grouped_condition

Unnamed: 0,Internalpatientid,cv_df
0,1,1
1,2,0
2,3,1
3,4,1
4,5,0
...,...,...
132064,169060,0
132065,169061,0
132066,169062,1
132067,169063,0


In [128]:
# Write the per-patient flag table to CSV.
# NOTE(review): the destination is an absolute, cluster- and user-specific path; and because
# `index` is left at its default, the row index is written as an extra leading column.
output_csv_path = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_train/df_cv_noncv_train.csv'
df_grouped_condition.to_csv(output_csv_path)