In [1]:
#A class attribute that provides access to the TabularDatasetFactory methods for creating new TabularDataset objects. 
#Usage: Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

# Identifiers of the Azure ML workspace this notebook connects to.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'  # subscription id (shown in the Azure ML studio)
resource_group = 'VChamp-Team3'    # resource group name
workspace_name = 'vchamp-team3'    # workspace name

# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
# Build the Workspace handle, passing each identifier by keyword.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Look up the datastore named 'data_team3_synthetic_test' among the
# datastores exposed by the workspace.
datastore = workspace.datastores['data_team3_synthetic_test'] 

In [3]:
# Build a TabularDataset from the delimited file 'conditions_test.csv' in the datastore.
# Nothing is converted to pandas here; that is done later with to_pandas_dataframe().
conditions_csv = (datastore, 'conditions_test.csv')
dataset = Dataset.Tabular.from_delimited_files(path=[conditions_csv])

In [38]:
# Convert the Azure ML TabularDataset into a pandas DataFrame so that the
# remaining cells can work on it with pandas.
conditions_test_data= dataset.to_pandas_dataframe()

In [39]:
# Display the frame as loaded
conditions_test_data

Unnamed: 0,Column1,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,74,100,52.885148,2014-08-16 12:31:20,Diagnosis,Motor neuron disease,S,New York
1,75,100,52.885148,2014-08-16 12:31:20,Diagnosis,Encounter for other specified aftercare,P,New York
2,76,100,54.585028,2016-04-28 19:51:50,Diagnosis,Motor neuron disease,P,New York
3,77,100,55.055466,2016-10-17 18:33:07,Diagnosis,Other specified counseling,P,New York
4,78,100,55.072580,2016-10-24 00:40:17,Diagnosis,Motor neuron disease,P,New York
...,...,...,...,...,...,...,...,...
23283462,112205170,99997,85.433634,2014-07-28 23:50:09,Diagnosis,"Other specified personal risk factors, not els...",P,North Carolina
23283463,112205171,99997,85.709524,2014-11-06 19:56:38,Problem,Sleep apnea,,North Carolina
23283464,112205172,99997,88.107636,2017-04-01 08:10:57,Diagnosis,"Peripheral vascular disease, unspecified",P,North Carolina
23283465,112205173,99997,89.688756,2018-10-31 05:46:10,Diagnosis,Other specified counseling,P,North Carolina


In [6]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Silence every warning raised from here on.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning — consider a narrower filter.
import warnings
warnings.filterwarnings('ignore')

# Store max_buffer_size = 7294967296 under the 'notebook' config section.
# NOTE(review): intended to raise a memory limit — confirm the running server actually picks this setting up.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Set the maximum number of rows pandas displays to 1163
pd.set_option('display.max_rows', 1163)

# **Elimination of the unnamed columns**

In [40]:
# Drop the 'Column1' column in place; no later cell in this notebook reads it.
conditions_test_data.drop(columns=['Column1'], inplace=True)

In [41]:
# NOTE(review): self-assignment — this statement has no effect and the cell can be removed.
conditions_test_data =  conditions_test_data

# Sorting by patient ID and age

In [43]:
# Order rows by patient id, then by age at documentation (both ascending), in place.
sort_keys = ["Internalpatientid", "Age at condition documentation"]
conditions_test_data.sort_values(by=sort_keys, inplace=True)

# Checking for missing values

In [44]:
# Count missing values per column. This notebook processes the test split
# (conditions_test.csv), so the label says "Test" rather than "Training".
print("Test set missing values:\n", conditions_test_data.isna().sum())

Training set missing values:
 Internalpatientid                         0
Age at condition documentation            0
Condition documented date                 0
Condition type                            0
Condition code icd10 subcategory          0
Diagnosis sequence number or rank    960030
State                                     0
dtype: int64


# Formatting the age to two decimal places

In [45]:
# Render each age as a string with exactly two decimal places (e.g. 52.885148 -> "52.89").
# The column holds strings after this cell; a later cell casts it back to float.
age_col = "Age at condition documentation"
conditions_test_data[age_col] = conditions_test_data[age_col].map(lambda age: format(age, ".2f"))

In [46]:
# Display the frame after sorting and two-decimal age formatting
conditions_test_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
15352101,6,74.77,2001-06-13 08:56:37,Diagnosis,"Heart failure, unspecified",S,Texas
15352102,6,74.77,2001-06-13 08:56:37,Diagnosis,Atherosclerotic heart disease of native corona...,P,Texas
15352103,6,74.77,2001-06-13 08:56:37,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Texas
22032759,6,74.77,2001-06-13 08:57:38,Diagnosis,"Heart failure, unspecified",S,Texas
22032760,6,74.77,2001-06-13 08:57:38,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Texas
...,...,...,...,...,...,...,...
9577050,169065,53.32,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona
9577051,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona
9577052,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona
9577053,169065,53.32,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona


In [47]:
# The age column currently holds two-decimal strings; cast it back to float.
age_as_text = conditions_test_data["Age at condition documentation"]
conditions_test_data["Age at condition documentation"] = age_as_text.astype(float)

In [48]:
# Display the frame after the age column was cast back to float
conditions_test_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
15352101,6,74.77,2001-06-13 08:56:37,Diagnosis,"Heart failure, unspecified",S,Texas
15352102,6,74.77,2001-06-13 08:56:37,Diagnosis,Atherosclerotic heart disease of native corona...,P,Texas
15352103,6,74.77,2001-06-13 08:56:37,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Texas
22032759,6,74.77,2001-06-13 08:57:38,Diagnosis,"Heart failure, unspecified",S,Texas
22032760,6,74.77,2001-06-13 08:57:38,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Texas
...,...,...,...,...,...,...,...
9577050,169065,53.32,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona
9577051,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona
9577052,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona
9577053,169065,53.32,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona


# Keeping only the records at each patient's maximum age

In [49]:
# For every patient, find the highest 'Age at condition documentation' on record
max_ages = conditions_test_data.groupby('Internalpatientid', as_index=False)['Age at condition documentation'].max()

# Inner-join on (patient id, age) so that only the rows documented at that highest age survive
conditions_test_data = conditions_test_data.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at condition documentation'],
)

conditions_test_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,6,88.10,2014-10-16 01:23:22,Diagnosis,Other specified counseling,P,Texas
1,7,74.37,2020-11-22 12:50:02,Diagnosis,Dental caries on smooth surface,P,Nebraska
2,9,51.88,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
3,9,51.88,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
4,9,51.88,2000-04-26 17:39:58,Diagnosis,Nonrheumatic mitral (valve) insufficiency,4,Texas
...,...,...,...,...,...,...,...
225895,169065,53.32,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona
225896,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona
225897,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona
225898,169065,53.32,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona


# Rounding off the Age 

In [50]:
# Round the age to the nearest whole number (ties go to the even neighbour) and store it as an integer
rounded_age = conditions_test_data['Age at condition documentation'].round()
conditions_test_data['Age at condition documentation'] = rounded_age.astype("int64")

In [51]:
# Display the frame with the age rounded to whole years
conditions_test_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,6,88,2014-10-16 01:23:22,Diagnosis,Other specified counseling,P,Texas
1,7,74,2020-11-22 12:50:02,Diagnosis,Dental caries on smooth surface,P,Nebraska
2,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
3,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
4,9,52,2000-04-26 17:39:58,Diagnosis,Nonrheumatic mitral (valve) insufficiency,4,Texas
...,...,...,...,...,...,...,...
225895,169065,53,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona
225896,169065,53,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona
225897,169065,53,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona
225898,169065,53,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona


In [52]:
# Number of distinct patients at this stage
conditions_test_data["Internalpatientid"].nunique()

34813

In [53]:
# Count rows per patient and keep only the patients with more than 60 rows
rows_per_patient = conditions_test_data["Internalpatientid"].value_counts()
df = rows_per_patient[rows_per_patient > 60]

In [54]:
# Move the patient ids out of the Series index so that `df` becomes a two-column DataFrame
df = df.reset_index() 

In [55]:
# Name the two columns explicitly: the patient id and its row count
df.columns = ["Internalpatientid","count_conditions"]

In [56]:
# Only the patient ids are needed from here on, so the count column is dropped in place
df.drop(columns=['count_conditions'], inplace=True)

In [57]:
# Number of patients with more than 60 rows
len(df)

327

In [58]:
# Exclude every patient listed in `df` (the ids with more than 60 rows).
# `df` only holds the 'Internalpatientid' column, so left-merging it in first
# adds no information; the frame is filtered directly instead.
is_excluded = conditions_test_data['Internalpatientid'].isin(df['Internalpatientid'])

# .copy() makes filtered_df an independent frame, so later cells can add or drop
# columns on it without operating on a slice of another DataFrame.
filtered_df = conditions_test_data[~is_excluded].copy()

# Show the rows that remain after the exclusion
filtered_df

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,6,88,2014-10-16 01:23:22,Diagnosis,Other specified counseling,P,Texas
1,7,74,2020-11-22 12:50:02,Diagnosis,Dental caries on smooth surface,P,Nebraska
2,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
3,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
4,9,52,2000-04-26 17:39:58,Diagnosis,Nonrheumatic mitral (valve) insufficiency,4,Texas
...,...,...,...,...,...,...,...
225895,169065,53,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona
225896,169065,53,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona
225897,169065,53,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona
225898,169065,53,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona


In [59]:
# Number of distinct patients left after the exclusion
filtered_df["Internalpatientid"].nunique()

34486

In [60]:
conditions_test_data = filtered_df # continue the pipeline with the filtered frame (same object, no copy is made)

# Combining 'Age at condition documentation' and 'Condition code icd10 subcategory' into one column, joined by '_'

In [61]:
# Build the new column 'Condition_code_icd10_subcategory' as "<age>_<condition name>":
# the age is converted to a string first, then joined to the condition name with '_'.
age_prefix = conditions_test_data['Age at condition documentation'].astype(str)
condition_name = conditions_test_data['Condition code icd10 subcategory']
conditions_test_data['Condition_code_icd10_subcategory'] = age_prefix + '_' + condition_name

In [62]:
# Display the frame with the new combined column
conditions_test_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State,Condition_code_icd10_subcategory
0,6,88,2014-10-16 01:23:22,Diagnosis,Other specified counseling,P,Texas,88_Other specified counseling
1,7,74,2020-11-22 12:50:02,Diagnosis,Dental caries on smooth surface,P,Nebraska,74_Dental caries on smooth surface
2,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas,52_Hypertensive heart disease with heart failu...
3,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas,52_Hypertensive heart disease with heart failu...
4,9,52,2000-04-26 17:39:58,Diagnosis,Nonrheumatic mitral (valve) insufficiency,4,Texas,52_Nonrheumatic mitral (valve) insufficiency
...,...,...,...,...,...,...,...,...
225895,169065,53,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona,53_Sepsis due to Hemophilus influenzae
225896,169065,53,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona,53_Acute kidney failure with tubular necrosis
225897,169065,53,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona,53_Acute respiratory failure
225898,169065,53,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona,53_Pleural effusion in other conditions classi...


# Dropping all unnecessary columns

In [63]:
# Keep only 'Internalpatientid' and the combined 'Condition_code_icd10_subcategory' column;
# every other column is dropped in place.
columns_to_drop = [
    "Age at condition documentation",
    "Condition documented date",
    "Condition type",
    "Condition code icd10 subcategory",
    "Diagnosis sequence number or rank",
    "State",
]
conditions_test_data.drop(columns=columns_to_drop, inplace=True)

In [64]:
# NOTE(review): self-assignment — this statement has no effect and the cell can be removed.
conditions_test_data = conditions_test_data

In [65]:
# Display the two remaining columns
conditions_test_data 

Unnamed: 0,Internalpatientid,Condition_code_icd10_subcategory
0,6,88_Other specified counseling
1,7,74_Dental caries on smooth surface
2,9,52_Hypertensive heart disease with heart failu...
3,9,52_Hypertensive heart disease with heart failu...
4,9,52_Nonrheumatic mitral (valve) insufficiency
...,...,...
225895,169065,53_Sepsis due to Hemophilus influenzae
225896,169065,53_Acute kidney failure with tubular necrosis
225897,169065,53_Acute respiratory failure
225898,169065,53_Pleural effusion in other conditions classi...


In [66]:
def _join_non_null(values):
    """Join the non-missing entries of `values` into one comma-separated string."""
    return ','.join(values.dropna())


# Collapse to one row per patient: all 'Condition_code_icd10_subcategory' values of a
# patient are joined into a single string (duplicates are kept, missing values skipped).
# NOTE(review): condition names can contain commas themselves (e.g. "Heart failure,
# unspecified"), so the joined string cannot be split back reliably on ','.
df_grouped = conditions_test_data.groupby('Internalpatientid').agg(_join_non_null)

# Turn the 'Internalpatientid' group index back into a regular column and display the result
df_grouped_condition = df_grouped.reset_index()
df_grouped_condition

Unnamed: 0,Internalpatientid,Condition_code_icd10_subcategory
0,6,88_Other specified counseling
1,7,74_Dental caries on smooth surface
2,9,52_Hypertensive heart disease with heart failu...
3,12,"74_Polyosteoarthritis, unspecified ; Osteoarth..."
4,17,"82_Encounter for other specified aftercare,82_..."
...,...,...
34481,169037,88_Other forms of chronic ischemic heart disea...
34482,169045,"98_Tinnitus,98_Sensorineural hearing loss, bil..."
34483,169058,"79_Malignant neoplasm of upper lobe, bronchus ..."
34484,169059,91_Problem related to unspecified psychosocial...


# Saving csv file 

In [68]:
# Write the one-row-per-patient frame to the hard-coded absolute path below.
# NOTE(review): to_csv is called with its defaults, so the row index is written as an extra first column.
output_csv = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_test/df_conditions_test.csv'
df_grouped_condition.to_csv(output_csv)