# **Cardiac and Non-Cardiac Patients Notebook**

In [33]:
# Workspace is the handle to the Azure ML workspace; Dataset.Tabular exposes the
# TabularDatasetFactory methods (e.g. Dataset.Tabular.from_delimited_files())
# used to create TabularDataset objects.
from azureml.core import Workspace, Dataset

# Identifiers of the Azure ML workspace to connect to.
# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'  # can be looked up from the studio launch page
resource_group = 'VChamp-Team3'
workspace_name = 'vchamp-team3'

# Construct the workspace object from the three identifiers above.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [34]:
# Look up the datastore named 'data_team3_synthetic_test' among the workspace's datastores.
datastore = workspace.datastores['data_team3_synthetic_test'] 

In [71]:
# Create a TabularDataset that represents the delimited file 'conditions_test.csv'
# stored in the datastore. No pandas conversion happens in this cell.
dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, 'conditions_test.csv')],
)

In [72]:
# Convert the Azure TabularDataset into a pandas DataFrame, the format used by the rest of the notebook.
# NOTE(review): the variable is named "train" although the file loaded above is 'conditions_test.csv'.
conditions_train_data= dataset.to_pandas_dataframe()

# Importing the library

In [74]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Suppress every warning raised during execution.
# NOTE(review): a blanket 'ignore' also hides pandas warnings that may point at real problems.
import warnings
warnings.filterwarnings('ignore')

# Write 'max_buffer_size' = 7294967296 into the 'notebook' configuration section.
# NOTE(review): meant to raise the notebook's memory/buffer limit — confirm the setting is honoured when set this way.
# `cm` receives the return value of update(), not the ConfigManager instance itself.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Set the maximum number of rows to be displayed to 1163
pd.set_option('display.max_rows', 1163)

# Elimination of the unnamed columns

In [75]:
# Discard the unwanted 'Column1' column
conditions_train_data = conditions_train_data.drop(columns=['Column1'])

In [76]:
# NOTE(review): self-assignment — this statement has no effect and could be removed.
conditions_train_data =  conditions_train_data

# Sorting by patient ID and age

In [80]:
# Order the rows by patient id first and by age at documentation second (both ascending)
sort_keys = ["Internalpatientid", "Age at condition documentation"]
conditions_train_data = conditions_train_data.sort_values(by=sort_keys)

# Checking for missing values

In [83]:
# Count the missing values in every column, then print the counts under a heading
missing_per_column = conditions_train_data.isna().sum()
print("Training set missing values:\n", missing_per_column)

Training set missing values:
 Internalpatientid                         0
Age at condition documentation            0
Condition documented date                 0
Condition type                            0
Condition code icd10 subcategory          0
Diagnosis sequence number or rank    960030
State                                     0
dtype: int64


# Formatting the age column to two decimal places

In [84]:
# Turn every age into a string with exactly two decimal places (format 00.00)
age_col = "Age at condition documentation"
conditions_train_data[age_col] = conditions_train_data[age_col].map(lambda age: "{:.2f}".format(age))

In [85]:
# Display the frame to inspect the formatted age column
conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
15352101,6,74.77,2001-06-13 08:56:37,Diagnosis,"Heart failure, unspecified",S,Texas
15352102,6,74.77,2001-06-13 08:56:37,Diagnosis,Atherosclerotic heart disease of native corona...,P,Texas
15352103,6,74.77,2001-06-13 08:56:37,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Texas
22032759,6,74.77,2001-06-13 08:57:38,Diagnosis,"Heart failure, unspecified",S,Texas
22032760,6,74.77,2001-06-13 08:57:38,Diagnosis,Unspecified atrial fibrillation and atrial flu...,S,Texas
...,...,...,...,...,...,...,...
9577050,169065,53.32,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona
9577051,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona
9577052,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona
9577053,169065,53.32,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona


In [86]:
# Cast the 'Age at condition documentation' column back to float; other columns keep their dtypes
conditions_train_data = conditions_train_data.astype({"Age at condition documentation": float})

# Taking the Max age

In [88]:
# Highest documented age for each patient, as a two-column frame (patient id, age)
max_ages = (
    conditions_train_data
    .groupby('Internalpatientid')['Age at condition documentation']
    .max()
    .reset_index()
)

# Inner-join on (patient id, age) so that only the rows recorded at each
# patient's highest age remain
conditions_train_data = conditions_train_data.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at condition documentation'],
)

conditions_train_data

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,6,88.10,2014-10-16 01:23:22,Diagnosis,Other specified counseling,P,Texas
1,7,74.37,2020-11-22 12:50:02,Diagnosis,Dental caries on smooth surface,P,Nebraska
2,9,51.88,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
3,9,51.88,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
4,9,51.88,2000-04-26 17:39:58,Diagnosis,Nonrheumatic mitral (valve) insufficiency,4,Texas
...,...,...,...,...,...,...,...
225895,169065,53.32,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona
225896,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona
225897,169065,53.32,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona
225898,169065,53.32,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona


# Rounding off the Age column

In [89]:
# Round every age to an integer with Python's built-in round()
age_column = 'Age at condition documentation'
conditions_train_data[age_column] = conditions_train_data[age_column].map(round)

In [91]:
# Rows per patient; keep only the patients that have more than 60 rows
rows_per_patient = conditions_train_data["Internalpatientid"].value_counts()
df = rows_per_patient[rows_per_patient > 60]

In [92]:
# Turn the index of the value-counts result (the patient ids) into a regular column
df = df.reset_index() 

In [93]:
# Name the two columns explicitly: the patient id and the number of rows for that patient
df.columns = ["Internalpatientid","count_conditions"]

In [94]:
# Only the patient ids are used from here on, so remove the count column
df = df.drop(columns=['count_conditions'])

In [97]:
# Number of patients with more than 60 rows
len(df)

327

In [98]:
# Left-merge with `df` on 'Internalpatientid'. `df` only carries the ids of the
# patients with more than 60 rows, so the merge adds no new columns; it is kept
# so that `merged_df` remains available.
merged_df = conditions_train_data.merge(df, on='Internalpatientid', how='left')

# Keep the rows whose patient id is NOT listed in `df`, i.e. exclude those patients.
# `.copy()` makes the result an independent DataFrame, so later in-place edits
# do not operate on a filtered slice of `merged_df`.
is_excluded = merged_df['Internalpatientid'].isin(df['Internalpatientid'])
filtered_df = merged_df[~is_excluded].copy()

# Display the remaining rows
filtered_df

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,6,88,2014-10-16 01:23:22,Diagnosis,Other specified counseling,P,Texas
1,7,74,2020-11-22 12:50:02,Diagnosis,Dental caries on smooth surface,P,Nebraska
2,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
3,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
4,9,52,2000-04-26 17:39:58,Diagnosis,Nonrheumatic mitral (valve) insufficiency,4,Texas
...,...,...,...,...,...,...,...
225895,169065,53,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona
225896,169065,53,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona
225897,169065,53,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona
225898,169065,53,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona


In [99]:
# Alias, not a copy: `df_new` and `filtered_df` refer to the same DataFrame object.
df_new = filtered_df

In [100]:
# Display the filtered frame under its new name
df_new

Unnamed: 0,Internalpatientid,Age at condition documentation,Condition documented date,Condition type,Condition code icd10 subcategory,Diagnosis sequence number or rank,State
0,6,88,2014-10-16 01:23:22,Diagnosis,Other specified counseling,P,Texas
1,7,74,2020-11-22 12:50:02,Diagnosis,Dental caries on smooth surface,P,Nebraska
2,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
3,9,52,2000-04-26 17:39:58,Diagnosis,Hypertensive heart disease with heart failure ...,5,Texas
4,9,52,2000-04-26 17:39:58,Diagnosis,Nonrheumatic mitral (valve) insufficiency,4,Texas
...,...,...,...,...,...,...,...
225895,169065,53,2011-06-11 16:12:21,Diagnosis,Sepsis due to Hemophilus influenzae,5,Arizona
225896,169065,53,2011-06-11 16:12:21,Diagnosis,Acute kidney failure with tubular necrosis,7,Arizona
225897,169065,53,2011-06-11 16:12:21,Diagnosis,Acute respiratory failure,1,Arizona
225898,169065,53,2011-06-11 16:12:21,Diagnosis,Pleural effusion in other conditions classifie...,10,Arizona


# Dropping all unnecessary columns

In [101]:
# Remove the columns that are not needed for the cardiac / non-cardiac labelling.
# `df_new` is rebound to the result instead of being modified in place, so the
# DataFrame it was taken from (`filtered_df`) is left untouched and no filtered
# slice is mutated.
df_new = df_new.drop(
    columns=[
        "Age at condition documentation",
        "Condition documented date",
        "Condition type",
        "Diagnosis sequence number or rank",
        "State",
    ]
)

In [102]:
# NOTE(review): self-assignment — this statement has no effect and could be removed.
df_new = df_new

In [107]:
# Count the distinct patient ids present in df_new
df_new["Internalpatientid"].nunique()

34486

In [112]:
# Display df_new after the column removal
df_new

Unnamed: 0,Internalpatientid,Condition code icd10 subcategory
0,6,Other specified counseling
1,7,Dental caries on smooth surface
2,9,Hypertensive heart disease with heart failure ...
3,9,Hypertensive heart disease with heart failure ...
4,9,Nonrheumatic mitral (valve) insufficiency
...,...,...
225895,169065,Sepsis due to Hemophilus influenzae
225896,169065,Acute kidney failure with tubular necrosis
225897,169065,Acute respiratory failure
225898,169065,Pleural effusion in other conditions classifie...


# Importing the inpatient test file

In [105]:
# Load the inpatient admission test file into a DataFrame.
# NOTE(review): absolute, user- and cluster-specific path — consider a configurable base directory.
df_inpa = pd.read_csv('/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_test/Potential_files_test/df_inpatient_admission_test.csv')

In [106]:
# Display the loaded inpatient frame
df_inpa 

Unnamed: 0.1,Unnamed: 0,Internalpatientid,inpatient_admissions_First_listed_discharge_diagnosis_icd10_subcategory,inpatient_admissions_Second_listed_discharge_diagnosis_icd10_subcategory
0,0,7,68_Intestinal adhesions [bands] with obstructi...,68_Type 2 diabetes mellitus without complications
1,1,9,"52_Other restrictive cardiomyopathy,52_Acute k...","52_Nonrheumatic mitral (valve) insufficiency,5..."
2,2,12,74_Other forms of chronic ischemic heart disease,"74_Volume depletion, unspecified"
3,3,17,"82_Acute kidney failure, unspecified",82_Hypertensive chronic kidney disease with st...
4,4,22,61_Unspecified complication of internal prosth...,"61_Not specified,61_Unspecified complication o..."
...,...,...,...,...
22250,22250,168995,75_Chronic obstructive pulmonary disease with ...,75_Acute and chronic respiratory failure
22251,22251,169011,"69_Major depressive disorder, recurrent",69_Other symptoms and signs involving emotiona...
22252,22252,169037,"85_Fracture of acetabulum,85_(Censored)","85_Fracture of acetabulum,85_Fracture of pubis"
22253,22253,169059,80_Embolism and thrombosis of other specified ...,80_Other hyperlipidemia


In [108]:
# Combine the first- and second-listed discharge diagnoses into one comma-separated string.
first_listed = df_inpa['inpatient_admissions_First_listed_discharge_diagnosis_icd10_subcategory']
second_listed = df_inpa['inpatient_admissions_Second_listed_discharge_diagnosis_icd10_subcategory']

# String concatenation yields NaN as soon as either side is NaN, which would
# discard the diagnosis that is present. Fall back to whichever single value
# exists; rows where both values are missing stay NaN.
df_inpa['Inpatient_admission_first_and_second_listed'] = (
    (first_listed + ',' + second_listed)
    .fillna(first_listed)
    .fillna(second_listed)
)

In [110]:
# Drop the "Unnamed: 0" column and the two source diagnosis columns, now that
# their content lives in the combined column
columns_to_drop = [
    "Unnamed: 0",
    "inpatient_admissions_First_listed_discharge_diagnosis_icd10_subcategory",
    "inpatient_admissions_Second_listed_discharge_diagnosis_icd10_subcategory",
]
df_f_s = df_inpa.drop(columns=columns_to_drop)

In [111]:
# Display the reduced inpatient frame
df_f_s 

Unnamed: 0,Internalpatientid,Inpatient_admission_first_and_second_listed
0,7,68_Intestinal adhesions [bands] with obstructi...
1,9,"52_Other restrictive cardiomyopathy,52_Acute k..."
2,12,74_Other forms of chronic ischemic heart disea...
3,17,"82_Acute kidney failure, unspecified,82_Hypert..."
4,22,61_Unspecified complication of internal prosth...
...,...,...
22250,168995,75_Chronic obstructive pulmonary disease with ...
22251,169011,"69_Major depressive disorder, recurrent,69_Oth..."
22252,169037,"85_Fracture of acetabulum,85_(Censored),85_Fra..."
22253,169059,80_Embolism and thrombosis of other specified ...


# Merging the condition file and inpatients file 

In [113]:
# Outer join on the patient id: patients that appear in either frame are kept
df_final = df_new.merge(df_f_s, how="outer", on="Internalpatientid")

In [114]:
# Display the merged frame
df_final

Unnamed: 0,Internalpatientid,Condition code icd10 subcategory,Inpatient_admission_first_and_second_listed
0,6,Other specified counseling,
1,7,Dental caries on smooth surface,68_Intestinal adhesions [bands] with obstructi...
2,9,Hypertensive heart disease with heart failure ...,"52_Other restrictive cardiomyopathy,52_Acute k..."
3,9,Hypertensive heart disease with heart failure ...,"52_Other restrictive cardiomyopathy,52_Acute k..."
4,9,Nonrheumatic mitral (valve) insufficiency,"52_Other restrictive cardiomyopathy,52_Acute k..."
...,...,...,...
199574,167710,,"73_Malignant neoplasm of upper lobe, bronchus ..."
199575,167963,,84_Hypertensive heart and chronic kidney disea...
199576,168458,,"98_Acute kidney failure, unspecified,98_Unspec..."
199577,168778,,"85_Other and unspecified encephalopathy,85_Oth..."


In [115]:
# Number of missing values in each column of the merged frame
df_final.isna().sum()

Internalpatientid                                  0
Condition code icd10 subcategory                 327
Inpatient_admission_first_and_second_listed    44342
dtype: int64

In [116]:
# Where the condition text is missing, fall back to the inpatient diagnosis text of the same row
condition_text = df_final['Condition code icd10 subcategory']
inpatient_text = df_final['Inpatient_admission_first_and_second_listed']
df_final['Condition code icd10 subcategory'] = condition_text.fillna(inpatient_text)

In [120]:
# Missing values per column once the condition column has been filled
df_final.isna().sum()

Internalpatientid                                  0
Condition code icd10 subcategory                   0
Inpatient_admission_first_and_second_listed    44342
dtype: int64

In [121]:
# The inpatient text has served as the fallback, so remove its column
df_final = df_final.drop(columns=["Inpatient_admission_first_and_second_listed"])

In [125]:
def _join_non_null(values):
    """Concatenate the non-missing entries of a column with commas."""
    # Missing values are dropped first so that join() only receives text.
    return ','.join(values.dropna())


# One row per patient: all values of each remaining column are merged into a
# single comma-separated string.
df_grouped = df_final.groupby('Internalpatientid').agg(_join_non_null)

# Move 'Internalpatientid' from the index back into a regular column, then display the result
df_grouped_condition = df_grouped.reset_index()
df_grouped_condition

Unnamed: 0,Internalpatientid,Condition code icd10 subcategory
0,6,Other specified counseling
1,7,Dental caries on smooth surface
2,9,Hypertensive heart disease with heart failure ...
3,12,"Polyosteoarthritis, unspecified ; Osteoarthrit..."
4,17,"Encounter for other specified aftercare,Acute ..."
...,...,...
34808,169037,"Other forms of chronic ischemic heart disease,..."
34809,169045,"Tinnitus,Sensorineural hearing loss, bilateral..."
34810,169058,"Malignant neoplasm of upper lobe, bronchus or ..."
34811,169059,Problem related to unspecified psychosocial ci...


# Labelling patients as cardiovascular (1) or non-cardiovascular (0)

In [126]:
# Heart-related keywords; a patient whose condition text contains any of them
# is labelled as cardiovascular.
phrases = [
    'Systolic',
    'congestive',
    'Heart',
    'Diastolic',
    'Cardiogenic',
    'Hypertensive heart',
    'myocardial',
    'STEMI',
    'NSTEMI',
    'Unstable angina',
    'chronic ischemic heart disease',
    'Atherosclerotic',
    'native coronary artery',
    'Atrial fibrillation',
    'Atrial flutter',
    'Supraventricular tachycardia',
    'Ventricular tachycardia',
]

In [127]:
# Lower-case the keywords once instead of once per row and per keyword.
lowered_phrases = [phrase.lower() for phrase in phrases]


def _cv_flag(text):
    """Return 1 if `text` contains any keyword (case-insensitive substring match), else 0."""
    lowered_text = text.lower()  # lower-cased once per row
    return 1 if any(keyword in lowered_text for keyword in lowered_phrases) else 0


# Create the 'cv_df' column: 1 = at least one heart-related keyword found, 0 = none
df_grouped_condition['cv_df'] = df_grouped_condition['Condition code icd10 subcategory'].apply(_cv_flag)
df_grouped_condition

Unnamed: 0,Internalpatientid,Condition code icd10 subcategory,cv_df
0,6,Other specified counseling,0
1,7,Dental caries on smooth surface,0
2,9,Hypertensive heart disease with heart failure ...,1
3,12,"Polyosteoarthritis, unspecified ; Osteoarthrit...",1
4,17,"Encounter for other specified aftercare,Acute ...",0
...,...,...,...
34808,169037,"Other forms of chronic ischemic heart disease,...",1
34809,169045,"Tinnitus,Sensorineural hearing loss, bilateral...",0
34810,169058,"Malignant neoplasm of upper lobe, bronchus or ...",1
34811,169059,Problem related to unspecified psychosocial ci...,0


In [128]:
# Only the patient id and the cv_df flag are kept; remove the condition text column
df_grouped_condition = df_grouped_condition.drop(columns=['Condition code icd10 subcategory'])

In [129]:
# NOTE(review): self-assignment — this statement has no effect and could be removed.
df_grouped_condition = df_grouped_condition

In [131]:
# Display the final frame: one row per patient with its cv_df flag
df_grouped_condition

Unnamed: 0,Internalpatientid,cv_df
0,6,0
1,7,0
2,9,1
3,12,1
4,17,0
...,...,...
34808,169037,1
34809,169045,0
34810,169058,1
34811,169059,0


# Saving the CSV file

In [133]:
# Write the patient id / cv_df table to CSV.
# NOTE(review): to_csv is called without index=False, so the row index is written as an extra unnamed first column.
# NOTE(review): absolute, user- and cluster-specific path — consider a configurable output directory.
df_grouped_condition.to_csv('/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_test/df_cv_non_cv_test_V1.csv')