In [1]:
#A class attribute that provides access to the TabularDatasetFactory methods for creating new TabularDataset objects. 
#Usage: Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

# Azure ML workspace coordinates (the subscription ID can be found from the studio launch page).
# NOTE(review): these identifiers are hard-coded; consider reading them from configuration.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'
resource_group = 'VChamp-Team3'
workspace_name = 'vchamp-team3'

# Storage account: Algorithmia; resource group: VChamp-Team3; workspace: vchamp-team3.
# Build the Workspace handle that is used later to look up the datastore.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Silence all warnings for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

# Set 'max_buffer_size' in the 'notebook' config section.
# NOTE(review): confirm that a value written through ConfigManager is actually
# honoured by the server; `cm` holds whatever update() returns.
from notebook.services.config import ConfigManager
config_manager = ConfigManager()
cm = config_manager.update('notebook', {'max_buffer_size': 7294967296})

In [3]:
# Look up the datastore registered on the workspace under this name.
# (Alternative name kept for reference: 'data_team3_synthetic_train'.)
datastore = workspace.datastores['data_team3_synthetic_quality_check'] 

In [4]:
# Create a TabularDataset that represents the delimited (CSV) file on the datastore.
outpatient_csv = (datastore, 'outpatient_visits_qual.csv')
dataset = Dataset.Tabular.from_delimited_files(path=[outpatient_csv])

In [5]:
# Convert the Azure TabularDataset into a pandas DataFrame, the format used by
# every subsequent cell.
outpatient_train_data = dataset.to_pandas_dataframe()

In [6]:
# Number of distinct patients in the loaded data.
outpatient_train_data['Internalpatientid'].nunique()

1000

# Elimination of the unnamed columns

In [7]:
# Drop the leftover 'Column1' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: executing it a second time no longer raises KeyError.
outpatient_train_data = outpatient_train_data.drop(columns='Column1', errors='ignore')

In [8]:
# Display the frame after removing 'Column1'.
outpatient_train_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,100012,52.440184,2001-12-08 21:17:36,Not specified,Not specified,TELEPHONE TRIAGE,,,,False,,Texas
1,100012,52.570897,2002-01-25 15:54:35,Not specified,Not specified,EMERGENCY UNIT,,,,,,New Mexico
2,100012,52.603491,2002-02-06 13:49:30,Not specified,Not specified,PRIMARY CARE/MEDICINE,,,,False,,New Mexico
3,100012,52.678927,2002-03-06 03:32:37,Not specified,Not specified,TELEPHONE TRIAGE,,,,,,New Mexico
4,100012,52.706935,2002-03-16 09:14:15,Not specified,Not specified,TELEPHONE TRIAGE,,,,False,,New Mexico
...,...,...,...,...,...,...,...,...,...,...,...,...
627697,99944,79.489989,2012-01-04 14:10:45,Not specified,Not specified,OCCUPATIONAL THERAPY,,,,,,Massachusetts
627698,99944,80.012876,2012-07-13 16:57:06,Not specified,Not specified,HBPC - SOCIAL WORKER,,,,,,Massachusetts
627699,99944,80.320453,2012-11-03 03:01:01,Not specified,Not specified,HBPC Nursing (RN / LP),,,,,,Massachusetts
627700,99944,80.438034,2012-12-16 02:26:00,Not specified,Not specified,HBPC Nursing (RN / LP),,,,,,Massachusetts


# Sorting by patient ID and age at visit

In [9]:
# Order the rows in place by patient ID, then by age at visit (both ascending).
outpatient_train_data.sort_values(
    by=["Internalpatientid", "Age at visit"],
    ascending=True,
    inplace=True,
)

In [10]:
# Display the frame after sorting by patient ID and age at visit.
outpatient_train_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
574990,67,48.273610,2010-08-06 00:06:10,Not specified,Not specified,PRIMARY CARE/MEDICINE,,,,,,California
390106,67,48.278791,2010-08-07 21:33:09,Not specified,Not specified,LABORATORY,,,,,,California
390107,67,48.278793,2010-08-07 21:34:08,Not specified,Not specified,LABORATORY,,,,,,California
390108,67,48.278793,2010-08-07 21:34:08,Not specified,Not specified,LABORATORY,,,,,,California
390791,67,48.284581,2010-08-10 00:20:48,Not specified,Not specified,GENERAL INTERNAL MEDICINE,,,,,,California
...,...,...,...,...,...,...,...,...,...,...,...,...
452341,168899,96.897697,2019-09-22 02:35:38,Not specified,Not specified,ADMIN PAT ACTIVTIES (MASNONCT),,,,,,Pennsylvania
218992,168899,96.905469,2019-09-24 22:46:28,Encounter for other specified aftercare,Unspecified atrial fibrillation and atrial flu...,ANTI-COAGULATION CLINIC,,,,,,Pennsylvania
452342,168899,96.969163,2019-10-18 05:30:02,Not specified,Not specified,TELEPHONE PRIMARY CARE,,,,,,Pennsylvania
219223,168899,97.039824,2019-11-13 01:20:01,Not specified,Not specified,TELEPHONE PRIMARY CARE,,,,,,Pennsylvania


# Checking for missing values

In [11]:
# Per-column count of missing (NaN) entries.
print("set missing values:\n", outpatient_train_data.isnull().sum())

set missing values:
 Internalpatientid                                 0
Age at visit                                      0
Visit start date                                  0
First listed diagnosis icd10 subcategory          0
Second listed diagnosis icd10 subcategory         0
Stop code                                         0
Agentorangeflag                              603925
Combatflag                                   626916
Ionizingradiationflag                        625932
Serviceconnectedflag                         486950
Swasiaconditionsflag                         625217
State                                             0
dtype: int64


* There are no missing values in the potential attributes (PA), but the PA contain many "Not specified" values.

In [12]:
# Frequency of each first-listed diagnosis label,
# restricted to labels that occur more than 200 times.
first_list = outpatient_train_data["First listed diagnosis icd10 subcategory"].value_counts()
first_list = first_list[first_list > 200]
first_list

Not specified                                                   534964
Other specified counseling                                        5571
Encounter for other specified aftercare                           3129
Counseling, unspecified                                           2471
Essential (primary) hypertension                                  2184
                                                                 ...  
Other problems related to housing and economic circumstances       221
Partial loss of teeth                                              218
Hyperlipidemia, unspecified                                        217
Age-related physical debility                                      210
Deposits [accretions] on teeth                                     204
Name: First listed diagnosis icd10 subcategory, Length: 79, dtype: int64

In [13]:
# Frequency of each second-listed diagnosis label,
# restricted to labels that occur more than 200 times.
sec_list = outpatient_train_data["Second listed diagnosis icd10 subcategory"].value_counts()
sec_list = sec_list[sec_list > 200]
sec_list

Not specified                                                                      594477
Essential (primary) hypertension                                                     1551
Unspecified atrial fibrillation and atrial flutter                                   1466
Long term (current) use of anticoagulants and antithrombotics/antiplatelets           874
Type 2 diabetes mellitus without complications                                        846
Encounter for other specified aftercare                                               647
Post-traumatic stress disorder (PTSD)                                                 586
Hyperlipidemia, unspecified                                                           449
Chronic obstructive pulmonary disease, unspecified                                    448
Atherosclerotic heart disease of native coronary artery                               410
Other long term (current) drug therapy                                                336
Systolic (

In [14]:
# Keep only visits that carry at least one recorded diagnosis: a row is removed
# when BOTH diagnosis columns hold the placeholder 'Not specified'.
# A boolean mask replaces drop(<index labels>) so that no extra rows are removed
# if index labels ever repeat; .copy() makes the result an independent frame so
# that later column assignments do not act on a slice.
no_first_dx = outpatient_train_data['First listed diagnosis icd10 subcategory'] == 'Not specified'
no_second_dx = outpatient_train_data['Second listed diagnosis icd10 subcategory'] == 'Not specified'
outpatient_train_data = outpatient_train_data[~(no_first_dx & no_second_dx)].copy()

In [15]:
# Number of distinct patients left after removing rows without any diagnosis.
outpatient_train_data['Internalpatientid'].nunique()

524

# Keeping two decimal places in the age column

In [16]:
# Render every 'Age at visit' value as text with exactly two decimal places.
# The column holds strings after this cell.
outpatient_train_data["Age at visit"] = outpatient_train_data["Age at visit"].map(
    lambda age: format(age, ".2f")
)

In [17]:
# Display the frame with 'Age at visit' formatted to two decimal places.
outpatient_train_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
390118,67,52.29,2014-08-14 15:51:06,Essential (primary) hypertension,"Chronic kidney disease, unspecified",PRIMARY CARE/MEDICINE,,,,,,California
573223,67,52.29,2014-08-14 15:55:09,Encounter for immunization,Not specified,ADMIN PAT ACTIVTIES (MASNONCT),,,,,,California
391029,67,52.30,2014-08-15 14:03:05,Other sleep disorders,Not specified,SLEEP MEDICINE,,,,,,California
391030,67,52.34,2014-08-31 03:47:06,Essential (primary) hypertension,Not specified,PRIMARY CARE/MEDICINE,,,,,,California
574998,67,52.81,2015-02-18 03:14:42,Essential (primary) hypertension,Low back pain,PRIMARY CARE/MEDICINE,,,,,,California
...,...,...,...,...,...,...,...,...,...,...,...,...
445586,168899,96.39,2019-03-18 18:54:32,Encounter for other specified aftercare,Unspecified atrial fibrillation and atrial flu...,ANTI-COAGULATION CLINIC,,,,,,Pennsylvania
443615,168899,96.48,2019-04-23 16:50:17,Other specified counseling,Not specified,TELEPHONE PRIMARY CARE,,,,,,Pennsylvania
218991,168899,96.75,2019-07-31 14:05:25,Encounter for other specified aftercare,Unspecified atrial fibrillation and atrial flu...,ANTI-COAGULATION CLINIC,,,,,,Pennsylvania
443617,168899,96.87,2019-09-13 03:48:33,Encounter for other specified aftercare,Unspecified atrial fibrillation and atrial flu...,ANTI-COAGULATION CLINIC,,,,,,Pennsylvania


In [18]:
# Turn the two-decimal age strings back into floats.
outpatient_train_data = outpatient_train_data.astype({"Age at visit": float})

# Max age

In [19]:
# Highest age at visit recorded for each patient.
max_ages = outpatient_train_data.groupby('Internalpatientid', as_index=False)['Age at visit'].max()

# Inner-join back on (patient, age) so only the rows at each patient's maximum
# age remain. Several rows per patient survive when they share that age value.
outpatient_train_data = outpatient_train_data.merge(
    max_ages,
    on=['Internalpatientid', 'Age at visit'],
    how='inner',
)

outpatient_train_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,67,58.48,2020-10-22 11:49:50,Encounter for immunization,Not specified,GENERAL INTERNAL MEDICINE,,,,,,California
1,200,87.80,2023-02-13 10:31:19,Vascular dementia,Not specified,HBPC - THERAPIST,,,,,,Utah
2,330,74.66,2024-03-01 00:19:37,Encounter for immunization,Not specified,PRIMARY CARE/MEDICINE,,,,False,,Oregon
3,351,85.56,2023-10-18 20:54:27,Other chronic osteomyelitis,Not specified,TELEPHONE/ANCILLARY,,,,False,,Ohio
4,714,67.66,2015-04-17 12:26:12,Hypertensive chronic kidney disease with stage...,Panic disorder [episodic paroxysmal anxiety],PRIMARY CARE/MEDICINE,,,,False,,Georgia
...,...,...,...,...,...,...,...,...,...,...,...,...
880,168008,73.33,2013-04-13 23:59:08,"Counseling, unspecified",Not specified,TELEPHONE TRIAGE,Yes,,,False,,Texas
881,168008,73.33,2013-04-15 23:49:32,Malignant carcinoid tumors,Encounter for other specified aftercare,PALLIATIVE CARE,No,,,False,,Texas
882,168008,73.33,2013-04-16 06:30:58,Pneumonia due to other specified bacteria,Encounter for other specified aftercare,PALLIATIVE CARE,,,,,,Texas
883,168496,98.72,2022-11-03 09:59:52,Diastolic (congestive) heart failure,Essential (primary) hypertension,TELEPHONE/MEDICINE,,,,,,California


# Rounding off the Age 

In [20]:
# Round each age to the nearest whole number using Python's built-in round().
outpatient_train_data['Age at visit'] = outpatient_train_data['Age at visit'].map(round)

In [21]:
# Number of distinct patients after keeping only the maximum-age rows.
outpatient_train_data['Internalpatientid'].nunique()

524

In [22]:
# Count rows per patient and keep only the patients with more than 25 rows.
df = (
    outpatient_train_data["Internalpatientid"]
    .value_counts()
    .loc[lambda counts: counts > 25]
    .to_frame()
)

In [23]:
# Move the index into a regular column.
df = df.reset_index(drop=False)

In [24]:
# Name the two columns: the patient ID and that patient's row count.
df.columns = ["Internalpatientid","count_outpatients"]

In [25]:
# How many patients exceed the 25-row threshold.
df.shape[0]

0

In [26]:
# Only the patient IDs are needed from here on; discard the counts.
df.drop(columns=["count_outpatients"], inplace=True)

In [27]:
# Display the IDs of the patients that exceed the row-count threshold.
df

Unnamed: 0,Internalpatientid


In [28]:
# Left-join the visit rows with the high-count patient IDs on 'Internalpatientid'.
# NOTE(review): `df` holds only the key column at this point, so the join adds
# no new columns — confirm whether it is still needed.
merged_df = pd.merge(outpatient_train_data, df, on='Internalpatientid', how='left')

# Keep the rows whose patient ID is NOT among the high-count IDs in `df`.
is_high_count = merged_df['Internalpatientid'].isin(df['Internalpatientid'])
filtered_df = merged_df[~is_high_count]

# Show the remaining rows.
filtered_df

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,67,58,2020-10-22 11:49:50,Encounter for immunization,Not specified,GENERAL INTERNAL MEDICINE,,,,,,California
1,200,88,2023-02-13 10:31:19,Vascular dementia,Not specified,HBPC - THERAPIST,,,,,,Utah
2,330,75,2024-03-01 00:19:37,Encounter for immunization,Not specified,PRIMARY CARE/MEDICINE,,,,False,,Oregon
3,351,86,2023-10-18 20:54:27,Other chronic osteomyelitis,Not specified,TELEPHONE/ANCILLARY,,,,False,,Ohio
4,714,68,2015-04-17 12:26:12,Hypertensive chronic kidney disease with stage...,Panic disorder [episodic paroxysmal anxiety],PRIMARY CARE/MEDICINE,,,,False,,Georgia
...,...,...,...,...,...,...,...,...,...,...,...,...
880,168008,73,2013-04-13 23:59:08,"Counseling, unspecified",Not specified,TELEPHONE TRIAGE,Yes,,,False,,Texas
881,168008,73,2013-04-15 23:49:32,Malignant carcinoid tumors,Encounter for other specified aftercare,PALLIATIVE CARE,No,,,False,,Texas
882,168008,73,2013-04-16 06:30:58,Pneumonia due to other specified bacteria,Encounter for other specified aftercare,PALLIATIVE CARE,,,,,,Texas
883,168496,99,2022-11-03 09:59:52,Diastolic (congestive) heart failure,Essential (primary) hypertension,TELEPHONE/MEDICINE,,,,,,California


In [29]:
# Number of distinct patients remaining after excluding high-count patients.
filtered_df['Internalpatientid'].nunique()

524

In [30]:
# Continue with the filtered rows under the working name.
# An explicit copy is taken because the following cells add columns and drop
# columns in place; without it those writes would target the boolean-filtered
# slice `filtered_df` (chained-assignment hazard) and silently mutate it too.
outpatient_train_data = filtered_df.copy()

# Concatenating the 'Age at visit' and 'First listed diagnosis icd10 subcategory' columns, joined by '_'

In [31]:
# Build 'outpatients_First_listed_diagnosis_icd10_subcategory' as
# "<age at visit>_<first listed diagnosis>". The age is cast to str first
# because a numeric column cannot be concatenated with text directly.
outpatient_train_data['outpatients_First_listed_diagnosis_icd10_subcategory'] = (
    outpatient_train_data['Age at visit'].astype(str)
    + '_'
    + outpatient_train_data['First listed diagnosis icd10 subcategory']
)

# Concatenating the 'Age at visit' and 'Second listed diagnosis icd10 subcategory' columns, joined by '_'

In [32]:
# Build 'outpatients_Second_listed_diagnosis_icd10_subcategory' as
# "<age at visit>_<second listed diagnosis>". The age is cast to str first
# because a numeric column cannot be concatenated with text directly.
outpatient_train_data['outpatients_Second_listed_diagnosis_icd10_subcategory'] = (
    outpatient_train_data['Age at visit'].astype(str)
    + '_'
    + outpatient_train_data['Second listed diagnosis icd10 subcategory']
)

# Dropping all unnecessary columns

In [33]:
# Discard, in place, every column other than the patient ID and the two
# concatenated diagnosis columns.
outpatient_train_data.drop(
    columns=[
        'Age at visit',
        'Visit start date',
        'First listed diagnosis icd10 subcategory',
        'Second listed diagnosis icd10 subcategory',
        'Stop code',
        'Agentorangeflag',
        'Combatflag',
        'Ionizingradiationflag',
        'Serviceconnectedflag',
        'Swasiaconditionsflag',
        'State',
    ],
    inplace=True,
)

In [34]:
# NOTE(review): self-assignment — this statement has no effect; the cell can be deleted.
outpatient_train_data = outpatient_train_data

In [35]:
# Display the reduced frame: patient ID plus the two concatenated diagnosis columns.
outpatient_train_data

Unnamed: 0,Internalpatientid,outpatients_First_listed_diagnosis_icd10_subcategory,outpatients_Second_listed_diagnosis_icd10_subcategory
0,67,58_Encounter for immunization,58_Not specified
1,200,88_Vascular dementia,88_Not specified
2,330,75_Encounter for immunization,75_Not specified
3,351,86_Other chronic osteomyelitis,86_Not specified
4,714,68_Hypertensive chronic kidney disease with st...,68_Panic disorder [episodic paroxysmal anxiety]
...,...,...,...
880,168008,"73_Counseling, unspecified",73_Not specified
881,168008,73_Malignant carcinoid tumors,73_Encounter for other specified aftercare
882,168008,73_Pneumonia due to other specified bacteria,73_Encounter for other specified aftercare
883,168496,99_Diastolic (congestive) heart failure,99_Essential (primary) hypertension


In [36]:
# Collapse the rows to one per patient: within each 'Internalpatientid' group the
# non-null values of every remaining column are joined into one comma-separated string.
# dropna() is applied first so missing entries never reach str.join.
# NOTE(review): some diagnosis labels themselves contain commas (e.g.
# "Counseling, unspecified"), so the joined string cannot be split back
# unambiguously on ','.
df_grouped = (
    outpatient_train_data
    .groupby('Internalpatientid')
    .agg(lambda values: ','.join(values.dropna()))
)

# Turn the patient ID back into a regular column and show the result.
df_grouped_outpatients = df_grouped.reset_index()
df_grouped_outpatients

Unnamed: 0,Internalpatientid,outpatients_First_listed_diagnosis_icd10_subcategory,outpatients_Second_listed_diagnosis_icd10_subcategory
0,67,58_Encounter for immunization,58_Not specified
1,200,88_Vascular dementia,88_Not specified
2,330,75_Encounter for immunization,75_Not specified
3,351,86_Other chronic osteomyelitis,86_Not specified
4,714,68_Hypertensive chronic kidney disease with st...,68_Panic disorder [episodic paroxysmal anxiety...
...,...,...,...
519,167842,"75_Other specified counseling,75_Chronic obstr...","75_Not specified,75_Not specified"
520,167917,51_Presence of other cardiac and vascular impl...,51_Not specified
521,168008,"73_Counseling, unspecified,73_Malignant carcin...","73_Not specified,73_Encounter for other specif..."
522,168496,99_Diastolic (congestive) heart failure,99_Essential (primary) hypertension


# Saving the CSV file

In [37]:
# Write the per-patient table to CSV (the DataFrame index is written as well).
# NOTE(review): absolute, user-specific path — consider a configurable output directory.
df_grouped_outpatients.to_csv(
    path_or_buf='/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_quality/df_outpatients_quality.csv'
)

In [102]:
# Inspect the full concatenated first-diagnosis string of the row at position 1.
# NOTE(review): the recorded output does not match row 1 of the table displayed
# earlier ('88_Vascular dementia'); the cell appears to have been run against
# different data — re-run to refresh.
df_grouped_outpatients['outpatients_First_listed_diagnosis_icd10_subcategory'].iloc[1]

'69_Encounter for screening for other specified diseases and disorders,69_Encounter for screening for other specified diseases and disorders'

In [34]:
# Record the kernel's current working directory and show it.
import os

cwd = os.getcwd()
cwd

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/2211575'