In [1]:
# Connect to the Azure ML workspace that hosts the team's datastores.
# `Dataset` is imported here for later use: `Dataset.Tabular` exposes the
# TabularDatasetFactory methods, e.g. Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

# Workspace coordinates (the subscription id can be looked up in the studio).
# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'
resource_group = 'VChamp-Team3'
workspace_name = 'vchamp-team3'

# Build the workspace handle from the three identifiers above.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Look up the datastore registered under 'data_team3_synthetic_test' in the workspace.
# (Alternative key, presumably for the training data: 'data_team3_synthetic_train'.)
datastore = workspace.datastores['data_team3_synthetic_test'] 

In [3]:
# Create a TabularDataset that represents the delimited (CSV) file
# 'outpatient_visits_test.csv' stored in the datastore selected above.
dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, 'outpatient_visits_test.csv')],
)

# To preview only the first 3 rows instead of loading everything:
# dataset.take(3).to_pandas_dataframe()

In [4]:
# Materialize the Azure TabularDataset as a pandas DataFrame so the rest of
# the notebook can work on it with pandas operations.
outpatient_test_data = dataset.to_pandas_dataframe()

In [6]:
# Inspect the loaded frame (the notebook renders a truncated head/tail view).
outpatient_test_data

Unnamed: 0,Column1,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,93,100,52.560034,2014-04-19 16:37:04,Not specified,Not specified,RESPIRATORY THERAPY,,,,True,,New York
1,94,100,52.898575,2014-08-21 10:17:59,Not specified,Not specified,HBPC Nursing (RN / LP),,,,True,,New York
2,95,100,53.184501,2014-12-03 22:26:25,Not specified,Not specified,ADMIN PAT ACTIVTIES (MASNONCT),,,,,,New York
3,96,100,54.002069,2015-09-28 18:09:01,Not specified,Not specified,OCCUPATIONAL THERAPY,,,,True,,New York
4,97,100,54.337489,2016-01-29 08:27:08,Not specified,Not specified,TELEPHONE HBPC,,,,True,,New York
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21906647,105227001,99997,86.874842,2016-01-06 18:07:06,Not specified,Not specified,PRIMARY CARE/MEDICINE,,,,,,North Carolina
21906648,105227002,99997,87.321281,2016-06-17 22:16:50,Not specified,Not specified,TELEPHONE PRIMARY CARE,,,,,,North Carolina
21906649,105227003,99997,88.378729,2017-07-09 10:12:41,"Bladder disorder, unspecified",Not specified,CLINICAL PHARMACY,,,,,,North Carolina
21906650,105227004,99997,88.523673,2017-08-31 09:39:49,"Alzheimer's disease, unspecified",Sleep apnea,MENTAL HEALTH CLINIC - IND,,,,,,North Carolina


In [7]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas warnings such as
# SettingWithCopyWarning and deprecation notices; consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Set the notebook 'max_buffer_size' option to 7294967296 (about 6.8 GiB if
# the unit is bytes — TODO confirm) through the notebook ConfigManager.
# NOTE(review): `cm` receives the return value of `update`, not the manager
# itself; presumably the setting only applies after a server restart — confirm.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Elimination of the unwanted `Column1` column

In [8]:
# Drop the unwanted 'Column1' column from the frame, modifying it in place.
outpatient_test_data.drop(columns=['Column1'], inplace=True)

In [9]:
# Inspect the frame after 'Column1' has been dropped.
outpatient_test_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,100,52.560034,2014-04-19 16:37:04,Not specified,Not specified,RESPIRATORY THERAPY,,,,True,,New York
1,100,52.898575,2014-08-21 10:17:59,Not specified,Not specified,HBPC Nursing (RN / LP),,,,True,,New York
2,100,53.184501,2014-12-03 22:26:25,Not specified,Not specified,ADMIN PAT ACTIVTIES (MASNONCT),,,,,,New York
3,100,54.002069,2015-09-28 18:09:01,Not specified,Not specified,OCCUPATIONAL THERAPY,,,,True,,New York
4,100,54.337489,2016-01-29 08:27:08,Not specified,Not specified,TELEPHONE HBPC,,,,True,,New York
...,...,...,...,...,...,...,...,...,...,...,...,...
21906647,99997,86.874842,2016-01-06 18:07:06,Not specified,Not specified,PRIMARY CARE/MEDICINE,,,,,,North Carolina
21906648,99997,87.321281,2016-06-17 22:16:50,Not specified,Not specified,TELEPHONE PRIMARY CARE,,,,,,North Carolina
21906649,99997,88.378729,2017-07-09 10:12:41,"Bladder disorder, unspecified",Not specified,CLINICAL PHARMACY,,,,,,North Carolina
21906650,99997,88.523673,2017-08-31 09:39:49,"Alzheimer's disease, unspecified",Sleep apnea,MENTAL HEALTH CLINIC - IND,,,,,,North Carolina


# Sorting by patient id and age at visit

In [10]:
# Order the visits by patient id and then by age at visit, both ascending.
# The frame is reordered in place; the original index labels are kept.
outpatient_test_data.sort_values(
    by=["Internalpatientid", "Age at visit"],
    ascending=True,
    inplace=True,
)

In [11]:
# Inspect the frame after sorting by patient id and age at visit.
outpatient_test_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
19529781,6,74.147397,2000-10-29 20:58:06,Not specified,Not specified,Not specified (no value),,,,,,Texas
12754738,6,74.228196,2000-11-28 09:43:53,Not specified,Not specified,Not specified (no value),,,,,,Texas
19473351,6,74.767093,2001-06-13 08:56:37,Not specified,Not specified,ADMITTING/SCREENING,,,,,,Texas
19510971,6,74.767095,2001-06-13 08:57:38,Not specified,Not specified,ADMITTING/SCREENING,,,,,,Texas
19527214,6,74.842250,2001-07-10 20:12:49,Not specified,Not specified,LABORATORY,,,,,,Texas
...,...,...,...,...,...,...,...,...,...,...,...,...
7941106,169065,53.320187,2011-06-11 12:41:03,Not specified,Not specified,WARD STOP CODE,,,,,,Arizona
7947015,169065,53.320380,2011-06-11 14:22:18,Not specified,Not specified,UROLOGY CLINIC,,,,False,,Arizona
7935580,169065,53.320650,2011-06-11 16:44:30,Not specified,Not specified,CHAPLAIN SERVICE - INDIVIDUAL,,,,True,,Arizona
15972480,169065,53.323439,2011-06-12 17:12:37,Not specified,Not specified,CHAPLAIN SERVICE - INDIVIDUAL,,,,,,Arizona


# Checking for missing values

In [12]:
# Print, for every column, how many entries are missing (NaN/None).
print("set missing values:\n", outpatient_test_data.isnull().sum())

set missing values:
 Internalpatientid                                   0
Age at visit                                        0
Visit start date                                    0
First listed diagnosis icd10 subcategory            0
Second listed diagnosis icd10 subcategory           0
Stop code                                           0
Agentorangeflag                              21100085
Combatflag                                   21875423
Ionizingradiationflag                        21878519
Serviceconnectedflag                         16911701
Swasiaconditionsflag                         21836167
State                                               0
dtype: int64


In [13]:
# Remove visits that carry no diagnosis information, i.e. rows where BOTH the
# first and the second listed diagnosis equal the placeholder 'Not specified'.
#
# A boolean mask is used instead of `drop(<index labels>)`: the mask selects
# by row position, so it stays correct even if index labels were ever
# duplicated, and it avoids a label lookup across the whole frame.
no_diagnosis_mask = (
    (outpatient_test_data['First listed diagnosis icd10 subcategory'] == 'Not specified')
    & (outpatient_test_data['Second listed diagnosis icd10 subcategory'] == 'Not specified')
)

# `.copy()` yields an independent frame, so the column assignments made in
# later cells do not target a slice of the unfiltered data.
outpatient_test_data = outpatient_test_data.loc[~no_diagnosis_mask].copy()

In [14]:
# Number of distinct patients that still have at least one row after the filter.
outpatient_test_data.Internalpatientid.nunique()

18117

# Keeping two decimal places in the age column

In [15]:
# Render every 'Age at visit' value as text with exactly two decimals
# (e.g. "67.69"). The column holds strings after this cell.
outpatient_test_data["Age at visit"] = outpatient_test_data["Age at visit"].map(
    lambda age: f"{age:.2f}"
)

In [16]:
# Inspect the frame after the age values were formatted to two decimals.
outpatient_test_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
20042738,7,67.69,2014-03-16 01:53:21,Post-traumatic stress disorder (PTSD),Not specified,PCT-POST TRAUMATIC STRESS-GRP,,,,True,,Nebraska
13850759,7,67.71,2014-03-23 08:46:01,Other long term (current) drug therapy,Other and unspecified intestinal obstruction,CLINICAL PHARMACY,,,,True,,Nebraska
20101814,7,67.71,2014-03-23 23:33:03,Encounter for other specified aftercare,Other long term (current) drug therapy,CLINICAL PHARMACY,Yes,,,False,,Nebraska
13842251,7,67.71,2014-03-24 01:44:26,Other long term (current) drug therapy,Long term (current) use of anticoagulants and ...,CLINICAL PHARMACY,,,,,,Nebraska
20042740,7,67.71,2014-03-25 00:19:11,Other long term (current) drug therapy,Not specified,CLINICAL PHARMACY,Yes,,,False,,Nebraska
...,...,...,...,...,...,...,...,...,...,...,...,...
15969115,169045,97.00,2021-03-16 15:28:27,Other specified cardiac arrhythmias,Hypertensive heart disease with heart failure,PRIMARY CARE/MEDICINE,,,,False,,Minnesota
7944375,169045,97.27,2021-06-23 22:06:41,Encounter for examination of ears and hearing,"Sensorineural hearing loss, bilateral",AUDIOLOGY,,,,True,,Minnesota
15663762,169045,97.29,2021-06-30 00:50:22,Encounter for immunization,Not specified,GENERAL INTERNAL MEDICINE,,,,False,,Minnesota
7922508,169045,97.36,2021-07-25 21:32:52,Encounter for immunization,Not specified,GENERAL INTERNAL MEDICINE,,,,False,,Minnesota


In [17]:
# Turn the two-decimal text values of 'Age at visit' back into floats.
outpatient_test_data["Age at visit"] = outpatient_test_data["Age at visit"].astype("float64")

# Max age

In [18]:
# Highest 'Age at visit' per patient, as a two-column frame
# (Internalpatientid, Age at visit).
max_ages = (
    outpatient_test_data
    .groupby('Internalpatientid', as_index=False)['Age at visit']
    .max()
)

# Inner-join on (patient id, age) so that only the rows recorded at each
# patient's highest age remain. Several rows per patient survive when they
# share the same two-decimal maximum age.
outpatient_test_data = outpatient_test_data.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at visit'],
)

outpatient_test_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,7,74.37,2020-11-22 12:50:02,Dental caries on smooth surface,Not specified,DENTAL,No,,,False,,Nebraska
1,25,70.12,2023-12-22 18:01:07,Encounter for other specified aftercare,Unspecified atrial fibrillation and atrial flu...,TELEPHONE/ANCILLARY,,,,False,,Pennsylvania
2,31,69.05,2016-10-17 14:46:09,"Counseling, unspecified",Not specified,TELEPHONE PRIMARY CARE,,,,False,,Mississippi
3,41,63.94,2017-06-30 04:07:34,Malignant neoplasm of lower third of esophagus,Secondary malignant neoplasm of liver and intr...,ONCOLOGY/TUMOR,,,,,,Missouri
4,41,63.94,2017-06-30 05:36:26,"Counseling, unspecified",Not specified,SOCIAL WORK SERVICE,,,,,,Missouri
...,...,...,...,...,...,...,...,...,...,...,...,...
29729,169037,87.74,2017-10-24 17:05:33,Chronic obstructive pulmonary disease with (ac...,Not specified,TELEPHONE HBPC,,,,,,Illinois
29730,169037,87.74,2017-10-24 20:00:40,Other specified problems related to psychosoci...,Encounter for palliative care,TELEPHONE/GERIATRICS,,,,,,Illinois
29731,169037,87.74,2017-10-25 16:09:22,Dementia in other diseases classified elsewhere,Encounter for palliative care,HOSPICE CARE,,,,,,Illinois
29732,169037,87.74,2017-10-25 16:33:01,Encounter for palliative care,Dementia in other diseases classified elsewhere,TELEPHONE/GERIATRICS,,,,,,Illinois


# Rounding off the Age 

In [19]:
# Round every age to a whole number with Python's built-in round()
# (ties are rounded to the nearest even integer).
outpatient_test_data['Age at visit'] = outpatient_test_data['Age at visit'].map(round)

In [20]:
# Distinct patients after keeping only the rows at each patient's highest age.
outpatient_test_data.Internalpatientid.nunique()

18117

In [21]:
# Count the remaining rows per patient and keep, as a one-column frame,
# only the patients that have more than 25 rows.
df = (
    outpatient_test_data["Internalpatientid"]
    .value_counts()
    .pipe(lambda counts: counts[counts > 25])
    .to_frame()
)

In [22]:
# Move the index of the counts frame (the patient ids) into a regular column.
# NOTE(review): the names of the two resulting columns depend on the pandas
# version; the following cell overwrites both names, so only their order matters.
df = df.reset_index()

In [23]:
# Name the two columns: the patient id first, then that patient's row count.
df.columns = ["Internalpatientid","count_outpatients"]

In [24]:
# Number of patients above the 25-row threshold.
len(df)

16

In [25]:
# Only the patient ids are needed from here on, so discard the count column in place.
df.drop(columns=["count_outpatients"], inplace=True)

In [26]:
# Show the ids of the patients that exceeded the row-count threshold.
df

Unnamed: 0,Internalpatientid
0,101451
1,127264
2,113669
3,95558
4,85653
5,160044
6,140996
7,103630
8,27822
9,9741


In [27]:
# Exclude every row that belongs to a patient listed in `df` (the patients
# that exceeded the row-count threshold).
#
# At this point `df` has been reduced to the 'Internalpatientid' column, so
# left-joining it onto the visits adds no columns and no rows. The membership
# test below performs the exclusion on its own. `merged_df` stays bound, as an
# alias of the unfiltered frame, only for any later cell that still reads it.
merged_df = outpatient_test_data

# True for rows whose patient is NOT in the exclusion list.
keep_rows = ~outpatient_test_data['Internalpatientid'].isin(df['Internalpatientid'])

# `.copy()` makes the result an independent frame, so the columns added to it
# in later cells are not written onto a slice of the unfiltered data.
filtered_df = outpatient_test_data.loc[keep_rows].copy()

# Show the remaining rows.
filtered_df

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,7,74,2020-11-22 12:50:02,Dental caries on smooth surface,Not specified,DENTAL,No,,,False,,Nebraska
1,25,70,2023-12-22 18:01:07,Encounter for other specified aftercare,Unspecified atrial fibrillation and atrial flu...,TELEPHONE/ANCILLARY,,,,False,,Pennsylvania
2,31,69,2016-10-17 14:46:09,"Counseling, unspecified",Not specified,TELEPHONE PRIMARY CARE,,,,False,,Mississippi
3,41,64,2017-06-30 04:07:34,Malignant neoplasm of lower third of esophagus,Secondary malignant neoplasm of liver and intr...,ONCOLOGY/TUMOR,,,,,,Missouri
4,41,64,2017-06-30 05:36:26,"Counseling, unspecified",Not specified,SOCIAL WORK SERVICE,,,,,,Missouri
...,...,...,...,...,...,...,...,...,...,...,...,...
29729,169037,88,2017-10-24 17:05:33,Chronic obstructive pulmonary disease with (ac...,Not specified,TELEPHONE HBPC,,,,,,Illinois
29730,169037,88,2017-10-24 20:00:40,Other specified problems related to psychosoci...,Encounter for palliative care,TELEPHONE/GERIATRICS,,,,,,Illinois
29731,169037,88,2017-10-25 16:09:22,Dementia in other diseases classified elsewhere,Encounter for palliative care,HOSPICE CARE,,,,,,Illinois
29732,169037,88,2017-10-25 16:33:01,Encounter for palliative care,Dementia in other diseases classified elsewhere,TELEPHONE/GERIATRICS,,,,,,Illinois


In [28]:
# Distinct patients left after the high-count patients were excluded.
filtered_df.Internalpatientid.nunique()

18101

In [29]:
outpatient_test_data = filtered_df  # continue the pipeline with the filtered frame

# Concatenating 'Age at visit' and 'First listed diagnosis icd10 subcategory' with '_'

In [30]:
# Build 'outpatients_First_listed_diagnosis_icd10_subcategory' as
# '<age>_<first listed diagnosis>'. The age is cast to str first because a
# number cannot be concatenated with text.
outpatient_test_data['outpatients_First_listed_diagnosis_icd10_subcategory'] = (
    outpatient_test_data['Age at visit']
    .astype(str)
    .add('_')
    .add(outpatient_test_data['First listed diagnosis icd10 subcategory'])
)

# Concatenating 'Age at visit' and 'Second listed diagnosis icd10 subcategory' with '_'

In [31]:
# Build 'outpatients_Second_listed_diagnosis_icd10_subcategory' as
# '<age>_<second listed diagnosis>'. The age is cast to str first because a
# number cannot be concatenated with text.
outpatient_test_data['outpatients_Second_listed_diagnosis_icd10_subcategory'] = (
    outpatient_test_data['Age at visit']
    .astype(str)
    .add('_')
    .add(outpatient_test_data['Second listed diagnosis icd10 subcategory'])
)

# Dropping all unnecessary columns

In [32]:
# Keep only the patient id and the two derived '<age>_<diagnosis>' columns:
# every source column listed below is dropped in place.
outpatient_test_data.drop(
    columns=[
        'Age at visit',
        'Visit start date',
        'First listed diagnosis icd10 subcategory',
        'Second listed diagnosis icd10 subcategory',
        'Stop code',
        'Agentorangeflag',
        'Combatflag',
        'Ionizingradiationflag',
        'Serviceconnectedflag',
        'Swasiaconditionsflag',
        'State',
    ],
    inplace=True,
)

In [33]:
# NOTE(review): self-assignment — this statement has no effect and the cell can be removed.
outpatient_test_data = outpatient_test_data

In [34]:
# Inspect the frame, now reduced to the patient id and the two derived columns.
outpatient_test_data

Unnamed: 0,Internalpatientid,outpatients_First_listed_diagnosis_icd10_subcategory,outpatients_Second_listed_diagnosis_icd10_subcategory
0,7,74_Dental caries on smooth surface,74_Not specified
1,25,70_Encounter for other specified aftercare,70_Unspecified atrial fibrillation and atrial ...
2,31,"69_Counseling, unspecified",69_Not specified
3,41,64_Malignant neoplasm of lower third of esophagus,64_Secondary malignant neoplasm of liver and i...
4,41,"64_Counseling, unspecified",64_Not specified
...,...,...,...
29729,169037,88_Chronic obstructive pulmonary disease with ...,88_Not specified
29730,169037,88_Other specified problems related to psychos...,88_Encounter for palliative care
29731,169037,88_Dementia in other diseases classified elsew...,88_Encounter for palliative care
29732,169037,88_Encounter for palliative care,88_Dementia in other diseases classified elsew...


In [35]:
def _join_non_null(values):
    """Comma-join the non-null entries of one patient's column values."""
    # Missing values are dropped first so that only real strings reach join().
    return ','.join(values.dropna())


# Collapse the frame to one row per patient: each derived column becomes a
# single comma-separated string of that patient's values.
# NOTE(review): some diagnosis labels themselves contain commas
# (e.g. "Counseling, unspecified"), so the joined string cannot be split back
# unambiguously on ','.
df_grouped = outpatient_test_data.groupby('Internalpatientid').agg(_join_non_null)

# Turn the patient id from the group index back into a regular column.
df_grouped_outpatients = df_grouped.reset_index()
df_grouped_outpatients

Unnamed: 0,Internalpatientid,outpatients_First_listed_diagnosis_icd10_subcategory,outpatients_Second_listed_diagnosis_icd10_subcategory
0,7,74_Dental caries on smooth surface,74_Not specified
1,25,70_Encounter for other specified aftercare,70_Unspecified atrial fibrillation and atrial ...
2,31,"69_Counseling, unspecified",69_Not specified
3,41,64_Malignant neoplasm of lower third of esopha...,64_Secondary malignant neoplasm of liver and i...
4,42,68_Encounter for screening for cardiovascular ...,"68_Not specified,68_Not specified,68_Encounter..."
...,...,...,...
18096,169000,80_Person consulting for explanation of examin...,80_Not specified
18097,169011,"75_Heart failure, unspecified",75_Atherosclerotic heart disease of native cor...
18098,169020,64_COVID-19,64_Not specified
18099,169037,88_Other forms of chronic ischemic heart disea...,"88_Chronic obstructive pulmonary disease, unsp..."


# Saving the CSV file

In [36]:
# Persist the per-patient aggregate as a CSV file.
# NOTE(review): the destination is an absolute, user-specific path on one
# compute instance, so this cell is not portable as written.
# NOTE(review): `index` is left at its default, so the row index is written as
# an extra unnamed first column — confirm that downstream readers expect it.
df_grouped_outpatients.to_csv('/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_test/df_outpatients_test.csv')

In [102]:
# Look at the joined first-diagnosis string of the second row (position 1).
# NOTE(review): this cell's recorded execution count (102) is out of sequence,
# so its saved output may come from a different kernel state.
df_grouped_outpatients['outpatients_First_listed_diagnosis_icd10_subcategory'].iloc[1]

'69_Encounter for screening for other specified diseases and disorders,69_Encounter for screening for other specified diseases and disorders'

In [34]:
# Capture the kernel's current working directory and display it.
import os
cwd = os.getcwd()
cwd

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/2211575'