In [1]:
# `Dataset.Tabular` is a class attribute that exposes the TabularDatasetFactory methods
# for creating new TabularDataset objects, e.g. Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

# Coordinates of the Azure ML workspace used by this notebook.
# NOTE(review): these identifiers are hard-coded; consider reading them from a config file or the environment.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'  # shown in the Azure ML studio launch page
resource_group = 'VChamp-Team3'                           # resource group name
workspace_name = 'vchamp-team3'                           # workspace name

# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
# Build the Workspace handle that the following cells use to reach the datastore.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Suppress every warning raised while the notebook runs.
# NOTE(review): this blanket filter also hides pandas warnings; narrow it if warnings become useful.
import warnings
warnings.filterwarnings(action='ignore')

# Store a larger 'max_buffer_size' in the 'notebook' config section.
# `cm` receives whatever ConfigManager.update() returns, not the manager itself.
# NOTE(review): intended to raise the memory limit - confirm this setting is actually picked up.
from notebook.services.config import ConfigManager
max_buffer_size_bytes = 7294967296
cm = ConfigManager().update('notebook', {'max_buffer_size': max_buffer_size_bytes})

In [3]:
# Look up the registered datastore that holds the synthetic training files.
datastore_name = 'data_team3_synthetic_train'
datastore = workspace.datastores[datastore_name]

In [4]:
# Dataset.Tabular.from_delimited_files creates a TabularDataset that represents
# tabular data stored in delimited files (e.g. CSV and TSV).
outpatient_csv_path = [(datastore, 'outpatient_visits_train.csv')]
dataset = Dataset.Tabular.from_delimited_files(path=outpatient_csv_path)

# The dataset is converted to a pandas DataFrame with dataset.to_pandas_dataframe() in the next step.

In [62]:
# Convert the Azure ML dataset into a pandas DataFrame, the format the rest of the notebook works with.
outpatient_train_data = dataset.to_pandas_dataframe()

In [64]:
# Number of distinct patients in the raw data.
outpatient_train_data['Internalpatientid'].nunique()

133252

# Elimination of the unnamed columns

In [65]:
# Remove the unwanted 'Column1' column (presumably an unnamed index column from the source file).
# errors='ignore' keeps the cell re-runnable: a second run no longer raises KeyError
# once the column has already been removed.
outpatient_train_data.drop(columns='Column1', inplace=True, errors='ignore')

In [66]:
# Display the frame after removing 'Column1' (the rendered output is truncated for large frames).
outpatient_train_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,1,60.619995,2004-11-17 14:35:36,Not specified,Not specified,TELEPHONE/MEDICINE,,,,False,,Indiana
1,1,61.032110,2005-04-17 05:40:07,Not specified,Not specified,TELEPHONE TRIAGE,,,,True,,Indiana
2,1,61.907075,2006-03-03 00:51:52,Not specified,Not specified,LABORATORY,,,,,,Indiana
3,1,62.296573,2006-07-23 09:32:16,Not specified,Not specified,NO-COUNT,,,,,,Indiana
4,1,63.632066,2007-11-23 12:28:48,Not specified,Not specified,GENERAL INTERNAL MEDICINE,,,,False,,Indiana
...,...,...,...,...,...,...,...,...,...,...,...,...
82692715,99999,96.300251,2013-03-19 17:47:55,Not specified,Not specified,TELEPHONE/ANCILLARY,,,,False,,Arizona
82692716,99999,96.322523,2013-03-27 21:10:33,Not specified,Not specified,NEUROLOGY,,,,False,,Arizona
82692717,99999,96.327503,2013-03-29 16:51:35,Not specified,Not specified,RENAL/NEPHROL(EXCEPT DIALYSIS),,,,True,,Arizona
82692718,99999,96.360092,2013-04-10 14:43:53,Not specified,Not specified,SPEECH-LANGUAGE PATHOLOGY,,,,,,Arizona


# Sorting by patient ID and age at visit

In [67]:
# Order the visits by patient and, within each patient, by age at visit (ascending), in place.
sort_keys = ["Internalpatientid", "Age at visit"]
outpatient_train_data.sort_values(by=sort_keys, inplace=True)

In [68]:
# Display the sorted frame; the original row labels are kept, so the index is no longer in order.
outpatient_train_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
278047,1,57.573116,2001-10-30 23:22:22,Not specified,Not specified,PRIMARY CARE/MEDICINE,,,,True,,Indiana
791,1,57.840497,2002-02-05 16:50:30,Not specified,Not specified,LABORATORY,,,,,,Indiana
39923,1,57.912086,2002-03-03 20:49:01,Not specified,Not specified,LABORATORY,,,,,,Indiana
22545,1,57.912177,2002-03-03 21:37:12,Not specified,Not specified,PRIMARY CARE/MEDICINE,,,,True,,Indiana
792,1,57.975498,2002-03-27 01:04:21,Not specified,Not specified,DERMATOLOGY,,,,False,,Indiana
...,...,...,...,...,...,...,...,...,...,...,...,...
29931363,169064,87.927745,2014-11-22 14:10:56,Not specified,Not specified,ADMIN PAT ACTIVTIES (MASNONCT),,,,,,Wyoming
29876085,169064,87.927991,2014-11-22 16:20:05,Not specified,Not specified,ADMIN PAT ACTIVTIES (MASNONCT),,,,,,Wyoming
59674056,169064,87.928316,2014-11-22 19:11:13,Not specified,Not specified,ADMIN PAT ACTIVTIES (MASNONCT),,,,,,Wyoming
59427681,169064,87.933827,2014-11-24 19:31:49,Not specified,Not specified,ADMIN PAT ACTIVTIES (MASNONCT),,,,,,Wyoming


# Checking the missing values

In [69]:
# Count the missing (NaN) entries in every column and print the summary.
missing_per_column = outpatient_train_data.isna().sum()
print("set missing values:\n", missing_per_column)

set missing values:
 Internalpatientid                                   0
Age at visit                                        0
Visit start date                                    0
First listed diagnosis icd10 subcategory            0
Second listed diagnosis icd10 subcategory           0
Stop code                                           0
Agentorangeflag                              79641724
Combatflag                                   82560329
Ionizingradiationflag                        82593660
Serviceconnectedflag                         63719525
Swasiaconditionsflag                         82416510
State                                               0
dtype: int64


* There are no missing values in the potential attributes (PA), but the PA contain many "Not specified" values.

In [70]:
# Frequency of each first-listed diagnosis, keeping only labels that occur more than 200 times.
first_diagnosis_counts = outpatient_train_data["First listed diagnosis icd10 subcategory"].value_counts()
first_list = first_diagnosis_counts[first_diagnosis_counts > 200]
first_list

Not specified                                                                  70603206
Other specified counseling                                                       720331
Encounter for other specified aftercare                                          539853
Counseling, unspecified                                                          345595
Essential (primary) hypertension                                                 313963
                                                                                 ...   
Pemphigus, unspecified                                                              201
Diverticulitis of intestine, part unspecified, with perforation and abscess         201
Mechanical complication of urinary catheter                                         201
Diverticular disease of small intestine without perforation or abscess              201
Non-follicular (diffuse) lymphoma, unspecified                                      201
Name: First listed diagnosis icd

In [13]:
# Frequency of each second-listed diagnosis, keeping only labels that occur more than 200 times.
second_diagnosis_counts = outpatient_train_data["Second listed diagnosis icd10 subcategory"].value_counts()
sec_list = second_diagnosis_counts[second_diagnosis_counts > 200]
sec_list

Not specified                                                                          78185197
Essential (primary) hypertension                                                         207434
Unspecified atrial fibrillation and atrial flutter                                       192069
Long term (current) use of anticoagulants and antithrombotics/antiplatelets              142482
Type 2 diabetes mellitus without complications                                           105136
                                                                                         ...   
Pressure ulcer of contiguous site of back, buttock and hip                                  201
Other and unspecified adhesions and disruptions of iris and ciliary body                    201
Abnormal findings on diagnostic imaging of skull and head, not elsewhere classified         201
Military activity                                                                           201
Other symptoms and signs concerning food

In [71]:
# Remove the rows where BOTH diagnosis columns contain "Not specified".
# A boolean mask is used instead of drop(<index labels>): it avoids building and looking up
# a huge list of labels, and it cannot remove extra rows if index labels are ever duplicated.
both_not_specified = (
    (outpatient_train_data['First listed diagnosis icd10 subcategory'] == 'Not specified')
    & (outpatient_train_data['Second listed diagnosis icd10 subcategory'] == 'Not specified')
)

# .copy() detaches the result so later cells can assign columns on it safely.
outpatient_train_data = outpatient_train_data[~both_not_specified].copy()

In [72]:
# Number of distinct patients left after removing the fully "Not specified" rows.
outpatient_train_data['Internalpatientid'].nunique()

68716

* **After removing the rows where both diagnoses are "Not specified", 12120949 rows remain.**

# Keeping two decimal places in the age column

In [73]:
# Render every 'Age at visit' value as text with exactly two decimals (format 00.00).
# The column holds strings after this step.
outpatient_train_data["Age at visit"] = outpatient_train_data["Age at visit"].map(
    lambda age: "{:.2f}".format(age)
)

In [74]:
# Display the frame to check the two-decimal age formatting.
outpatient_train_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
37443,1,73.51,2017-10-11 06:59:59,Encounter for general adult medical examination,Not specified,LABORATORY,,,,True,,Indiana
11723,1,73.52,2017-10-16 07:57:07,Personal history of malignant neoplasm of urin...,Not specified,CYSTO ROOM IN UROLOGY CL,,,,False,,Indiana
49,1,73.54,2017-10-22 09:33:20,Long term (current) use of anticoagulants and ...,Unspecified atrial fibrillation and atrial flu...,CLINICAL PHARMACY,,,,False,,Indiana
37444,1,73.56,2017-10-30 01:40:07,Sleep apnea,Not specified,SLEEP MEDICINE,,,,False,,Indiana
1672,1,73.56,2017-10-30 02:00:25,Encounter for immunization,Not specified,GENERAL INTERNAL MEDICINE,,,,False,,Indiana
...,...,...,...,...,...,...,...,...,...,...,...,...
29876028,169061,79.98,2022-02-16 08:54:51,Type 2 diabetes mellitus with other specified ...,Not specified,TELEPHONE PRIMARY CARE,False,,,False,,Georgia
59256791,169061,80.03,2022-03-06 16:44:20,Other specified diabetes mellitus without comp...,Not specified,TELEPHONE PRIMARY CARE,False,,,False,,Georgia
29956397,169061,80.59,2022-09-28 02:18:06,Other specified counseling,Not specified,TELEPHONE PRIMARY CARE,False,,,False,,Georgia
59087808,169061,81.14,2023-04-15 23:35:37,Systolic (congestive) heart failure,Encounter for issue of repeat prescription,TELEPHONE PRIMARY CARE,False,,,False,,Georgia


In [75]:
# The formatting step above stores the ages as strings (object dtype);
# cast 'Age at visit' back to float so the values can be compared numerically.
outpatient_train_data["Age at visit"] = outpatient_train_data["Age at visit"].astype(float)

# Max age

In [76]:
# Highest recorded age per patient, as a two-column frame (patient ID, max age).
max_ages = (
    outpatient_train_data
    .groupby('Internalpatientid')['Age at visit']
    .max()
    .reset_index()
)

# The inner join on (patient, age) keeps only the rows at each patient's highest age.
# Several rows per patient survive when they share that same age value.
outpatient_train_data = outpatient_train_data.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at visit'],
)

outpatient_train_data

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,1,80.21,2024-06-25 02:44:48,Other specified inflammatory liver diseases,Type 2 diabetes mellitus without complications,HEPATOLOGY CLINIC,,,,False,,Indiana
1,1,80.21,2024-06-26 03:20:15,Unspecified atrial fibrillation and atrial flu...,Not specified,TELEPHONE/ANCILLARY,,,,False,,Indiana
2,2,69.48,2024-03-17 12:47:16,Encounter for screening for other specified di...,Not specified,TELEPHONE MH,,,,False,,Texas
3,2,69.48,2024-03-18 07:01:21,Encounter for screening for other specified di...,Not specified,MENTAL HEALTH CLINIC - IND,,,,False,,Texas
4,4,84.19,2015-02-22 21:34:14,Unspecified abdominal pain,Not specified,SOCIAL WORK SERVICE,,,,,,Puerto Rico
...,...,...,...,...,...,...,...,...,...,...,...,...
112849,169057,86.14,2025-06-17 22:36:34,Ischemic cardiomyopathy,Venous insufficiency (chronic) (peripheral),CARDIOLOGY,,,,True,,California
112850,169057,86.14,2025-06-18 23:18:37,Type 2 diabetes mellitus with unspecified comp...,Venous insufficiency (chronic) (peripheral),PODIATRY,,,,True,,California
112851,169057,86.14,2025-06-19 22:02:08,Presence of cardiac pacemaker,Not specified,KINESIOTHERAPY,,,,False,,California
112852,169060,71.78,2017-09-25 05:57:56,Encounter for other specified aftercare,Not specified,TELEPHONE PRIMARY CARE,,,,,,California


# Rounding off the Age 

In [77]:
# Round each age to the nearest whole year (ties go to the even number) and store it as an integer.
outpatient_train_data['Age at visit'] = outpatient_train_data['Age at visit'].round().astype('int64')

In [88]:
# Number of distinct patients at this stage.
outpatient_train_data['Internalpatientid'].nunique()

68716

In [79]:
# Count the rows per patient and keep only the patients that have more than 25 rows.
rows_per_patient = outpatient_train_data["Internalpatientid"].value_counts()
df = rows_per_patient[rows_per_patient > 25].to_frame()

In [81]:
# Turn the index into a regular column.
# NOTE(review): not idempotent - re-running this cell adds another index column.
df = df.reset_index()

In [83]:
# Rename the two columns of df to explicit names.
df.columns = ["Internalpatientid","count_outpatients"]

In [85]:
# Number of rows in df.
df.shape[0]

33

In [86]:
# Remove the 'count_outpatients' column from df, in place.
df.drop(columns=["count_outpatients"], inplace=True)

In [87]:
# Display df.
df

Unnamed: 0,Internalpatientid
0,75325
1,163132
2,59499
3,123727
4,151859
5,46626
6,161451
7,135209
8,93348
9,14924


In [89]:
# Exclude every patient whose ID is listed in `df`.
# The former left merge with `df` was redundant (the exclusion is done by the mask below),
# and the old comment described the opposite of what `~isin` does.
is_excluded_patient = outpatient_train_data['Internalpatientid'].isin(df['Internalpatientid'])

# .copy() gives an independent frame, so later cells that add or drop columns
# do not operate on a slice of the source frame.
filtered_df = outpatient_train_data[~is_excluded_patient].copy()

# Show the remaining rows
filtered_df

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,State
0,1,80,2024-06-25 02:44:48,Other specified inflammatory liver diseases,Type 2 diabetes mellitus without complications,HEPATOLOGY CLINIC,,,,False,,Indiana
1,1,80,2024-06-26 03:20:15,Unspecified atrial fibrillation and atrial flu...,Not specified,TELEPHONE/ANCILLARY,,,,False,,Indiana
2,2,69,2024-03-17 12:47:16,Encounter for screening for other specified di...,Not specified,TELEPHONE MH,,,,False,,Texas
3,2,69,2024-03-18 07:01:21,Encounter for screening for other specified di...,Not specified,MENTAL HEALTH CLINIC - IND,,,,False,,Texas
4,4,84,2015-02-22 21:34:14,Unspecified abdominal pain,Not specified,SOCIAL WORK SERVICE,,,,,,Puerto Rico
...,...,...,...,...,...,...,...,...,...,...,...,...
112849,169057,86,2025-06-17 22:36:34,Ischemic cardiomyopathy,Venous insufficiency (chronic) (peripheral),CARDIOLOGY,,,,True,,California
112850,169057,86,2025-06-18 23:18:37,Type 2 diabetes mellitus with unspecified comp...,Venous insufficiency (chronic) (peripheral),PODIATRY,,,,True,,California
112851,169057,86,2025-06-19 22:02:08,Presence of cardiac pacemaker,Not specified,KINESIOTHERAPY,,,,False,,California
112852,169060,72,2017-09-25 05:57:56,Encounter for other specified aftercare,Not specified,TELEPHONE PRIMARY CARE,,,,,,California


In [90]:
# Number of distinct patients left after the exclusion.
filtered_df['Internalpatientid'].nunique()

68683

In [91]:
outpatient_train_data = filtered_df  # Rebind the working name to the filtered frame (same object, no copy)

# Adding  'Age at visit' & 'First listed diagnosis icd10 subcategory' columns with '_'

In [93]:
# Create 'outpatients_First_listed_diagnosis_icd10_subcategory' as "<age>_<first listed diagnosis>".
# The age is converted to str first because values of different dtypes cannot be concatenated.
age_as_text = outpatient_train_data['Age at visit'].astype(str)
first_diagnosis = outpatient_train_data['First listed diagnosis icd10 subcategory']
outpatient_train_data['outpatients_First_listed_diagnosis_icd10_subcategory'] = age_as_text + '_' + first_diagnosis

# Adding  'Age at visit' & 'Second listed diagnosis icd10 subcategory' columns with '_'

In [94]:
# Create 'outpatients_Second_listed_diagnosis_icd10_subcategory' as "<age>_<second listed diagnosis>".
# The age is converted to str first because values of different dtypes cannot be concatenated.
age_as_text = outpatient_train_data['Age at visit'].astype(str)
second_diagnosis = outpatient_train_data['Second listed diagnosis icd10 subcategory']
outpatient_train_data['outpatients_Second_listed_diagnosis_icd10_subcategory'] = age_as_text + '_' + second_diagnosis

# Dropping all unnecessary columns

In [96]:
# Remove the listed source columns in place; they are not needed in the final output.
columns_to_drop = [
    'Age at visit',
    'Visit start date',
    'First listed diagnosis icd10 subcategory',
    'Second listed diagnosis icd10 subcategory',
    'Stop code',
    'Agentorangeflag',
    'Combatflag',
    'Ionizingradiationflag',
    'Serviceconnectedflag',
    'Swasiaconditionsflag',
    'State',
]
outpatient_train_data.drop(columns=columns_to_drop, inplace=True)

In [97]:
# NOTE(review): this self-assignment is a no-op; the cell can be deleted.
outpatient_train_data = outpatient_train_data

In [98]:
# Display the frame after the column removal.
outpatient_train_data

Unnamed: 0,Internalpatientid,outpatients_First_listed_diagnosis_icd10_subcategory,outpatients_Second_listed_diagnosis_icd10_subcategory
0,1,80_Other specified inflammatory liver diseases,80_Type 2 diabetes mellitus without complications
1,1,80_Unspecified atrial fibrillation and atrial ...,80_Not specified
2,2,69_Encounter for screening for other specified...,69_Not specified
3,2,69_Encounter for screening for other specified...,69_Not specified
4,4,84_Unspecified abdominal pain,84_Not specified
...,...,...,...
112849,169057,86_Ischemic cardiomyopathy,86_Venous insufficiency (chronic) (peripheral)
112850,169057,86_Type 2 diabetes mellitus with unspecified c...,86_Venous insufficiency (chronic) (peripheral)
112851,169057,86_Presence of cardiac pacemaker,86_Not specified
112852,169060,72_Encounter for other specified aftercare,72_Not specified


In [99]:
def _join_non_null(values):
    """Concatenate the non-null entries of a Series into one comma-separated string."""
    # dropna() removes missing values first, so only non-null entries reach the join.
    return ','.join(values.dropna())


# One row per patient: every remaining column is collapsed into a comma-separated string.
df_grouped = outpatient_train_data.groupby('Internalpatientid').agg(_join_non_null)

# Move 'Internalpatientid' from the index back into a regular column.
df_grouped_outpatients = df_grouped.reset_index()
df_grouped_outpatients

Unnamed: 0,Internalpatientid,outpatients_First_listed_diagnosis_icd10_subcategory,outpatients_Second_listed_diagnosis_icd10_subcategory
0,1,80_Other specified inflammatory liver diseases...,80_Type 2 diabetes mellitus without complicati...
1,2,69_Encounter for screening for other specified...,"69_Not specified,69_Not specified"
2,4,"84_Unspecified abdominal pain,84_Problems rela...","84_Not specified,84_Not specified,84_Not speci..."
3,8,"69_Essential (primary) hypertension,69_Essenti...","69_Not specified,69_Not specified"
4,10,72_Sleep apnea,72_Not specified
...,...,...,...
68678,169053,"81_Abnormal weight loss,81_Delirium due to kno...","81_Encounter for palliative care,81_Not specif..."
68679,169056,"88_Peripheral vascular disease, unspecified","88_Hyperlipidemia, unspecified"
68680,169057,"86_Presence of cardiac pacemaker,86_Ischemic c...","86_Not specified,86_Venous insufficiency (chro..."
68681,169060,72_Encounter for other specified aftercare,72_Not specified


# Saving the CSV file

In [100]:
# Write the per-patient frame to the shared output folder.
# NOTE(review): absolute, user-specific path; the row index is written as well (pandas default).
output_csv_path = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_train/df_outpatients_train.csv'
df_grouped_outpatients.to_csv(output_csv_path)

In [102]:
# Inspect the full, untruncated first-diagnosis string of the second row.
df_grouped_outpatients['outpatients_First_listed_diagnosis_icd10_subcategory'].iloc[1]

'69_Encounter for screening for other specified diseases and disorders,69_Encounter for screening for other specified diseases and disorders'

In [34]:
import os
# Record and display the kernel's current working directory.
cwd = os.getcwd()
cwd

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/2211575'