# **Azure notebook Setup**

In [1]:
# Workspace is the handle to the Azure ML workspace; Dataset.Tabular exposes the
# TabularDatasetFactory helpers, e.g. Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

# Identifiers of the workspace this notebook connects to.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'  # shown in the Azure ML studio
resource_group = 'VChamp-Team3'  # resource group name
workspace_name = 'vchamp-team3'  # workspace name

# Original notes: storage account "Algorithmia", resource group "VChamp-Team3",
# workspace "vchamp-team3".
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [3]:
# Look up the datastore named 'data_team3_synthetic_test' on the workspace.
# (The original note lists 'data_team3_synthetic_train' as the other datastore name.)
datastore = workspace.datastores['data_team3_synthetic_test'] 

In [4]:
# from_delimited_files creates a TabularDataset that represents tabular data in
# delimited files (e.g. CSV and TSV); here it points at 'procedures_test.csv'.

dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'procedures_test.csv')])

# Nothing is converted to pandas in this cell; the conversion happens in the next one.
# To preview only a few rows: dataset.take(3).to_pandas_dataframe()

In [5]:
# Convert the Azure TabularDataset into a pandas DataFrame, the format used by
# every later cell in this notebook.
procedures_test_data = dataset.to_pandas_dataframe()

In [6]:
# Preview the loaded frame (pandas abbreviates the repr of large frames).
procedures_test_data

Unnamed: 0,Column1,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State
0,112,100,52.559428,2014-04-19 11:18:05,99343,HOME VISIT FOR THE EVALUATION AND MANAGEMENT O...,New York
1,113,100,52.559428,2014-04-19 11:18:05,G0154,DIRECT SKILLED NURSING SERVICES OF A LICENSED ...,New York
2,114,100,52.753358,2014-06-29 08:27:24,97003,OCCUPATIONAL THERAPY EVALUATION,New York
3,115,100,52.753358,2014-06-29 08:27:24,97535,"SELF-CARE/HOME MANAGEMENT TRAINING (EG, ACTIVI...",New York
4,116,100,53.094739,2014-11-01 03:03:17,G0155,SERVICES OF CLINICAL SOCIAL WORKER IN HOME HEA...,New York
...,...,...,...,...,...,...,...
27185595,130980476,99997,91.227044,2020-05-15 11:37:59,81001,"URINALYSIS, BY DIP STICK OR TABLET REAGENT FOR...",North Carolina
27185596,130980477,99997,91.227044,2020-05-15 11:37:59,82728,FERRITIN,North Carolina
27185597,130980478,99997,91.227044,2020-05-15 11:37:59,85025,"BLOOD COUNT; COMPLETE (CBC), AUTOMATED (HGB, H...",North Carolina
27185598,130980479,99997,91.227044,2020-05-15 11:37:59,82746,FOLIC ACID; SERUM,North Carolina


# Importing the necessary libraries

In [7]:
import numpy as np                # N-dimensional arrays
import pandas as pd               # Data manipulation
import matplotlib.pyplot as plt   # Plotting
import seaborn as sns             # Statistical plotting

# display() renders DataFrames with their rich notebook representation
from IPython.display import display 

# Silence ALL Python warnings for the rest of the session. This also hides
# pandas warnings such as SettingWithCopyWarning, so real problems can go unnoticed.
import warnings
warnings.filterwarnings('ignore')

# Intended to raise the notebook's buffer/memory limit.
# NOTE(review): this stores 'max_buffer_size' in the 'notebook' config section via
# ConfigManager; confirm that this actually changes the running server's limit.
# `cm` receives the return value of update(), not the ConfigManager instance.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})


# Let pandas display up to 1163 rows before truncating a frame's repr
pd.set_option('display.max_rows', 1163)

# Removing the unwanted 'Column1' column

In [8]:
# Remove the unwanted 'Column1' column.
# The drop returns a new frame and ignores a missing column, so re-running this
# cell after 'Column1' is already gone does not raise a KeyError.
procedures_test_data = procedures_test_data.drop(columns='Column1', errors='ignore')

In [9]:
# Working name for the rest of the pipeline.
# Note: this binds a second name to the same DataFrame object (no copy is made),
# so later in-place operations on df_procedures also affect procedures_test_data.
df_procedures = procedures_test_data

In [10]:
# Preview the working frame after 'Column1' has been removed.
df_procedures

Unnamed: 0,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State
0,100,52.559428,2014-04-19 11:18:05,99343,HOME VISIT FOR THE EVALUATION AND MANAGEMENT O...,New York
1,100,52.559428,2014-04-19 11:18:05,G0154,DIRECT SKILLED NURSING SERVICES OF A LICENSED ...,New York
2,100,52.753358,2014-06-29 08:27:24,97003,OCCUPATIONAL THERAPY EVALUATION,New York
3,100,52.753358,2014-06-29 08:27:24,97535,"SELF-CARE/HOME MANAGEMENT TRAINING (EG, ACTIVI...",New York
4,100,53.094739,2014-11-01 03:03:17,G0155,SERVICES OF CLINICAL SOCIAL WORKER IN HOME HEA...,New York
...,...,...,...,...,...,...
27185595,99997,91.227044,2020-05-15 11:37:59,81001,"URINALYSIS, BY DIP STICK OR TABLET REAGENT FOR...",North Carolina
27185596,99997,91.227044,2020-05-15 11:37:59,82728,FERRITIN,North Carolina
27185597,99997,91.227044,2020-05-15 11:37:59,85025,"BLOOD COUNT; COMPLETE (CBC), AUTOMATED (HGB, H...",North Carolina
27185598,99997,91.227044,2020-05-15 11:37:59,82746,FOLIC ACID; SERUM,North Carolina


# Sorting by patient ID and age

In [11]:
# Order rows by patient id and, within each patient, by increasing age.
# The sort happens in place, so every name bound to this frame sees the new order.
sort_keys = ["Internalpatientid", "Age at procedure"]
df_procedures.sort_values(by=sort_keys, ascending=True, inplace=True)

In [12]:
# Preview the frame after sorting by patient id and age.
df_procedures

Unnamed: 0,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State
17952646,6,74.147397,2000-10-29 20:58:06,90718,TETANUS AND DIPHTHERIA TOXOIDS (TD) ADSORBED W...,Texas
17914335,6,74.228196,2000-11-28 09:43:53,90718,TETANUS AND DIPHTHERIA TOXOIDS (TD) ADSORBED W...,Texas
25708445,6,74.767093,2001-06-13 08:56:37,99203,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Texas
18044415,6,74.767095,2001-06-13 08:57:38,99203,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Texas
17979814,6,74.842250,2001-07-10 20:12:49,85999,UNLISTED HEMATOLOGY AND COAGULATION PROCEDURE,Texas
...,...,...,...,...,...,...
11197211,169065,53.316365,2011-06-10 03:09:03,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
11197212,169065,53.316936,2011-06-10 08:09:33,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
11195318,169065,53.320380,2011-06-11 14:22:18,99499,UNLISTED EVALUATION AND MANAGEMENT SERVICE,Arizona
11195319,169065,53.320380,2011-06-11 14:22:18,99211,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Arizona


# Checking for missing values

In [13]:
# Count missing values per column. This notebook processes the test split
# ('procedures_test.csv'), so the report is labelled "Test set".
print("Test set missing values:\n", df_procedures.isna().sum())

Training set missing values:
 Internalpatientid             0
Age at procedure              0
Procedure date                0
Procedure code                0
Procedure code description    0
State                         0
dtype: int64


* **No column has missing values: the output above reports 0 for every column, including "Procedure code".**

* **Note: the attribute of interest, "Procedure code description", has no missing values.**

# Keeping two decimal places in the age column

In [14]:
# Render 'Age at procedure' with exactly two decimals. The result is a column of
# strings (e.g. "74.15"); a later cell converts it back to float.
# NOTE(review): Series.round(2) would avoid the string round-trip, but it can
# differ in the last digit for values that sit on a rounding boundary.
df_procedures["Age at procedure"] = df_procedures["Age at procedure"].map(
    lambda age: format(age, ".2f")
)

In [15]:
# Preview the frame; 'Age at procedure' now shows two decimals.
df_procedures

Unnamed: 0,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State
17952646,6,74.15,2000-10-29 20:58:06,90718,TETANUS AND DIPHTHERIA TOXOIDS (TD) ADSORBED W...,Texas
17914335,6,74.23,2000-11-28 09:43:53,90718,TETANUS AND DIPHTHERIA TOXOIDS (TD) ADSORBED W...,Texas
25708445,6,74.77,2001-06-13 08:56:37,99203,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Texas
18044415,6,74.77,2001-06-13 08:57:38,99203,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Texas
17979814,6,74.84,2001-07-10 20:12:49,85999,UNLISTED HEMATOLOGY AND COAGULATION PROCEDURE,Texas
...,...,...,...,...,...,...
11197211,169065,53.32,2011-06-10 03:09:03,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
11197212,169065,53.32,2011-06-10 08:09:33,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
11195318,169065,53.32,2011-06-11 14:22:18,99499,UNLISTED EVALUATION AND MANAGEMENT SERVICE,Arizona
11195319,169065,53.32,2011-06-11 14:22:18,99211,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Arizona


In [16]:
# Make sure 'Age at procedure' is stored as float64 so it can be compared and
# aggregated numerically in the following cells.
age_column = "Age at procedure"
df_procedures[age_column] = df_procedures[age_column].astype("float64")

In [17]:
# Preview the frame after the dtype conversion of 'Age at procedure'.
df_procedures

Unnamed: 0,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State
17952646,6,74.15,2000-10-29 20:58:06,90718,TETANUS AND DIPHTHERIA TOXOIDS (TD) ADSORBED W...,Texas
17914335,6,74.23,2000-11-28 09:43:53,90718,TETANUS AND DIPHTHERIA TOXOIDS (TD) ADSORBED W...,Texas
25708445,6,74.77,2001-06-13 08:56:37,99203,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Texas
18044415,6,74.77,2001-06-13 08:57:38,99203,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Texas
17979814,6,74.84,2001-07-10 20:12:49,85999,UNLISTED HEMATOLOGY AND COAGULATION PROCEDURE,Texas
...,...,...,...,...,...,...
11197211,169065,53.32,2011-06-10 03:09:03,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
11197212,169065,53.32,2011-06-10 08:09:33,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
11195318,169065,53.32,2011-06-11 14:22:18,99499,UNLISTED EVALUATION AND MANAGEMENT SERVICE,Arizona
11195319,169065,53.32,2011-06-11 14:22:18,99211,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Arizona


# Max age

In [18]:
# Highest recorded age per patient, as a two-column frame (patient id, max age).
max_ages = df_procedures.groupby('Internalpatientid', as_index=False)['Age at procedure'].max()

# Inner-join on (patient id, age): only rows recorded at each patient's maximum
# age survive. Several rows can share that age, so a patient may keep many rows.
df_procedures = df_procedures.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at procedure'],
)

df_procedures

Unnamed: 0,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State
0,6,88.10,2014-10-16 01:23:22,98966,TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE PR...,Texas
1,7,74.37,2020-11-22 12:50:02,D1206,TOPICAL FLUORIDE VARNISH; THERAPEUTIC APPLICAT...,Nebraska
2,7,74.37,2020-11-22 12:50:02,D2331,"RESIN-TWO SURFACES, ANTERIOR",Nebraska
3,9,51.88,2000-04-25 20:51:31,37.78,INSERTION OF TEMPORARY TRANSVENOUS PACEMAKER S...,Texas
4,9,51.88,2000-04-25 20:51:31,96.05,OTHER INTUBATION OF RESPIRATORY TRACT,Texas
...,...,...,...,...,...,...
130057,169065,53.32,2011-06-10 03:09:03,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
130058,169065,53.32,2011-06-10 08:09:33,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
130059,169065,53.32,2011-06-11 14:22:18,99499,UNLISTED EVALUATION AND MANAGEMENT SERVICE,Arizona
130060,169065,53.32,2011-06-11 14:22:18,99211,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Arizona


# Rounding off the Age 

In [19]:
# Round ages to the nearest whole number and store them as integers.
# Series.round() is vectorised and, like Python's built-in round(), rounds exact
# halves to the nearest even value, so the results match the previous
# element-wise apply(round).
df_procedures['Age at procedure'] = df_procedures['Age at procedure'].round().astype('int64')

In [20]:
# Number of distinct patients remaining after the max-age filter.
df_procedures["Internalpatientid"].nunique()

34812

In [21]:
# Preview the frame with ages rounded to whole numbers.
df_procedures

Unnamed: 0,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State
0,6,88,2014-10-16 01:23:22,98966,TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE PR...,Texas
1,7,74,2020-11-22 12:50:02,D1206,TOPICAL FLUORIDE VARNISH; THERAPEUTIC APPLICAT...,Nebraska
2,7,74,2020-11-22 12:50:02,D2331,"RESIN-TWO SURFACES, ANTERIOR",Nebraska
3,9,52,2000-04-25 20:51:31,37.78,INSERTION OF TEMPORARY TRANSVENOUS PACEMAKER S...,Texas
4,9,52,2000-04-25 20:51:31,96.05,OTHER INTUBATION OF RESPIRATORY TRACT,Texas
...,...,...,...,...,...,...
130057,169065,53,2011-06-10 03:09:03,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
130058,169065,53,2011-06-10 08:09:33,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
130059,169065,53,2011-06-11 14:22:18,99499,UNLISTED EVALUATION AND MANAGEMENT SERVICE,Arizona
130060,169065,53,2011-06-11 14:22:18,99211,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Arizona


In [22]:
# Collect the ids of patients that have more than PROCEDURE_COUNT_CUTOFF rows at
# their maximum age; a later cell excludes these patients.
# This single cell replaces four separate steps (count -> reset_index -> rename
# columns -> drop the count column). It is safe to re-run and does not depend on
# how a given pandas version names the value_counts() columns.
PROCEDURE_COUNT_CUTOFF = 30

procedure_counts = df_procedures["Internalpatientid"].value_counts()
over_cutoff = procedure_counts[procedure_counts > PROCEDURE_COUNT_CUTOFF]

# One column, default integer index, ordered by descending row count.
df3 = pd.DataFrame({"Internalpatientid": over_cutoff.index})

In [28]:
# Number of patient ids collected in df3.
len(df3)

340

In [29]:
# Exclude every patient listed in df3: keep only the rows whose patient id is
# NOT in df3. df3 holds nothing but the id column, so no merge is needed first;
# filtering df_procedures directly yields the same rows.
is_excluded = df_procedures['Internalpatientid'].isin(df3['Internalpatientid'])

# .copy() makes the result an independent frame, so later cells can add or drop
# columns without modifying a slice of df_procedures.
filtered_df = df_procedures.loc[~is_excluded].copy()

filtered_df

Unnamed: 0,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State
0,6,88,2014-10-16 01:23:22,98966,TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE PR...,Texas
1,7,74,2020-11-22 12:50:02,D1206,TOPICAL FLUORIDE VARNISH; THERAPEUTIC APPLICAT...,Nebraska
2,7,74,2020-11-22 12:50:02,D2331,"RESIN-TWO SURFACES, ANTERIOR",Nebraska
3,9,52,2000-04-25 20:51:31,37.78,INSERTION OF TEMPORARY TRANSVENOUS PACEMAKER S...,Texas
4,9,52,2000-04-25 20:51:31,96.05,OTHER INTUBATION OF RESPIRATORY TRACT,Texas
...,...,...,...,...,...,...
130057,169065,53,2011-06-10 03:09:03,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
130058,169065,53,2011-06-10 08:09:33,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
130059,169065,53,2011-06-11 14:22:18,99499,UNLISTED EVALUATION AND MANAGEMENT SERVICE,Arizona
130060,169065,53,2011-06-11 14:22:18,99211,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Arizona


In [30]:
# Number of distinct patients left after the exclusion.
filtered_df.Internalpatientid.nunique() 

34472

In [31]:
# Continue the pipeline with the filtered rows under the working name.
# Note: this binds df_procedures to the same object as filtered_df (no copy).
df_procedures = filtered_df

In [32]:
# Preview the frame after excluding the patients listed in df3.
df_procedures

Unnamed: 0,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State
0,6,88,2014-10-16 01:23:22,98966,TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE PR...,Texas
1,7,74,2020-11-22 12:50:02,D1206,TOPICAL FLUORIDE VARNISH; THERAPEUTIC APPLICAT...,Nebraska
2,7,74,2020-11-22 12:50:02,D2331,"RESIN-TWO SURFACES, ANTERIOR",Nebraska
3,9,52,2000-04-25 20:51:31,37.78,INSERTION OF TEMPORARY TRANSVENOUS PACEMAKER S...,Texas
4,9,52,2000-04-25 20:51:31,96.05,OTHER INTUBATION OF RESPIRATORY TRACT,Texas
...,...,...,...,...,...,...
130057,169065,53,2011-06-10 03:09:03,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
130058,169065,53,2011-06-10 08:09:33,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona
130059,169065,53,2011-06-11 14:22:18,99499,UNLISTED EVALUATION AND MANAGEMENT SERVICE,Arizona
130060,169065,53,2011-06-11 14:22:18,99211,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Arizona


# Concatenating the 'Age at procedure' and 'Procedure code description' columns with '_'

In [33]:
# Build the combined feature 'procedures_code_description' as
# "<age>_<procedure description>". The age is cast to str first because a number
# cannot be concatenated with a string directly.
# assign() returns a new frame instead of writing into the existing one, so this
# works without a SettingWithCopyWarning even when df_procedures is a filtered
# slice of another frame.
df_procedures = df_procedures.assign(
    procedures_code_description=(
        df_procedures['Age at procedure'].astype(str)
        + '_'
        + df_procedures['Procedure code description']
    )
)

In [34]:
# Preview the frame with the new 'procedures_code_description' column.
df_procedures

Unnamed: 0,Internalpatientid,Age at procedure,Procedure date,Procedure code,Procedure code description,State,procedures_code_description
0,6,88,2014-10-16 01:23:22,98966,TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE PR...,Texas,88_TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE...
1,7,74,2020-11-22 12:50:02,D1206,TOPICAL FLUORIDE VARNISH; THERAPEUTIC APPLICAT...,Nebraska,74_TOPICAL FLUORIDE VARNISH; THERAPEUTIC APPLI...
2,7,74,2020-11-22 12:50:02,D2331,"RESIN-TWO SURFACES, ANTERIOR",Nebraska,"74_RESIN-TWO SURFACES, ANTERIOR"
3,9,52,2000-04-25 20:51:31,37.78,INSERTION OF TEMPORARY TRANSVENOUS PACEMAKER S...,Texas,52_INSERTION OF TEMPORARY TRANSVENOUS PACEMAKE...
4,9,52,2000-04-25 20:51:31,96.05,OTHER INTUBATION OF RESPIRATORY TRACT,Texas,52_OTHER INTUBATION OF RESPIRATORY TRACT
...,...,...,...,...,...,...,...
130057,169065,53,2011-06-10 03:09:03,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona,"53_RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW,..."
130058,169065,53,2011-06-10 08:09:33,71010,"RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FR...",Arizona,"53_RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW,..."
130059,169065,53,2011-06-11 14:22:18,99499,UNLISTED EVALUATION AND MANAGEMENT SERVICE,Arizona,53_UNLISTED EVALUATION AND MANAGEMENT SERVICE
130060,169065,53,2011-06-11 14:22:18,99211,OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALU...,Arizona,53_OFFICE OR OTHER OUTPATIENT VISIT FOR THE EV...


# Dropping all unnecessary columns

In [35]:
# Keep only the patient id and the combined feature; age and description are
# already folded into 'procedures_code_description', and date, code and state
# are not used by the remaining cells.
# Selecting the columns (instead of dropping in place) makes this cell safe to
# re-run and leaves any other frame that shares the data untouched.
df_procedures = df_procedures[["Internalpatientid", "procedures_code_description"]]

In [36]:
# NOTE(review): self-assignment has no effect; this cell can be deleted.
df_procedures = df_procedures

In [37]:
# Preview the remaining columns.
df_procedures

Unnamed: 0,Internalpatientid,procedures_code_description
0,6,88_TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE...
1,7,74_TOPICAL FLUORIDE VARNISH; THERAPEUTIC APPLI...
2,7,"74_RESIN-TWO SURFACES, ANTERIOR"
3,9,52_INSERTION OF TEMPORARY TRANSVENOUS PACEMAKE...
4,9,52_OTHER INTUBATION OF RESPIRATORY TRACT
...,...,...
130057,169065,"53_RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW,..."
130058,169065,"53_RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW,..."
130059,169065,53_UNLISTED EVALUATION AND MANAGEMENT SERVICE
130060,169065,53_OFFICE OR OTHER OUTPATIENT VISIT FOR THE EV...


In [38]:
def _join_non_null(values):
    """Return the non-null entries of ``values`` joined with commas."""
    return ','.join(values.dropna())


# Collapse to one row per patient: every 'procedures_code_description' value of a
# patient is concatenated into a single comma-separated string. Missing values
# are dropped first so that only real strings reach the join.
# NOTE(review): the descriptions themselves contain commas, so the joined string
# cannot be split back into individual procedures unambiguously on ','.
df_grouped = df_procedures.groupby('Internalpatientid').agg(_join_non_null)

# Turn the patient id back into a regular column and show the result.
df_grouped_procedures = df_grouped.reset_index()
df_grouped_procedures

Unnamed: 0,Internalpatientid,procedures_code_description
0,6,88_TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE...
1,7,74_TOPICAL FLUORIDE VARNISH; THERAPEUTIC APPLI...
2,9,52_INSERTION OF TEMPORARY TRANSVENOUS PACEMAKE...
3,12,74_LIPID PANELTHIS PANEL MUST INCLUDE THE FOLL...
4,17,82_TREATMENT OF SWALLOWING DYSFUNCTION AND/OR ...
...,...,...
34467,169037,88_TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE...
34468,169045,98_EDUCATION AND TRAINING FOR PATIENT SELF-MAN...
34469,169058,79_OFFICE OR OTHER OUTPATIENT VISIT FOR THE EV...
34470,169059,91_TELEPHONE ASSESSMENT AND MANAGEMENT SERVICE...


# Saving the csv file

In [39]:
# Write the per-patient features to the hard-coded output location as CSV.
# NOTE(review): with the default settings the DataFrame index is written as an
# extra unnamed first column; confirm whether downstream readers rely on it
# before switching to index=False.
output_csv_path = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_test/df_procedures_test.csv'
df_grouped_procedures.to_csv(output_csv_path)

In [41]:
# Inspect the full concatenated string of the last patient. Indexing from the
# end avoids hard-coding the row position (previously 34471), which would raise
# an IndexError as soon as the number of patients changes.
df_grouped_procedures['procedures_code_description'].iloc[-1]

'53_RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FRONTAL,53_RADIOLOGIC EXAMINATION, CHEST; SINGLE VIEW, FRONTAL,53_UNLISTED EVALUATION AND MANAGEMENT SERVICE,53_OFFICE OR OTHER OUTPATIENT VISIT FOR THE EVALUATION AND MANAGEMENT OF ANESTABLISHED PATIENT, THAT MAY NOT REQUIRE THE PRESENCE OF A PHYSICIAN OR OTHERQUALIFIED HEALTH CARE PROFESSIONAL. USUALLY, THE PRESENTING PROBLEM(S) AREMINIMAL.,53_UNLISTED EVALUATION AND MANAGEMENT SERVICE'