In [35]:
#A class attribute that provides access to the TabularDatasetFactory methods for creating new TabularDataset objects. 
#Usage: Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

# Identifiers of the Azure ML workspace this notebook connects to.
# (Original author's note: the subscription id can be looked up in the Azure ML studio.)
# NOTE(review): these ids are hard-coded; consider Workspace.from_config() so they live outside the notebook.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'
resource_group = 'VChamp-Team3'    # resource group name
workspace_name = 'vchamp-team3'    # workspace name

# Storage account: Algorithmia; resource group: VChamp-Team3; workspace: vchamp-team3.
# Build the Workspace handle, passing every identifier by keyword.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [36]:
# Name of the datastore registered in the workspace that holds the input file.
# (Alternative name left in the original comment: 'data_team3_synthetic_train'.)
datastore_name = 'data_team3_synthetic_quality_check'
datastore = workspace.datastores[datastore_name]

In [37]:
# Dataset.Tabular.from_delimited_files creates a TabularDataset that represents
# tabular data stored in delimited files (e.g. CSV and TSV).
source_files = [(datastore, 'immunization_qual.csv')]
dataset = Dataset.Tabular.from_delimited_files(path=source_files)

# Optional check: convert the dataset to pandas to look at it
# (for only the first 3 rows, dataset.take(3).to_pandas_dataframe() can be used).
# dataset.to_pandas_dataframe()

In [38]:
# Materialise the Azure ML dataset as a pandas DataFrame so that the pandas
# operations in the following cells can be applied to it.
immunizations_train = dataset.to_pandas_dataframe()

In [39]:
# Show the freshly loaded frame (the saved output is a truncated repr)
immunizations_train

Unnamed: 0,Column1,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,194,100229,68.021827,2000-09-25 23:27:52,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),Arizona
1,195,100229,68.021827,2000-09-25 23:27:52,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
2,253,100314,64.986914,2011-07-04 17:12:36,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Missouri
3,302,100399,77.128884,2001-08-23 14:34:16,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Minnesota
4,303,100399,78.234635,2002-10-01 18:13:15,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Minnesota
...,...,...,...,...,...,...,...,...,...,...,...
13580,2346260,99143,90.632019,2012-01-15 00:04:27,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Florida
13581,2346516,9947,67.200942,2010-10-16 22:39:38,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Ohio
13582,2346517,9947,67.200942,2010-10-16 22:39:38,INFLUENZA (HISTORICAL),,,,88.0,Not specified (no value),Ohio
13583,2346878,99941,76.400998,2018-12-19 13:33:08,"INFLUENZA, SEASONAL, INJECTABLE",,,,141.0,Not specified (no value),Maine


In [40]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Silence every warning raised from this point on.
# NOTE(review): a blanket 'ignore' filter also hides pandas deprecation/dtype warnings.
import warnings
warnings.filterwarnings('ignore')

# Update the 'notebook' config section with a larger 'max_buffer_size' value.
# NOTE(review): `cm` holds the return value of update() and is not used in the visible cells.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Set the maximum number of rows pandas displays to 1163
pd.set_option('display.max_rows', 1163)

# Elimination of the unnamed columns

In [41]:
# Remove the unwanted 'Column1' column.
# The result is rebound instead of using inplace=True, and errors='ignore' is set,
# so re-running this cell does not raise a KeyError once 'Column1' is already gone.
immunizations_train = immunizations_train.drop(columns=['Column1'], errors='ignore')

In [42]:
# No-op: rebinds immunizations_train to itself; the frame is not changed by this line.
immunizations_train = immunizations_train

In [43]:
# Inspect the frame after 'Column1' has been removed
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,100229,68.021827,2000-09-25 23:27:52,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),Arizona
1,100229,68.021827,2000-09-25 23:27:52,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
2,100314,64.986914,2011-07-04 17:12:36,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Missouri
3,100399,77.128884,2001-08-23 14:34:16,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Minnesota
4,100399,78.234635,2002-10-01 18:13:15,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Minnesota
...,...,...,...,...,...,...,...,...,...,...
13580,99143,90.632019,2012-01-15 00:04:27,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Florida
13581,9947,67.200942,2010-10-16 22:39:38,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Ohio
13582,9947,67.200942,2010-10-16 22:39:38,INFLUENZA (HISTORICAL),,,,88.0,Not specified (no value),Ohio
13583,99941,76.400998,2018-12-19 13:33:08,"INFLUENZA, SEASONAL, INJECTABLE",,,,141.0,Not specified (no value),Maine


In [46]:
# Number of distinct patient ids present in the frame
immunizations_train["Internalpatientid"].nunique()

939

# Sorting by patient ID and age

In [47]:
# Order the rows by patient id first and, within each patient, by age (both ascending)
sort_keys = ["Internalpatientid", "Age at immunization"]
immunizations_train = immunizations_train.sort_values(by=sort_keys)

In [48]:
# Inspect the frame after sorting by patient id and age
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
12616,67,48.284719,2010-08-10 01:33:24,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
12617,67,48.284719,2010-08-10 01:33:24,"TDAP (Tetanus,Diphtheria,Pertussis) (HISTORICAL)",,,,115.0,Not specified (no value),California
9187,67,48.305034,2010-08-17 11:45:26,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),California
9051,67,49.301203,2011-08-16 14:09:21,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
8450,67,49.940158,2012-04-06 03:03:52,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
...,...,...,...,...,...,...,...,...,...,...
5347,168899,89.554915,2012-05-17 07:42:45,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Pennsylvania
5342,168899,90.568326,2013-05-22 17:21:10,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Pennsylvania
11202,168899,92.401948,2015-03-23 21:53:17,"FLU,3 YRS (HISTORICAL)",,,Non-VA Location,88.0,Not specified (no value),Pennsylvania
10848,168899,93.483974,2016-04-22 09:24:59,PNEUMOCOCCAL CONJUGATE PCV 13,,,,133.0,Not specified (no value),Pennsylvania


# Checking for missing values

In [49]:
# Count the missing (NaN) entries in every column and print the per-column totals
missing_per_column = immunizations_train.isna().sum()
print("Training set missing values:\n", missing_per_column)

Training set missing values:
 Internalpatientid             0
Age at immunization           0
Immunization date             0
Immunization                  0
Dose quantity             13558
Dose unit                 13558
Administered elsewhere    11640
Cvx code                    121
Series doses                  0
State                         0
dtype: int64


* **There are many missing values in the columns "Dose quantity" (13558), "Dose unit" (13558), "Administered elsewhere" (11640) and "Cvx code" (121). The other variables, including "Immunization", have no missing values in this dataset.**

**Note: "Immunization" is the attribute of interest here, so any rows where it is missing are removed. In this dataset it has no missing values, so the removal step below drops no rows.**

# Removing rows with missing values

In [50]:
# Discard every row whose "Immunization" value is missing
immunizations_train = immunizations_train.dropna(subset=["Immunization"])

In [51]:
# No-op: the name is assigned to itself, so the frame stays exactly as it was.
immunizations_train = immunizations_train

In [52]:
# Inspect the frame after the dropna step
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
12616,67,48.284719,2010-08-10 01:33:24,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
12617,67,48.284719,2010-08-10 01:33:24,"TDAP (Tetanus,Diphtheria,Pertussis) (HISTORICAL)",,,,115.0,Not specified (no value),California
9187,67,48.305034,2010-08-17 11:45:26,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),California
9051,67,49.301203,2011-08-16 14:09:21,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
8450,67,49.940158,2012-04-06 03:03:52,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
...,...,...,...,...,...,...,...,...,...,...
5347,168899,89.554915,2012-05-17 07:42:45,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Pennsylvania
5342,168899,90.568326,2013-05-22 17:21:10,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Pennsylvania
11202,168899,92.401948,2015-03-23 21:53:17,"FLU,3 YRS (HISTORICAL)",,,Non-VA Location,88.0,Not specified (no value),Pennsylvania
10848,168899,93.483974,2016-04-22 09:24:59,PNEUMOCOCCAL CONJUGATE PCV 13,,,,133.0,Not specified (no value),Pennsylvania


In [53]:
# Re-count the missing values per column now that the dropna step has run
remaining_missing = immunizations_train.isna().sum()
print("Training set missing values:\n", remaining_missing)

Training set missing values:
 Internalpatientid             0
Age at immunization           0
Immunization date             0
Immunization                  0
Dose quantity             13558
Dose unit                 13558
Administered elsewhere    11640
Cvx code                    121
Series doses                  0
State                         0
dtype: int64


# Rounding the age column to two decimal places

In [54]:
# Render every 'Age at immunization' value as text with exactly two decimals (e.g. 48.28).
# After this step the column holds strings, not numbers.
immunizations_train["Age at immunization"] = immunizations_train["Age at immunization"].map(
    lambda age: f"{age:.2f}"
)

In [55]:
# Inspect the frame with the ages formatted to two decimals
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
12616,67,48.28,2010-08-10 01:33:24,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
12617,67,48.28,2010-08-10 01:33:24,"TDAP (Tetanus,Diphtheria,Pertussis) (HISTORICAL)",,,,115.0,Not specified (no value),California
9187,67,48.31,2010-08-17 11:45:26,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),California
9051,67,49.30,2011-08-16 14:09:21,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
8450,67,49.94,2012-04-06 03:03:52,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
...,...,...,...,...,...,...,...,...,...,...
5347,168899,89.55,2012-05-17 07:42:45,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Pennsylvania
5342,168899,90.57,2013-05-22 17:21:10,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Pennsylvania
11202,168899,92.40,2015-03-23 21:53:17,"FLU,3 YRS (HISTORICAL)",,,Non-VA Location,88.0,Not specified (no value),Pennsylvania
10848,168899,93.48,2016-04-22 09:24:59,PNEUMOCOCCAL CONJUGATE PCV 13,,,,133.0,Not specified (no value),Pennsylvania


In [56]:
# Convert the 'Age at immunization' column from object (formatted text) back to float
immunizations_train["Age at immunization"] = immunizations_train["Age at immunization"].astype(float)

In [57]:
# Inspect the frame after the age column has been converted back to float
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
12616,67,48.28,2010-08-10 01:33:24,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
12617,67,48.28,2010-08-10 01:33:24,"TDAP (Tetanus,Diphtheria,Pertussis) (HISTORICAL)",,,,115.0,Not specified (no value),California
9187,67,48.31,2010-08-17 11:45:26,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),California
9051,67,49.30,2011-08-16 14:09:21,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
8450,67,49.94,2012-04-06 03:03:52,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),California
...,...,...,...,...,...,...,...,...,...,...
5347,168899,89.55,2012-05-17 07:42:45,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Pennsylvania
5342,168899,90.57,2013-05-22 17:21:10,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Pennsylvania
11202,168899,92.40,2015-03-23 21:53:17,"FLU,3 YRS (HISTORICAL)",,,Non-VA Location,88.0,Not specified (no value),Pennsylvania
10848,168899,93.48,2016-04-22 09:24:59,PNEUMOCOCCAL CONJUGATE PCV 13,,,,133.0,Not specified (no value),Pennsylvania


# Keeping only the rows at each patient's maximum age

In [59]:
# Highest recorded age per patient, returned as a two-column frame
# ('Internalpatientid', 'Age at immunization')
max_ages = immunizations_train.groupby('Internalpatientid', as_index=False)['Age at immunization'].max()

# Inner-join on both columns so that only the rows recorded at each patient's
# maximum age are kept (several rows survive when they share that age)
immunizations_train = immunizations_train.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at immunization'],
)

# Rounding off the Age 

In [60]:
# Round each age to the nearest whole year and store it as an integer.
# Series.round() is vectorised and replaces the per-element apply(lambda x: round(x,));
# both round halves to the nearest even integer.
immunizations_train['Age at immunization'] = immunizations_train['Age at immunization'].round().astype('int64')

In [61]:
# Number of distinct patients left after keeping only the max-age rows.
# Uses nunique(), the same check applied before the filtering, instead of len(value_counts()).
immunizations_train["Internalpatientid"].nunique()

939

# Combining the 'Age at immunization' and 'Immunization' columns with '_'

In [62]:
# Build a new 'immunization' column of the form '<age>_<immunization name>'.
# The age is cast to string first, because values of different dtypes cannot be concatenated.
age_as_text = immunizations_train['Age at immunization'].astype(str)
immunizations_train['immunization'] = age_as_text + '_' + immunizations_train['Immunization']

# Dropping all unnecessary columns

In [63]:
# Remove the columns that are no longer needed now that the combined
# 'immunization' column exists.
# The result is rebound instead of using inplace=True, and errors='ignore' is set,
# so re-running this cell does not raise a KeyError for already-removed columns.
columns_to_drop = [
    'Age at immunization',
    'Immunization date',
    'Immunization',
    'Dose quantity',
    'Dose unit',
    'Administered elsewhere',
    'Cvx code',
    'Series doses',
    'State',
]
immunizations_train = immunizations_train.drop(columns=columns_to_drop, errors='ignore')

In [64]:
# No-op: self-assignment; it leaves immunizations_train untouched.
immunizations_train = immunizations_train

In [65]:
# For every patient, join that patient's 'immunization' strings into one
# comma-separated string. dropna() removes missing entries first so that only
# non-null values end up in the joined string.
# NOTE(review): immunization names can themselves contain commas
# (e.g. "INFLUENZA, UNSPECIFIED FORMULATION"), so the joined string cannot be
# split back unambiguously on ',' — confirm this is acceptable downstream.
df_grouped = (
    immunizations_train
    .groupby('Internalpatientid')
    .agg(lambda values: ','.join(values.dropna()))
)

# Turn the 'Internalpatientid' index back into a regular column and display the result
df_grouped_imm = df_grouped.reset_index()
df_grouped_imm

Unnamed: 0,Internalpatientid,immunization
0,67,"58_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/..."
1,200,"87_INFLUENZA VACCINE, QUADRIVALENT, ADJUVANTED"
2,291,"82_FLU,3 YRS (HISTORICAL),82_INFLUENZA, UNSPEC..."
3,330,"75_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/..."
4,351,82_TDAP
5,714,"67_INFLUENZA, UNSPECIFIED FORMULATION"
6,785,"78_INFLUENZA, UNSPECIFIED FORMULATION,78_FLU,3..."
7,960,60_TD(ADULT) UNSPECIFIED FORMULATION
8,1429,"81_PNEUMOCOCCAL, UNSPECIFIED FORMULATION,81_IN..."
9,1451,"86_COVID-19 (MODERNA), MRNA, LNP-S, PF, 100 MC..."


In [32]:
# Print one full (untruncated) 'immunization' value to check that it is joined like A,B,C...
# NOTE(review): this cell's execution count (32) is out of sequence with its neighbours,
# so its saved output may come from an earlier run — re-run it after the grouping cell.
row_position = 5
column_value = df_grouped_imm['immunization'].values[row_position]
print(column_value)

69_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/0.3 ML DOSE, TRIS-SUCROSE (AGES 12+ YEARS)


In [66]:
# Save the per-patient immunization summary as a CSV file.
# NOTE(review): the path is absolute and cluster-specific; to_csv also writes the
# index as an extra unnamed first column by default — confirm downstream readers expect it.
output_csv_path = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_quality/df_immunizations_quality.csv'
df_grouped_imm.to_csv(output_csv_path)