In [1]:
# Connect to the Azure ML workspace that holds the team's data.
# `Dataset` is imported here as well: its `Dataset.Tabular` attribute gives access to the
# TabularDatasetFactory methods, e.g. Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'  # this id can be looked up in the Azure ML studio
resource_group = 'VChamp-Team3'  # resource group name
workspace_name = 'vchamp-team3'  # workspace name


# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
# Construct the workspace handle; arguments are passed by keyword so each role is explicit.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Look up the datastore named 'data_team3_synthetic_train' among the workspace's datastores
datastore = workspace.datastores['data_team3_synthetic_train'] 

In [3]:
# from_delimited_files creates a TabularDataset that represents tabular data in delimited files (e.g. CSV and TSV).
# Here it points at 'immunization_train.csv' inside the datastore.

dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'immunization_train.csv')])

# To load the dataset into a pandas DataFrame (this reads the full file, not a short preview):
# dataset.to_pandas_dataframe()

In [4]:
# Convert the Azure TabularDataset into a pandas DataFrame so that the pandas API can be used on it
immunizations_train = dataset.to_pandas_dataframe()

In [5]:
# Inspect the frame as loaded (the notebook renders a truncated head/tail view)
immunizations_train

Unnamed: 0,Column1,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,3,100000,63.565815,2019-10-03 07:20:00,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),Ohio
1,4,100000,63.565815,2019-10-03 07:20:00,PNEUMOCOCCAL POLYSACCHARIDE PPV23,,,,33.0,Not specified (no value),Ohio
2,5,100001,84.407099,2010-05-23 19:04:26,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),North Carolina
3,6,100008,70.572815,2019-10-06 08:44:11,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),North Carolina
4,7,100008,71.570887,2020-10-05 03:49:18,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Complete,North Carolina
...,...,...,...,...,...,...,...,...,...,...,...
1847640,2347857,99988,77.191685,2003-12-20 13:27:45,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Florida
1847641,2347858,99993,58.944520,2008-01-24 05:19:38,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),Kentucky
1847642,2347861,99999,87.997774,2004-11-27 04:28:15,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),California
1847643,2347862,99999,89.989158,2006-11-25 00:53:27,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),California


In [6]:
import numpy as np                # Multi-Dimensional array object
import pandas as pd               # Data Manipulation
import matplotlib.pyplot as plt   # Data Visualization
import seaborn as sns             # Data Visualization 

# Allows the use of display() for DataFrames
from IPython.display import display 

# Suppress all warning messages raised during code execution
import warnings
warnings.filterwarnings('ignore')

# Intended to raise the notebook's 'max_buffer_size' limit (7294967296, roughly 7.3e9).
# NOTE(review): `cm` receives the return value of update(), not the ConfigManager instance;
# confirm that writing this key to the 'notebook' config section actually takes effect.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Set the maximum number of rows to be displayed to 1163
pd.set_option('display.max_rows', 1163)

# Elimination of the unnamed columns

In [8]:
# Remove the unwanted 'Column1' column.
# The result is re-bound instead of mutated in place, and errors='ignore' makes the cell
# safe to re-run: once the column is gone, a second run is a no-op rather than a KeyError.
immunizations_train = immunizations_train.drop(columns=['Column1'], errors='ignore')

In [9]:
# NOTE(review): self-assignment is a no-op; `immunizations_train` already refers to the
# frame without 'Column1', so this cell can be removed.
immunizations_train = immunizations_train

In [10]:
# Inspect the frame now that 'Column1' is gone
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,100000,63.565815,2019-10-03 07:20:00,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),Ohio
1,100000,63.565815,2019-10-03 07:20:00,PNEUMOCOCCAL POLYSACCHARIDE PPV23,,,,33.0,Not specified (no value),Ohio
2,100001,84.407099,2010-05-23 19:04:26,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),North Carolina
3,100008,70.572815,2019-10-06 08:44:11,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),North Carolina
4,100008,71.570887,2020-10-05 03:49:18,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Complete,North Carolina
...,...,...,...,...,...,...,...,...,...,...
1847640,99988,77.191685,2003-12-20 13:27:45,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Florida
1847641,99993,58.944520,2008-01-24 05:19:38,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),Kentucky
1847642,99999,87.997774,2004-11-27 04:28:15,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),California
1847643,99999,89.989158,2006-11-25 00:53:27,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),California


# Sorting by patient ID and age

In [11]:
# Order the rows by patient id and, within each patient, by age at immunization (both ascending)
sort_keys = ["Internalpatientid", "Age at immunization"]
immunizations_train = immunizations_train.sort_values(by=sort_keys)

In [12]:
# Inspect the frame after sorting by patient id and age
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
28045,1,57.573116,2001-10-30 23:22:22,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Indiana
32065,1,58.636284,2002-11-23 13:29:02,INFLUENZA (HISTORICAL),,,,88.0,Complete,Indiana
30498,1,61.554094,2005-10-25 00:31:01,INFLUENZA (HISTORICAL),,,,88.0,Complete,Indiana
30499,1,61.554094,2005-10-25 00:31:01,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Indiana
809,1,62.296668,2006-07-23 10:22:13,PNEUMOCOCCAL POLYSACCHARIDE PPV23,,,,33.0,Complete,Indiana
...,...,...,...,...,...,...,...,...,...,...
683434,169064,82.300501,2009-04-05 19:59:40,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
755077,169064,84.360394,2011-04-28 17:22:31,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
754183,169064,85.213023,2012-03-05 08:38:31,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
1474476,169064,86.408098,2013-05-16 03:50:05,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming


# Checking for missing values

In [13]:
# Count the missing (NaN) entries in every column and print the per-column totals
missing_per_column = immunizations_train.isna().sum()
print("Training set missing values:\n", missing_per_column)

Training set missing values:
 Internalpatientid               0
Age at immunization             0
Immunization date               0
Immunization                    3
Dose quantity             1836499
Dose unit                 1836502
Administered elsewhere    1585089
Cvx code                    18322
Series doses                    0
State                           0
dtype: int64


* **There are many missing values in the columns "Dose quantity" (1836499), "Dose unit" (1836502), "Administered elsewhere" (1585089), "Cvx code" (18322), and "Immunization" (3). The other variables have no missing values in this dataset.**

**Note: The potential attribute here is "Immunization"; since it has only three missing values, we can remove those rows.**

# Removing the missing values

In [14]:
# Keep only the rows where "Immunization" is present; rows with a missing value in that column are discarded
immunizations_train = immunizations_train.dropna(subset=["Immunization"])

In [15]:
# NOTE(review): self-assignment is a no-op; `immunizations_train` already refers to the
# frame without the rows that lacked an "Immunization" value, so this cell can be removed.
immunizations_train = immunizations_train

In [16]:
# Inspect the frame after removing rows with a missing "Immunization" value
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
28045,1,57.573116,2001-10-30 23:22:22,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Indiana
32065,1,58.636284,2002-11-23 13:29:02,INFLUENZA (HISTORICAL),,,,88.0,Complete,Indiana
30498,1,61.554094,2005-10-25 00:31:01,INFLUENZA (HISTORICAL),,,,88.0,Complete,Indiana
30499,1,61.554094,2005-10-25 00:31:01,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Indiana
809,1,62.296668,2006-07-23 10:22:13,PNEUMOCOCCAL POLYSACCHARIDE PPV23,,,,33.0,Complete,Indiana
...,...,...,...,...,...,...,...,...,...,...
683434,169064,82.300501,2009-04-05 19:59:40,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
755077,169064,84.360394,2011-04-28 17:22:31,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
754183,169064,85.213023,2012-03-05 08:38:31,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
1474476,169064,86.408098,2013-05-16 03:50:05,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming


In [17]:
# Re-count the missing entries per column to confirm that "Immunization" no longer has any
missing_after_drop = immunizations_train.isna().sum()
print("Training set missing values:\n", missing_after_drop)

Training set missing values:
 Internalpatientid               0
Age at immunization             0
Immunization date               0
Immunization                    0
Dose quantity             1836496
Dose unit                 1836499
Administered elsewhere    1585086
Cvx code                    18321
Series doses                    0
State                           0
dtype: int64


# Keeping two decimal places in the age column

In [18]:
# Render every 'Age at immunization' value as text with exactly two decimals (e.g. 57.573116 -> "57.57")
immunizations_train["Age at immunization"] = immunizations_train["Age at immunization"].map(
    lambda age: f"{age:.2f}"
)

In [19]:
# Inspect the frame with ages shown to two decimal places
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
28045,1,57.57,2001-10-30 23:22:22,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Indiana
32065,1,58.64,2002-11-23 13:29:02,INFLUENZA (HISTORICAL),,,,88.0,Complete,Indiana
30498,1,61.55,2005-10-25 00:31:01,INFLUENZA (HISTORICAL),,,,88.0,Complete,Indiana
30499,1,61.55,2005-10-25 00:31:01,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Indiana
809,1,62.30,2006-07-23 10:22:13,PNEUMOCOCCAL POLYSACCHARIDE PPV23,,,,33.0,Complete,Indiana
...,...,...,...,...,...,...,...,...,...,...
683434,169064,82.30,2009-04-05 19:59:40,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
755077,169064,84.36,2011-04-28 17:22:31,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
754183,169064,85.21,2012-03-05 08:38:31,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
1474476,169064,86.41,2013-05-16 03:50:05,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming


In [20]:
# Turn the 'Age at immunization' column from object (formatted text) back into floating-point numbers
age_as_text = immunizations_train["Age at immunization"]
immunizations_train["Age at immunization"] = age_as_text.astype(float)

In [21]:
# Inspect the frame after converting the age column back to float
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
28045,1,57.57,2001-10-30 23:22:22,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Indiana
32065,1,58.64,2002-11-23 13:29:02,INFLUENZA (HISTORICAL),,,,88.0,Complete,Indiana
30498,1,61.55,2005-10-25 00:31:01,INFLUENZA (HISTORICAL),,,,88.0,Complete,Indiana
30499,1,61.55,2005-10-25 00:31:01,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Indiana
809,1,62.30,2006-07-23 10:22:13,PNEUMOCOCCAL POLYSACCHARIDE PPV23,,,,33.0,Complete,Indiana
...,...,...,...,...,...,...,...,...,...,...
683434,169064,82.30,2009-04-05 19:59:40,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
755077,169064,84.36,2011-04-28 17:22:31,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
754183,169064,85.21,2012-03-05 08:38:31,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming
1474476,169064,86.41,2013-05-16 03:50:05,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Wyoming


# Max age

In [22]:
# For each patient id, determine the highest age at immunization
max_ages = (
    immunizations_train
    .groupby('Internalpatientid')['Age at immunization']
    .max()
    .reset_index()
)

# Inner-join back on (patient id, age) so that only the rows recorded at that maximum age remain;
# a patient keeps several rows when more than one record shares the maximum age
immunizations_train = immunizations_train.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at immunization'],
)

immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,1,79.56,2023-11-01 05:41:33,"INFLUENZA VACCINE, QUADRIVALENT, ADJUVANTED",,,,205.0,Not specified (no value),Indiana
1,2,68.81,2023-07-15 21:41:08,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Texas
2,2,68.81,2023-07-16 12:26:13,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),Texas
3,3,82.01,2005-07-04 06:19:47,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Florida
4,4,83.79,2014-09-30 11:44:52,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Puerto Rico
...,...,...,...,...,...,...,...,...,...,...
152518,169057,85.94,2025-04-06 04:16:07,"COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/0.3...",,,,217.0,4,California
152519,169061,79.83,2021-12-25 15:19:53,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),Georgia
152520,169062,71.82,2003-06-03 13:10:39,PNEUMOCOCCAL (HISTORICAL),,,,109.0,Not specified (no value),Florida
152521,169063,76.50,2004-02-10 03:09:08,"FLU,3 YRS (HISTORICAL)",,,Watkins Glen,88.0,Not specified (no value),New York


# Rounding off the Age 

In [23]:
# Round each age to the nearest whole year and store it as an integer.
# Series.round() is vectorised, replacing the per-element Python lambda; like the built-in
# round() it resolves exact .5 ties to the even neighbour, so the resulting values are unchanged.
immunizations_train['Age at immunization'] = (
    immunizations_train['Age at immunization'].round().astype('int64')
)

In [24]:
# Inspect the frame with ages rounded to whole years
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,1,80,2023-11-01 05:41:33,"INFLUENZA VACCINE, QUADRIVALENT, ADJUVANTED",,,,205.0,Not specified (no value),Indiana
1,2,69,2023-07-15 21:41:08,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Texas
2,2,69,2023-07-16 12:26:13,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),Texas
3,3,82,2005-07-04 06:19:47,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Florida
4,4,84,2014-09-30 11:44:52,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Puerto Rico
...,...,...,...,...,...,...,...,...,...,...
152518,169057,86,2025-04-06 04:16:07,"COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/0.3...",,,,217.0,4,California
152519,169061,80,2021-12-25 15:19:53,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),Georgia
152520,169062,72,2003-06-03 13:10:39,PNEUMOCOCCAL (HISTORICAL),,,,109.0,Not specified (no value),Florida
152521,169063,76,2004-02-10 03:09:08,"FLU,3 YRS (HISTORICAL)",,,Watkins Glen,88.0,Not specified (no value),New York


In [25]:
# Count how many rows each patient id has in the current frame (a count above 1 means several rows for that patient)
immunizations_train["Internalpatientid"].value_counts()

96019     26
78803     26
71510     20
100953    16
88394     14
          ..
8533       1
12631      1
18776      1
22874      1
20554      1
Name: Internalpatientid, Length: 125159, dtype: int64

# Combining the 'Age at immunization' and 'Immunization' columns with '_'

In [26]:
# Build a new 'immunization' column of the form "<age>_<immunization name>".
# The age is converted to text with astype(str) first, because values of different dtypes
# cannot be concatenated directly.
age_text = immunizations_train['Age at immunization'].astype(str)
immunization_name = immunizations_train['Immunization']
immunizations_train['immunization'] = age_text + '_' + immunization_name

In [27]:
# Inspect the frame with the new combined 'immunization' column
immunizations_train

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State,immunization
0,1,80,2023-11-01 05:41:33,"INFLUENZA VACCINE, QUADRIVALENT, ADJUVANTED",,,,205.0,Not specified (no value),Indiana,"80_INFLUENZA VACCINE, QUADRIVALENT, ADJUVANTED"
1,2,69,2023-07-15 21:41:08,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Texas,"69_INFLUENZA, UNSPECIFIED FORMULATION"
2,2,69,2023-07-16 12:26:13,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),Texas,"69_INFLUENZA, INJECTABLE, QUADRIVALENT, PRESER..."
3,3,82,2005-07-04 06:19:47,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Florida,"82_INFLUENZA, UNSPECIFIED FORMULATION"
4,4,84,2014-09-30 11:44:52,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Puerto Rico,"84_FLU,3 YRS (HISTORICAL)"
...,...,...,...,...,...,...,...,...,...,...,...
152518,169057,86,2025-04-06 04:16:07,"COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/0.3...",,,,217.0,4,California,"86_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/..."
152519,169061,80,2021-12-25 15:19:53,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),Georgia,"80_INFLUENZA, INJECTABLE, QUADRIVALENT, PRESER..."
152520,169062,72,2003-06-03 13:10:39,PNEUMOCOCCAL (HISTORICAL),,,,109.0,Not specified (no value),Florida,72_PNEUMOCOCCAL (HISTORICAL)
152521,169063,76,2004-02-10 03:09:08,"FLU,3 YRS (HISTORICAL)",,,Watkins Glen,88.0,Not specified (no value),New York,"76_FLU,3 YRS (HISTORICAL)"


# Dropping all unnecessary columns

In [28]:
# Drop the source columns that are no longer needed after building the combined
# 'immunization' text.
# errors='ignore' keeps the cell re-runnable (columns that were already removed are skipped
# instead of raising a KeyError), and the result is re-bound rather than mutated in place.
columns_to_drop = [
    'Age at immunization',
    'Immunization date',
    'Immunization',
    'Dose quantity',
    'Dose unit',
    'Administered elsewhere',
    'Cvx code',
    'Series doses',
    'State',
]
immunizations_train = immunizations_train.drop(columns=columns_to_drop, errors='ignore')

In [29]:
# NOTE(review): self-assignment is a no-op; `immunizations_train` already refers to the
# reduced frame, so this cell can be removed.
immunizations_train = immunizations_train

In [30]:
# Inspect the reduced frame (patient id plus the combined 'immunization' text)
immunizations_train

Unnamed: 0,Internalpatientid,immunization
0,1,"80_INFLUENZA VACCINE, QUADRIVALENT, ADJUVANTED"
1,2,"69_INFLUENZA, UNSPECIFIED FORMULATION"
2,2,"69_INFLUENZA, INJECTABLE, QUADRIVALENT, PRESER..."
3,3,"82_INFLUENZA, UNSPECIFIED FORMULATION"
4,4,"84_FLU,3 YRS (HISTORICAL)"
...,...,...
152518,169057,"86_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/..."
152519,169061,"80_INFLUENZA, INJECTABLE, QUADRIVALENT, PRESER..."
152520,169062,72_PNEUMOCOCCAL (HISTORICAL)
152521,169063,"76_FLU,3 YRS (HISTORICAL)"


In [31]:
def _join_non_null(values):
    """Join the non-null entries of one group's column into a single comma-separated string."""
    # dropna() removes missing values first so that only real entries reach ','.join
    return ','.join(values.dropna())


# Collapse the frame to one row per 'Internalpatientid', concatenating that patient's
# 'immunization' values.
# NOTE(review): the ',' separator also occurs inside the immunization names themselves
# (e.g. "INFLUENZA, UNSPECIFIED FORMULATION"), so the joined text cannot be split back
# unambiguously on commas.
df_grouped = immunizations_train.groupby('Internalpatientid').agg(_join_non_null)

# Turn the 'Internalpatientid' group index back into an ordinary column
df_grouped_imm = df_grouped.reset_index()
df_grouped_imm

Unnamed: 0,Internalpatientid,immunization
0,1,"80_INFLUENZA VACCINE, QUADRIVALENT, ADJUVANTED"
1,2,"69_INFLUENZA, UNSPECIFIED FORMULATION,69_INFLU..."
2,3,"82_INFLUENZA, UNSPECIFIED FORMULATION"
3,4,"84_FLU,3 YRS (HISTORICAL),84_INFLUENZA, SEASON..."
4,5,"76_FLU,3 YRS (HISTORICAL)"
...,...,...
125154,169057,"86_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/..."
125155,169061,"80_INFLUENZA, INJECTABLE, QUADRIVALENT, PRESER..."
125156,169062,72_PNEUMOCOCCAL (HISTORICAL)
125157,169063,"76_FLU,3 YRS (HISTORICAL)"


In [32]:
# Spot-check one aggregated value (the row at position 5) to see how the joined text looks
SAMPLE_POSITION = 5
column_value = df_grouped_imm['immunization'].iloc[SAMPLE_POSITION]
print(column_value)

69_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/0.3 ML DOSE, TRIS-SUCROSE (AGES 12+ YEARS)


In [33]:
# Write the per-patient immunization table to CSV.
# NOTE(review): to_csv is called without index=False, so the row index is written as an extra
# leading column; the destination is an absolute path specific to one compute instance.
output_csv_path = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_train/df_immunizations_train.csv'
df_grouped_imm.to_csv(output_csv_path)