# **Immunization Notebook**

In [1]:
# Connect to the Azure ML workspace used by this notebook.
# `Dataset` is imported alongside `Workspace` because a later cell builds a
# TabularDataset via Dataset.Tabular.from_delimited_files().
from azureml.core import Workspace, Dataset

# Workspace coordinates (the subscription id can be found in the launch studio page)
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'
resource_group = 'VChamp-Team3'   # resource group name
workspace_name = 'vchamp-team3'   # workspace name

# Storage account: Algorithmia, resource group: VChamp-Team3, workspace: vchamp-team3.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Look up the datastore registered under the name 'data_team3_synthetic_test' in the workspace
datastore = workspace.datastores['data_team3_synthetic_test'] 

In [4]:
# Create a TabularDataset that represents the delimited file 'immunization_test.csv'
# stored in the datastore (from_delimited_files handles CSV/TSV-style files).
immunization_csv = [(datastore, 'immunization_test.csv')]
dataset = Dataset.Tabular.from_delimited_files(path=immunization_csv)

In [5]:
# Materialise the Azure TabularDataset as a pandas DataFrame so the rest of the
# notebook can use the regular pandas API on it.
immunizations_test = dataset.to_pandas_dataframe()

In [6]:
# Show the freshly loaded frame (pandas abbreviates large frames to their first and last rows)
immunizations_test

Unnamed: 0,Column1,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,0,100,52.898575,2014-08-21 10:17:59,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),New York
1,1,100,52.898575,2014-08-21 10:17:59,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),New York
2,2,100,52.898575,2014-08-21 10:17:59,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),New York
3,28,100041,88.331313,2013-08-05 02:16:33,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Minnesota
4,29,100041,88.331313,2013-08-05 02:16:33,"NOVEL INFLUENZA-H1N1-09, ALL FORMULATIONS",,,,128.0,Not specified (no value),Minnesota
...,...,...,...,...,...,...,...,...,...,...,...
486629,2347834,99958,79.600992,2005-09-15 03:54:19,"INFLUENZA, WHOLE",,,,16.0,Not specified (no value),New York
486630,2347835,99958,79.600992,2005-09-15 03:54:19,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),New York
486631,2347849,99979,57.915172,2005-04-04 23:53:11,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),North Carolina
486632,2347859,99997,85.682957,2014-10-28 02:53:46,ZOSTER LIVE,,,,121.0,Not specified (no value),North Carolina


# Importing libraries

In [7]:
import numpy as np                # Multi-dimensional array object
import pandas as pd               # Data manipulation
import matplotlib.pyplot as plt   # Data visualization
import seaborn as sns             # Data visualization

# Allows the use of display() for DataFrames
from IPython.display import display 

# Suppress every warning raised during code execution (the 'ignore' action hides them all)
import warnings
warnings.filterwarnings('ignore')

# Raise the notebook's 'max_buffer_size' configuration value to 7294967296
# (intended to allow more memory for large payloads)
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'max_buffer_size': 7294967296})

# Display up to 1163 rows before pandas truncates a DataFrame's output
pd.set_option('display.max_rows', 1163)

# Removing the unneeded 'Column1' column

In [8]:
# Discard the 'Column1' column, which is not needed for the analysis
immunizations_test = immunizations_test.drop(columns=['Column1'])

In [9]:
# No-op: assigning the name to itself changes nothing, so this cell can be deleted safely.
immunizations_test = immunizations_test

In [10]:
# Confirm that 'Column1' no longer appears among the columns
immunizations_test

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,100,52.898575,2014-08-21 10:17:59,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Not specified (no value),New York
1,100,52.898575,2014-08-21 10:17:59,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),New York
2,100,52.898575,2014-08-21 10:17:59,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),New York
3,100041,88.331313,2013-08-05 02:16:33,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Minnesota
4,100041,88.331313,2013-08-05 02:16:33,"NOVEL INFLUENZA-H1N1-09, ALL FORMULATIONS",,,,128.0,Not specified (no value),Minnesota
...,...,...,...,...,...,...,...,...,...,...
486629,99958,79.600992,2005-09-15 03:54:19,"INFLUENZA, WHOLE",,,,16.0,Not specified (no value),New York
486630,99958,79.600992,2005-09-15 03:54:19,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),New York
486631,99979,57.915172,2005-04-04 23:53:11,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),North Carolina
486632,99997,85.682957,2014-10-28 02:53:46,ZOSTER LIVE,,,,121.0,Not specified (no value),North Carolina


# Sorting by patient ID and age

In [11]:
# Order the rows by patient id first, then by age at immunization (both ascending)
sort_keys = ["Internalpatientid", "Age at immunization"]
immunizations_test = immunizations_test.sort_values(by=sort_keys)

In [12]:
# Verify the ordering: rows should now run by patient id, then by increasing age
immunizations_test

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
307933,6,74.147397,2000-10-29 20:58:06,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Texas
448383,6,74.228196,2000-11-28 09:43:53,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Texas
312369,6,74.842469,2001-07-10 22:08:13,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Complete,Texas
312370,6,74.842469,2001-07-10 22:08:13,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Complete,Texas
312371,6,74.842469,2001-07-10 22:08:13,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Texas
...,...,...,...,...,...,...,...,...,...,...
389580,169065,50.864468,2008-12-25 23:06:38,"INFLUENZA VIRUS VACCINE, 3YR AND OLDER (HISTOR...",,,,88.0,Not specified (no value),California
360795,169065,50.981669,2009-02-06 19:12:02,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
200927,169065,51.861766,2009-12-25 11:24:29,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
192639,169065,52.864184,2010-12-26 20:37:16,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona


# Checking for missing values

In [13]:
# Count the missing (NaN) values in every column.
# NOTE(review): the printed label says "Training set", but this frame was loaded from
# 'immunization_test.csv' — confirm whether the label should read "Test set".
print("Training set missing values:\n", immunizations_test.isna().sum())

Training set missing values:
 Internalpatientid              0
Age at immunization            0
Immunization date              0
Immunization                   4
Dose quantity             483403
Dose unit                 483404
Administered elsewhere    417411
Cvx code                    4768
Series doses                   0
State                          0
dtype: int64


# Removing rows with a missing value in the "Immunization" column

In [14]:
# Keep only the rows that actually name an immunization (drop NaN in "Immunization")
immunizations_test = immunizations_test.dropna(subset=["Immunization"])

In [15]:
# No-op: assigning the name to itself changes nothing, so this cell can be deleted safely.
immunizations_test = immunizations_test

In [16]:
# Inspect the frame after removing rows without an immunization name
immunizations_test

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
307933,6,74.147397,2000-10-29 20:58:06,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Texas
448383,6,74.228196,2000-11-28 09:43:53,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Texas
312369,6,74.842469,2001-07-10 22:08:13,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Complete,Texas
312370,6,74.842469,2001-07-10 22:08:13,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Complete,Texas
312371,6,74.842469,2001-07-10 22:08:13,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Texas
...,...,...,...,...,...,...,...,...,...,...
389580,169065,50.864468,2008-12-25 23:06:38,"INFLUENZA VIRUS VACCINE, 3YR AND OLDER (HISTOR...",,,,88.0,Not specified (no value),California
360795,169065,50.981669,2009-02-06 19:12:02,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
200927,169065,51.861766,2009-12-25 11:24:29,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
192639,169065,52.864184,2010-12-26 20:37:16,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona


In [17]:
# Re-count missing values per column; "Immunization" should now report 0.
# NOTE(review): the printed label says "Training set", but this frame was loaded from
# 'immunization_test.csv' — confirm whether the label should read "Test set".
print("Training set missing values:\n", immunizations_test.isna().sum())

Training set missing values:
 Internalpatientid              0
Age at immunization            0
Immunization date              0
Immunization                   0
Dose quantity             483399
Dose unit                 483400
Administered elsewhere    417407
Cvx code                    4765
Series doses                   0
State                          0
dtype: int64


# Keeping two decimal places in the age column

In [18]:
# Render every age as text with exactly two decimal places (e.g. 74.147397 -> "74.15").
# The column holds strings after this step.
immunizations_test["Age at immunization"] = immunizations_test["Age at immunization"].apply(
    lambda age: "{:.2f}".format(age)
)

In [19]:
# Ages should now show two decimal places
immunizations_test

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
307933,6,74.15,2000-10-29 20:58:06,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Texas
448383,6,74.23,2000-11-28 09:43:53,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Texas
312369,6,74.84,2001-07-10 22:08:13,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Complete,Texas
312370,6,74.84,2001-07-10 22:08:13,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Complete,Texas
312371,6,74.84,2001-07-10 22:08:13,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Texas
...,...,...,...,...,...,...,...,...,...,...
389580,169065,50.86,2008-12-25 23:06:38,"INFLUENZA VIRUS VACCINE, 3YR AND OLDER (HISTOR...",,,,88.0,Not specified (no value),California
360795,169065,50.98,2009-02-06 19:12:02,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
200927,169065,51.86,2009-12-25 11:24:29,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
192639,169065,52.86,2010-12-26 20:37:16,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona


In [20]:
# The formatting step left the ages as strings (object dtype); cast them back to float
# so that the later max() and rounding operate on numbers.
immunizations_test["Age at immunization"] = immunizations_test["Age at immunization"].astype(float)

In [21]:
# Same values as before, now stored as floats again
immunizations_test

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
307933,6,74.15,2000-10-29 20:58:06,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Texas
448383,6,74.23,2000-11-28 09:43:53,TD(ADULT) UNSPECIFIED FORMULATION,,,,139.0,Not specified (no value),Texas
312369,6,74.84,2001-07-10 22:08:13,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,,,109.0,Complete,Texas
312370,6,74.84,2001-07-10 22:08:13,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Complete,Texas
312371,6,74.84,2001-07-10 22:08:13,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Texas
...,...,...,...,...,...,...,...,...,...,...
389580,169065,50.86,2008-12-25 23:06:38,"INFLUENZA VIRUS VACCINE, 3YR AND OLDER (HISTOR...",,,,88.0,Not specified (no value),California
360795,169065,50.98,2009-02-06 19:12:02,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
200927,169065,51.86,2009-12-25 11:24:29,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona
192639,169065,52.86,2010-12-26 20:37:16,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona


# Keeping each patient's records at their maximum age

In [22]:
# Highest age at immunization recorded for each patient (i.e. the latest record)
max_ages = (
    immunizations_test
    .groupby('Internalpatientid')['Age at immunization']
    .max()
    .reset_index()
)

# Inner-join back onto the full frame so only the rows at each patient's maximum age
# survive; a patient keeps several rows when several immunizations share that age.
immunizations_test = immunizations_test.merge(
    max_ages,
    how='inner',
    on=['Internalpatientid', 'Age at immunization'],
)

immunizations_test

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,6,87.82,2014-07-04 12:50:01,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Texas
1,7,72.70,2019-03-21 20:17:01,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),Wisconsin
2,12,71.47,2001-11-29 07:13:25,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Colorado
3,17,82.22,2004-05-13 14:18:11,INFLUENZA (HISTORICAL),,,,88.0,Not specified (no value),Maryland
4,17,82.22,2004-05-13 14:18:11,"INFLUENZA, WHOLE",,,,16.0,Not specified (no value),Maryland
...,...,...,...,...,...,...,...,...,...,...
39967,169024,71.44,2010-12-01 00:05:07,INFLUENZA (HISTORICAL),,,,88.0,Not specified (no value),Alabama
39968,169037,87.41,2017-06-23 16:37:46,ZOSTER RECOMBINANT,,,,187.0,2,Illinois
39969,169045,97.36,2021-07-25 21:32:52,"COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/0.3...",,,,208.0,2,Minnesota
39970,169059,90.22,2013-06-28 10:41:26,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona


# Rounding the age column to whole years

In [23]:
# Round each age to the nearest whole year and store the result as integers
immunizations_test['Age at immunization'] = (
    immunizations_test['Age at immunization'].round().astype('int64')
)

In [24]:
# Ages should now be whole numbers
immunizations_test

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State
0,6,88,2014-07-04 12:50:01,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Texas
1,7,73,2019-03-21 20:17:01,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),Wisconsin
2,12,71,2001-11-29 07:13:25,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Colorado
3,17,82,2004-05-13 14:18:11,INFLUENZA (HISTORICAL),,,,88.0,Not specified (no value),Maryland
4,17,82,2004-05-13 14:18:11,"INFLUENZA, WHOLE",,,,16.0,Not specified (no value),Maryland
...,...,...,...,...,...,...,...,...,...,...
39967,169024,71,2010-12-01 00:05:07,INFLUENZA (HISTORICAL),,,,88.0,Not specified (no value),Alabama
39968,169037,87,2017-06-23 16:37:46,ZOSTER RECOMBINANT,,,,187.0,2,Illinois
39969,169045,97,2021-07-25 21:32:52,"COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/0.3...",,,,208.0,2,Minnesota
39970,169059,90,2013-06-28 10:41:26,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona


In [25]:
# Rows remaining per patient; a count above 1 means several immunizations share that patient's maximum age
immunizations_test["Internalpatientid"].value_counts()

35686     22
73986     14
79654     13
118861    13
82946     13
          ..
150141     1
164343     1
154239     1
80714      1
131072     1
Name: Internalpatientid, Length: 32784, dtype: int64

# Concatenating the 'Age at immunization' and 'Immunization' columns with '_'

In [26]:
# Build a new lowercase 'immunization' column of the form "<age>_<immunization name>".
# The integer age is converted with astype(str) first because values of different
# dtypes cannot be concatenated with '+'.
age_labels = immunizations_test['Age at immunization'].astype(str)
vaccine_names = immunizations_test['Immunization']
immunizations_test['immunization'] = age_labels + '_' + vaccine_names

In [27]:
# Check the new 'immunization' column at the right-hand side of the frame
immunizations_test

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Dose quantity,Dose unit,Administered elsewhere,Cvx code,Series doses,State,immunization
0,6,88,2014-07-04 12:50:01,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Texas,"88_FLU,3 YRS (HISTORICAL)"
1,7,73,2019-03-21 20:17:01,"INFLUENZA, INJECTABLE, QUADRIVALENT, PRESERVAT...",,,,150.0,Not specified (no value),Wisconsin,"73_INFLUENZA, INJECTABLE, QUADRIVALENT, PRESER..."
2,12,71,2001-11-29 07:13:25,"FLU,3 YRS (HISTORICAL)",,,,88.0,Not specified (no value),Colorado,"71_FLU,3 YRS (HISTORICAL)"
3,17,82,2004-05-13 14:18:11,INFLUENZA (HISTORICAL),,,,88.0,Not specified (no value),Maryland,82_INFLUENZA (HISTORICAL)
4,17,82,2004-05-13 14:18:11,"INFLUENZA, WHOLE",,,,16.0,Not specified (no value),Maryland,"82_INFLUENZA, WHOLE"
...,...,...,...,...,...,...,...,...,...,...,...
39967,169024,71,2010-12-01 00:05:07,INFLUENZA (HISTORICAL),,,,88.0,Not specified (no value),Alabama,71_INFLUENZA (HISTORICAL)
39968,169037,87,2017-06-23 16:37:46,ZOSTER RECOMBINANT,,,,187.0,2,Illinois,87_ZOSTER RECOMBINANT
39969,169045,97,2021-07-25 21:32:52,"COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/0.3...",,,,208.0,2,Minnesota,"97_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/..."
39970,169059,90,2013-06-28 10:41:26,"INFLUENZA, UNSPECIFIED FORMULATION",,,,88.0,Not specified (no value),Arizona,"90_INFLUENZA, UNSPECIFIED FORMULATION"


# Dropping all unnecessary columns

In [31]:
# Drop every column except the patient id and the combined 'immunization' label
unused_columns = [
    'Age at immunization',
    'Immunization date',
    'Immunization',
    'Dose quantity',
    'Dose unit',
    'Administered elsewhere',
    'Cvx code',
    'Series doses',
    'State',
]
immunizations_test = immunizations_test.drop(columns=unused_columns)

In [32]:
# No-op: assigning the name to itself changes nothing, so this cell can be deleted safely.
immunizations_test = immunizations_test

In [33]:
# Only 'Internalpatientid' and 'immunization' should remain
immunizations_test

Unnamed: 0,Internalpatientid,immunization
0,6,"88_FLU,3 YRS (HISTORICAL)"
1,7,"73_INFLUENZA, INJECTABLE, QUADRIVALENT, PRESER..."
2,12,"71_FLU,3 YRS (HISTORICAL)"
3,17,82_INFLUENZA (HISTORICAL)
4,17,"82_INFLUENZA, WHOLE"
...,...,...
39967,169024,71_INFLUENZA (HISTORICAL)
39968,169037,87_ZOSTER RECOMBINANT
39969,169045,"97_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/..."
39970,169059,"90_INFLUENZA, UNSPECIFIED FORMULATION"


In [34]:
# Collapse the frame to one row per patient by joining that patient's 'immunization'
# labels into a single comma-separated string. Missing values are dropped before the
# join so that only non-null labels end up in the result.
# NOTE(review): immunization names can themselves contain commas
# (e.g. "82_INFLUENZA, WHOLE"), so the joined string cannot be split back on ','
# unambiguously — confirm that downstream consumers do not rely on splitting it.
df_grouped = (
    immunizations_test
    .groupby('Internalpatientid')
    .agg(lambda labels: ','.join(labels.dropna()))
)

# Turn the 'Internalpatientid' group index back into a regular column and show the result
df_grouped_imm = df_grouped.reset_index()
df_grouped_imm

Unnamed: 0,Internalpatientid,immunization
0,6,"88_FLU,3 YRS (HISTORICAL)"
1,7,"73_INFLUENZA, INJECTABLE, QUADRIVALENT, PRESER..."
2,12,"71_FLU,3 YRS (HISTORICAL)"
3,17,"82_INFLUENZA (HISTORICAL),82_INFLUENZA, WHOLE"
4,22,"55_PNEUMOCOCCAL, UNSPECIFIED FORMULATION"
...,...,...
32779,169024,71_INFLUENZA (HISTORICAL)
32780,169037,87_ZOSTER RECOMBINANT
32781,169045,"97_COVID-19 (PFIZER), MRNA, LNP-S, PF, 30 MCG/..."
32782,169059,"90_INFLUENZA, UNSPECIFIED FORMULATION"


In [35]:
# Spot-check the full (untruncated) combined string of the row at position 5
column_value = df_grouped_imm['immunization'].iloc[5]
print(column_value)

70_COVID-19 (MODERNA), MRNA, LNP-S, PF, 100 MCG/0.5ML DOSE OR 50 MCG/0.25ML DOSE


# Saving the file

In [36]:
# Write the one-row-per-patient table to a CSV file at a hard-coded absolute path.
# NOTE(review): to_csv is called with its default index setting, so the row index is
# written as an extra leading unnamed column — confirm downstream readers expect that,
# or pass index=False.
# NOTE(review): the path is absolute and user-specific; consider a configurable output directory.
df_grouped_imm.to_csv('/mnt/batch/tasks/shared/LS_root/mounts/clusters/team-3-susai-gpu2/code/Users/900379/Output_files_test/df_immunizations_test.csv')