In [1]:
from azureml.core import Workspace, Dataset

# Azure ML workspace coordinates.
# Storage account: Algorithmia; resource group: VChamp-Team3; workspace: vchamp-team3.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'
resource_group = 'VChamp-Team3'
workspace_name = 'vchamp-team3'

# Workspace handle built from the three identifiers above.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Datastore to read from, looked up by name in the workspace.
# Alternative datastore name kept for reference: 'data_team3_synthetic_quality_check'.
datastore_name = 'data_team3_synthetic_test'
datastore = workspace.datastores[datastore_name]

In [3]:
# Build a tabular dataset from the demographics-event CSV held in the datastore.
csv_location = (datastore, 'demographics_event_test.csv')
dataset = Dataset.Tabular.from_delimited_files(path=[csv_location])

In [4]:
# Materialise the tabular dataset as a pandas DataFrame.
demographic_event = dataset.to_pandas_dataframe()

In [5]:
# Display the loaded frame to inspect its columns and row count.
demographic_event

Unnamed: 0,Column1,Internalpatientid,Age at update,Event date,Marital status,Ruca category
0,0,100017,85.721080,2009-05-28 01:18:36,Married,Rural
1,3,100041,97.746361,2023-01-06 07:04:39,Divorced,Urban
2,9,100181,71.684971,2003-01-03 20:34:04,Married,Urban
3,10,100186,69.808196,2023-10-24 21:29:46,Married,Urban
4,12,100242,84.905314,2024-03-29 21:24:39,Divorced,Rural
...,...,...,...,...,...,...
34808,169038,99609,89.901179,2017-12-24 21:08:27,Widowed,Rural
34809,169042,99674,62.751955,2022-10-17 20:08:58,Divorced,Urban
34810,169046,99713,78.949573,2002-02-08 01:39:29,Widowed,Rural
34811,169051,99792,67.020799,2013-06-26 02:27:08,Married,Urban


In [6]:
# Remove the 'Column1' column, then preview the first rows.
demographic_event = demographic_event.drop(columns=['Column1'])
demographic_event.head()

Unnamed: 0,Internalpatientid,Age at update,Event date,Marital status,Ruca category
0,100017,85.72108,2009-05-28 01:18:36,Married,Rural
1,100041,97.746361,2023-01-06 07:04:39,Divorced,Urban
2,100181,71.684971,2003-01-03 20:34:04,Married,Urban
3,100186,69.808196,2023-10-24 21:29:46,Married,Urban
4,100242,84.905314,2024-03-29 21:24:39,Divorced,Rural


In [7]:
# Remove the 'Event date' column, then preview the first rows.
demographic_event = demographic_event.drop(columns=['Event date'])
demographic_event.head()

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category
0,100017,85.72108,Married,Rural
1,100041,97.746361,Divorced,Urban
2,100181,71.684971,Married,Urban
3,100186,69.808196,Married,Urban
4,100242,84.905314,Divorced,Rural


In [8]:
import pandas as pd

In [9]:
# Count how many rows each patient id has (reveals whether any id repeats).
demographic_event['Internalpatientid'].value_counts()

Internalpatientid
100017    1
142509    1
143228    1
14320     1
143156    1
         ..
83128     1
83118     1
83106     1
83015     1
99997     1
Name: count, Length: 34813, dtype: int64

In [10]:
# Order the rows by patient id, and by age within each patient.
sort_keys = ['Internalpatientid', 'Age at update']
sorted_df = demographic_event.sort_values(by=sort_keys)
sorted_df

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category
21592,6,89.589028,Married,Rural
30368,7,74.373609,Married,Rural
10462,9,51.843570,Never married,Rural
19145,12,74.143836,Married,Rural
13179,17,82.419183,Widowed,Urban
...,...,...,...,...
25537,169037,88.082373,Married,Urban
23891,169045,98.707257,Never married,Rural
2806,169058,99.673816,Never married,Rural
25538,169059,91.161149,Widowed,Rural


In [11]:
# Number of missing values in each column.
demographic_event.isna().sum()

Internalpatientid    0
Age at update        0
Marital status       0
Ruca category        0
dtype: int64

In [12]:
# Round 'Age at update' to two decimal places while keeping the column numeric.
# Formatting the values as strings (e.g. with "{:.2f}".format) would make any
# later max()/sort compare text, where '99.67' ranks above '100.05', and would
# raise a ValueError if this cell were run a second time.
sorted_df["Age at update"] = sorted_df["Age at update"].round(2)
sorted_df

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category
21592,6,89.59,Married,Rural
30368,7,74.37,Married,Rural
10462,9,51.84,Never married,Rural
19145,12,74.14,Married,Rural
13179,17,82.42,Widowed,Urban
...,...,...,...,...
25537,169037,88.08,Married,Urban
23891,169045,98.71,Never married,Rural
2806,169058,99.67,Never married,Rural
25538,169059,91.16,Widowed,Rural


In [13]:
# Select, for every patient, the row(s) carrying that patient's highest age.
#
# The comparison is done on numeric values. If 'Age at update' holds formatted
# strings, a plain max() on the column compares text, so '99.67' would be
# ranked above '100.05' and the wrong row would be kept.
age_numeric = pd.to_numeric(sorted_df['Age at update'])
ages_by_patient = age_numeric.groupby(sorted_df['Internalpatientid'])

# Highest age per patient, one row per 'Internalpatientid'.
max_ages = ages_by_patient.max().reset_index()

# Keep the rows whose age equals the per-patient maximum. Tied rows are all
# kept, row order follows sorted_df, and the index is renumbered from 0,
# matching what an inner merge against max_ages would return.
is_max_age = age_numeric.eq(ages_by_patient.transform('max'))
result_df = sorted_df[is_max_age].reset_index(drop=True)

result_df

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category
0,6,89.59,Married,Rural
1,7,74.37,Married,Rural
2,9,51.84,Never married,Rural
3,12,74.14,Married,Rural
4,17,82.42,Widowed,Urban
...,...,...,...,...
34808,169037,88.08,Married,Urban
34809,169045,98.71,Never married,Rural
34810,169058,99.67,Never married,Rural
34811,169059,91.16,Widowed,Rural


In [14]:
# Make sure 'Age at update' is a float column, then round each value to the
# nearest whole number (round() with no argument keeps zero decimal places).
result_df["Age at update"] = result_df["Age at update"].astype(float).round()

In [15]:
# Cast 'Age at update' to an integer dtype.
result_df['Age at update'] = result_df['Age at update'].astype('int')

In [16]:
# Build the combined feature '<age>_<marital status>', e.g. '90_Married'.
age_label = result_df['Age at update'].astype(str)
result_df['demo_event_marital_status'] = age_label + '_' + result_df['Marital status']
result_df

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category,demo_event_marital_status
0,6,90,Married,Rural,90_Married
1,7,74,Married,Rural,74_Married
2,9,52,Never married,Rural,52_Never married
3,12,74,Married,Rural,74_Married
4,17,82,Widowed,Urban,82_Widowed
...,...,...,...,...,...
34808,169037,88,Married,Urban,88_Married
34809,169045,99,Never married,Rural,99_Never married
34810,169058,100,Never married,Rural,100_Never married
34811,169059,91,Widowed,Rural,91_Widowed


In [17]:
# Drop the columns that are no longer needed, modifying result_df in place.
columns_to_remove = ['Age at update', 'Marital status', 'Ruca category']
result_df.drop(columns=columns_to_remove, inplace=True)
result_df

Unnamed: 0,Internalpatientid,demo_event_marital_status
0,6,90_Married
1,7,74_Married
2,9,52_Never married
3,12,74_Married
4,17,82_Widowed
...,...,...
34808,169037,88_Married
34809,169045,99_Never married
34810,169058,100_Never married
34811,169059,91_Widowed


In [18]:
# Save the result as a CSV file in the test output directory.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; confirm downstream readers expect it before changing that.
output_csv = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team3-nishagpu2/code/Users/900379/Output_files_test/df_demographic_event_test.csv'
result_df.to_csv(output_csv)