In [1]:
from azureml.core import Workspace, Dataset

# Azure ML workspace coordinates.
# Storage account: Algorithmia; resource group: VChamp-Team3; workspace: vchamp-team3.
subscription_id = 'bcfe0c62-8ebe-4df0-a46d-1efcf8739a5b'
resource_group = 'VChamp-Team3'
workspace_name = 'vchamp-team3'

# Open a handle to the workspace, passing each coordinate by keyword.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [2]:
# Pick the datastore by its registered name.
# Alternative datastore name: 'data_team3_synthetic_quality_check'
train_datastore_name = 'data_team3_synthetic_train'
datastore = workspace.datastores[train_datastore_name]

In [3]:
# Define a tabular dataset over the demographics event CSV in the datastore.
demographics_csv = (datastore, 'demographics_event_train.csv')
dataset = Dataset.Tabular.from_delimited_files(path=[demographics_csv])

# To inspect the data, materialise it with: dataset.to_pandas_dataframe()

In [4]:
# Materialise the tabular dataset as a pandas DataFrame for the steps below.
demographic_event = dataset.to_pandas_dataframe()

In [5]:
# Confirm what kind of object the dataset was loaded into.
type(demographic_event)

pandas.core.frame.DataFrame

In [6]:
# Display the loaded frame (long frames are rendered truncated to head/tail rows).
demographic_event

Unnamed: 0,Column1,Internalpatientid,Age at update,Event date,Marital status,Ruca category
0,1,100028,78.990830,2020-08-24 03:33:32,Married,Urban
1,2,100032,91.357622,2008-02-07 05:03:27,Married,Rural
2,4,100046,72.961415,2003-09-14 09:32:12,Married,Urban
3,5,100071,75.221222,2019-04-25 08:33:42,Widowed,Urban
4,6,100091,80.955929,2022-02-09 09:24:20,Never married,Urban
...,...,...,...,...,...,...
133247,169059,99898,92.376313,2014-05-12 13:01:07,Married,Urban
133248,169060,9995,79.974938,2011-07-08 20:09:29,Married,Rural
133249,169061,99950,77.154395,2005-12-14 22:21:25,Married,Urban
133250,169062,9998,62.324623,2011-02-23 15:35:32,Divorced,Urban


In [7]:
# Remove 'Column1'; it is not used by any later step in this notebook.
# Note: re-running this cell on the already-trimmed frame raises KeyError.
demographic_event = demographic_event.drop(columns=['Column1'])
demographic_event.head()

Unnamed: 0,Internalpatientid,Age at update,Event date,Marital status,Ruca category
0,100028,78.99083,2020-08-24 03:33:32,Married,Urban
1,100032,91.357622,2008-02-07 05:03:27,Married,Rural
2,100046,72.961415,2003-09-14 09:32:12,Married,Urban
3,100071,75.221222,2019-04-25 08:33:42,Widowed,Urban
4,100091,80.955929,2022-02-09 09:24:20,Never married,Urban


In [8]:
# Remove 'Event date'; the later steps order records by 'Age at update' instead.
# Note: re-running this cell on the already-trimmed frame raises KeyError.
demographic_event = demographic_event.drop(columns=['Event date'])
demographic_event.head()

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category
0,100028,78.99083,Married,Urban
1,100032,91.357622,Married,Rural
2,100046,72.961415,Married,Urban
3,100071,75.221222,Widowed,Urban
4,100091,80.955929,Never married,Urban


In [9]:
import pandas as pd

In [10]:
# Count how many rows each patient id has (a count of 1 means the id is unique).
demographic_event['Internalpatientid'].value_counts()

Internalpatientid
100028    1
147590    1
147463    1
147461    1
147460    1
         ..
107923    1
107903    1
10787     1
10774     1
9999      1
Name: count, Length: 133252, dtype: int64

In [11]:
# Number of distinct values in each column.
demographic_event.nunique()

Internalpatientid    133252
Age at update        133251
Marital status            8
Ruca category             4
dtype: int64

In [12]:
# Order rows by patient id, then by age within each patient (both ascending).
sort_keys = ['Internalpatientid', 'Age at update']
sorted_df = demographic_event.sort_values(sort_keys)

In [13]:
# Inspect the sorted frame.
sorted_df

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category
53728,1,80.229505,Married,Urban
16960,2,69.529811,Divorced,Urban
52739,3,83.579763,Married,Urban
62525,4,84.691772,Married,Urban
34394,5,76.712091,Married,Urban
...,...,...,...,...
90380,169060,72.059713,Married,Rural
86147,169061,81.927523,Married,Urban
97541,169062,75.509632,Divorced,Urban
21647,169063,78.489673,Married,Rural


In [14]:
# Count the missing values in each column.
demographic_event.isna().sum()

Internalpatientid    0
Age at update        0
Marital status       0
Ruca category        0
dtype: int64

In [15]:
# Round 'Age at update' to two decimal places while keeping the column numeric.
# Formatting the values as strings (e.g. with "{:.2f}".format) would make any
# later max()/sort on this column compare text lexicographically, so "99.50"
# would outrank "100.20" and the wrong record could be picked per patient.
# astype(float) also makes the cell safe to re-run on an already-converted column.
sorted_df["Age at update"] = sorted_df["Age at update"].astype(float).round(2)
sorted_df

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category
53728,1,80.23,Married,Urban
16960,2,69.53,Divorced,Urban
52739,3,83.58,Married,Urban
62525,4,84.69,Married,Urban
34394,5,76.71,Married,Urban
...,...,...,...,...
90380,169060,72.06,Married,Rural
86147,169061,81.93,Married,Urban
97541,169062,75.51,Divorced,Urban
21647,169063,78.49,Married,Rural


In [16]:
# Keep, for every patient, the single row with the highest 'Age at update'.
#
# The comparison is done on a numeric view of the column: if the ages are held
# as text, a plain max() compares them lexicographically ("99.50" > "100.20")
# and selects the wrong row.
age_numeric = pd.to_numeric(sorted_df['Age at update'])

# idxmax returns one index label per patient (the first row reaching the max),
# so ties on the maximum age cannot produce duplicate rows for a patient, which
# merging the per-patient maxima back onto the frame would allow.
latest_row_labels = age_numeric.groupby(sorted_df['Internalpatientid']).idxmax()

# Rows come out ordered by patient id; renumber them 0..n-1.
result_df = sorted_df.loc[latest_row_labels].reset_index(drop=True)

result_df

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category
0,1,80.23,Married,Urban
1,2,69.53,Divorced,Urban
2,3,83.58,Married,Urban
3,4,84.69,Married,Urban
4,5,76.71,Married,Urban
...,...,...,...,...
133247,169060,72.06,Married,Rural
133248,169061,81.93,Married,Urban
133249,169062,75.51,Divorced,Urban
133250,169063,78.49,Married,Rural


In [17]:
# Make sure 'Age at update' is a float column, then round it to the nearest
# whole number (round() with no argument keeps zero decimal places).
result_df["Age at update"] = result_df["Age at update"].astype(float).round()

In [18]:
# Store the already-rounded ages as integers so they print without a ".0" suffix.
rounded_age = result_df['Age at update']
result_df['Age at update'] = rounded_age.astype(int)

In [19]:
# Count rows per patient id in result_df (a count of 1 means one record per patient).
result_df['Internalpatientid'].value_counts()

Internalpatientid
1         1
112592    1
112590    1
112589    1
112588    1
         ..
56240     1
56237     1
56236     1
56235     1
169064    1
Name: count, Length: 133252, dtype: int64

In [20]:
# Build a combined "<age>_<marital status>" label for each patient, e.g. "80_Married".
age_text = result_df['Age at update'].astype(str)
marital_text = result_df['Marital status']
result_df['demo_event_marital_status'] = age_text + '_' + marital_text
result_df

Unnamed: 0,Internalpatientid,Age at update,Marital status,Ruca category,demo_event_marital_status
0,1,80,Married,Urban,80_Married
1,2,70,Divorced,Urban,70_Divorced
2,3,84,Married,Urban,84_Married
3,4,85,Married,Urban,85_Married
4,5,77,Married,Urban,77_Married
...,...,...,...,...,...
133247,169060,72,Married,Rural,72_Married
133248,169061,82,Married,Urban,82_Married
133249,169062,76,Divorced,Urban,76_Divorced
133250,169063,78,Married,Rural,78_Married


In [21]:
# Keep only the patient id and the combined label; the columns the label was
# built from (and 'Ruca category') are no longer needed.
columns_to_remove = ['Age at update', 'Marital status', 'Ruca category']
result_df = result_df.drop(columns=columns_to_remove)
result_df

Unnamed: 0,Internalpatientid,demo_event_marital_status
0,1,80_Married
1,2,70_Divorced
2,3,84_Married
3,4,85_Married
4,5,77_Married
...,...,...
133247,169060,72_Married
133248,169061,82_Married
133249,169062,76_Divorced
133250,169063,78_Married


In [22]:
# Final check of rows per patient id before the frame is written out.
result_df['Internalpatientid'].value_counts()

Internalpatientid
1         1
112592    1
112590    1
112589    1
112588    1
         ..
56240     1
56237     1
56236     1
56235     1
169064    1
Name: count, Length: 133252, dtype: int64

In [24]:
# Write the per-patient result to the team's training output directory as CSV.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default; confirm whether downstream readers expect it before passing index=False.
output_csv_path = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/team3-nishagpu2/code/Users/900379/Output_files_train/df_demographic_event_train.csv'
result_df.to_csv(output_csv_path)