# Log preparation

## Import packages

In [342]:
import pandas as pd

## Import datasets

In [375]:
## Log data
# TODO: set this to the real event-log CSV. The original cell contained an
# unfilled `##insertdataset` placeholder inside the call, which commented out
# the rest of the line and left the parenthesis unclosed (a syntax error).
DIAGNOSES_LOG_PATH = 'Data/INSERT_DATASET.csv'  # placeholder -- replace before running
# index_col=0: the first CSV column is used as the row index.
diagnoses_data = pd.read_csv(DIAGNOSES_LOG_PATH, index_col = 0)

In [376]:
# Number of rows in the raw log
diagnoses_data.shape[0]

7948

In [377]:
## Conversion table
# Semicolon-delimited lookup table; later cells use its
# 'Activity code' and 'Producer code' columns to build the join key.
conversion_table_path = 'Data/conversion_table.csv'
conversion = pd.read_csv(conversion_table_path, sep=';')

## Add the new activity descriptions to the set

In [378]:
## Create a key for both frames
# The key combines activity code and producer code. Plain `+` is ambiguous:
# numeric codes would be summed (1 + 2 == 2 + 1) and string codes would be
# glued together without a boundary ('ab' + 'c' == 'a' + 'bc'), so distinct
# code pairs could end up sharing a key. Casting to text and inserting a
# separator keeps every (activity, producer) pair distinct.
# NOTE(review): assumes each code renders to the same text in both frames
# (e.g. not int in one frame and float in the other) -- confirm on the data.
key_separator = '|'
conversion['key'] = (
    conversion['Activity code'].astype(str)
    + key_separator
    + conversion['Producer code'].astype(str)
)
diagnoses_data['key'] = (
    diagnoses_data['Activity code'].astype(str)
    + key_separator
    + diagnoses_data['Producer code'].astype(str)
)

In [379]:
## Remove the activity code and producer code columns
# They are no longer needed once the combined 'key' column exists.
columns = ['Activity code', 'Producer code']
conversion = conversion.drop(columns=columns)
diagnoses_data = diagnoses_data.drop(columns=columns)

In [380]:
## Merge the frames together
# Left join: every log row is kept; the conversion columns are attached
# wherever the key matches.
diagnoses_data = pd.merge(diagnoses_data, conversion, how='left', on='key')

In [381]:
## Drop the key column
# The helper key has served its purpose once the merge is done.
helper_columns = ['key']
conversion = conversion.drop(columns=helper_columns)
diagnoses_data = diagnoses_data.drop(columns=helper_columns)

In [382]:
# Preview the first two rows of the merged log
diagnoses_data.iloc[:2]

Unnamed: 0,case:concept:name,event:concept:name,time:timestamp,diagnosis,new:event:concept:name
0,177,vervolgconsult poliklinisch,2005-04-13T01:00:00+02:00,"{'Plaveiselcelca, cervix st Ib', 'maligniteit ...",vervolgconsult poliklinisch
1,177,administratief tarief - eerste pol,2005-04-13T01:00:00+02:00,"{'Plaveiselcelca, cervix st Ib', 'maligniteit ...",administratief


In [383]:
# Number of distinct original activity names (missing values not counted)
old_event_names = diagnoses_data['event:concept:name']
old_event_names.nunique(dropna=True)

214

In [384]:
# Number of distinct converted activity names (missing values not counted)
new_event_names = diagnoses_data['new:event:concept:name']
new_event_names.nunique(dropna=True)

63

In [385]:
## Delete the old event name
# Keep only the case id, timestamp, diagnosis and converted activity name.
kept_columns = ['case:concept:name', 'time:timestamp', 'diagnosis', 'new:event:concept:name']
diagnoses_data = diagnoses_data.loc[:, kept_columns]

In [386]:
## Keep one instance per day for every activity
# Rows identical in every remaining column are collapsed to their first
# occurrence. This gives "one per day" only if timestamps are day-granular
# (presumably they are -- verify against the log).
is_repeat = diagnoses_data.duplicated(keep='first')
diagnoses_data = diagnoses_data.loc[~is_repeat]

In [387]:
## Brachytherapie and teletherapie are both radiotherapy, so relabel them
radiotherapy_labels = ['brachytherapie', 'teletherapie']
is_radiotherapy = diagnoses_data['new:event:concept:name'].isin(radiotherapy_labels)
diagnoses_data.loc[is_radiotherapy, 'new:event:concept:name'] = 'radiotherapie'
# Relabelling can make previously distinct rows identical (e.g. a case with
# both therapies on the same timestamp), so de-duplicate again to keep a
# single instance of each activity per timestamp.
diagnoses_data = diagnoses_data.drop_duplicates()

## Remove activity 'laboratoriumverrichtingen'

In [388]:
# Drop every 'laboratoriumverrichtingen' event; rows with a missing activity
# name are kept, exactly as with a `!=` comparison.
is_lab_work = diagnoses_data['new:event:concept:name'].eq('laboratoriumverrichtingen')
diagnoses_data = diagnoses_data.loc[~is_lab_work]

## Remove cases without an 'eerste consult' activity

In [389]:
# Number of 'eerste consult' events left in the log
int(diagnoses_data['new:event:concept:name'].eq('eerste consult').sum())

105

In [390]:
## Creating a list with all cases that contain the activity first consult
# Vectorised replacement for the row-by-row iterrows() loop: select the
# 'eerste consult' rows with a boolean mask and read their case ids.
# As before, the list holds one entry per matching event, so a case id can
# appear more than once.
is_first_consult = diagnoses_data['new:event:concept:name'] == 'eerste consult'
cases = diagnoses_data.loc[is_first_consult, 'case:concept:name'].tolist()

In [391]:
## Removing all cases without first consult from the frame
# Keep only events whose case id is in the `cases` list.
has_first_consult = diagnoses_data['case:concept:name'].isin(cases)
diagnoses_data = diagnoses_data.loc[has_first_consult]

In [392]:
# Number of distinct cases remaining
remaining_case_ids = diagnoses_data['case:concept:name']
remaining_case_ids.nunique(dropna=True)

55

In [393]:
# Number of events remaining
diagnoses_data.shape[0]

2604

## Remove all activities before the date of 'eerste consult'

In [394]:
## We create a dataframe with only the first consult events
# Use an exact match, as every other cell that tests for 'eerste consult'
# does. `str.contains` is a regex/substring search: it would also select any
# longer label that merely contains the text, and it produces a non-boolean
# mask (which `.loc` rejects) when the activity name is missing.
is_first_consult = diagnoses_data['new:event:concept:name'] == 'eerste consult'
first_consults = diagnoses_data.loc[is_first_consult]

In [396]:
## Keep only the consults with the earliest date in this frame
# The 'first' aggregation returns the first row of each group in frame order,
# which is the earliest consult only if the frame happens to be sorted by
# time. Sort chronologically on the parsed timestamps first (stable sort, so
# ties keep their original order). The aggregation runs on the original
# 'time:timestamp' values, so the resulting 'first' column keeps the same
# representation as the log and stays directly comparable with it.
first_consults = (
    first_consults
    .assign(_parsed_time=pd.to_datetime(first_consults['time:timestamp'], utc=True))
    .sort_values('_parsed_time', kind='mergesort')
    .groupby('case:concept:name')['time:timestamp']
    .agg(['first'])
    .reset_index()
)

In [397]:
# Keep just the case id and its first-consult timestamp
lookup_columns = ['case:concept:name', 'first']
first_consults = first_consults.loc[:, lookup_columns]

In [398]:
## We merge the first consult dates with our frame
# Left join on the case id: every event gets its case's 'first' timestamp.
diagnoses_data = pd.merge(
    diagnoses_data,
    first_consults,
    how='left',
    on='case:concept:name',
)

In [405]:
## Keep only events on or after the case's first consult
# At this point the timestamps are still ISO-8601 strings with a UTC offset
# (they are converted to datetime in a later cell). Comparing the raw strings
# is lexicographic and ignores the offset, so compare the parsed instants
# instead. The stored column values are left untouched.
event_time = pd.to_datetime(diagnoses_data['time:timestamp'], utc=True)
first_consult_time = pd.to_datetime(diagnoses_data['first'], utc=True)
diagnoses_data = diagnoses_data[event_time >= first_consult_time]

## Set first consult as artificial starting point

In [401]:
## First consult is the starting point, so keep only the one at the case's first timestamp
is_first_consult = diagnoses_data['new:event:concept:name'] == 'eerste consult'
not_at_start = diagnoses_data['time:timestamp'] != diagnoses_data['first']
# Index labels of 'eerste consult' events that are not at the case's 'first' timestamp
later_consults = diagnoses_data.index[is_first_consult & not_at_start]
diagnoses_data = diagnoses_data.drop(index=later_consults)


In [409]:
## Convert timestamp to datetime
parsed_timestamps = pd.to_datetime(diagnoses_data['time:timestamp'])
diagnoses_data['time:timestamp'] = parsed_timestamps

In [410]:
## Move the first consult one day earlier so it precedes the other events
# NOTE(review): the previous comment said "one hour", but the offset applied
# is one day (days=1) -- confirm which one is intended.
# Re-running this cell shifts the timestamps again.
is_first_consult = diagnoses_data['new:event:concept:name'] == 'eerste consult'
one_day_earlier = diagnoses_data['time:timestamp'] - pd.DateOffset(days=1)
# Index alignment assigns each selected row its own shifted value.
diagnoses_data.loc[is_first_consult, 'time:timestamp'] = one_day_earlier

In [413]:
### Save the dataset for further use
# The row index is written as the first, unnamed CSV column (to_csv default).
output_path = 'Data/final_diagnoses_set.csv'
diagnoses_data.to_csv(output_path)