<a href="https://colab.research.google.com/github/MarcoFelipeKing/MockVsActualHospitalCare/blob/main/code/RNN_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
tf.__version__

'2.11.0'

In [2]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import numpy as np
import pandas as pd

In [3]:
# read in the data using pandas ~/Users/marcofking/Documents/GitHub/MockVsActualHospitalCare/MockVsActual_GithubDOI.csv

df = pd.read_csv('../data/MockVsActual_GithubDOI.csv')
df.head()

Unnamed: 0,ActivityID,CareType,HCWType,RoomType,Surface,Date,Time,Dev.Date.Time,type
0,1,IV,RN,SB,AlcOutside,08/03/2019,27/07/2020 11:08,08/03/2019 11:08,Mock
1,1,IV,RN,SB,In,08/03/2019,27/07/2020 11:08,08/03/2019 11:08,Mock
2,1,IV,RN,SB,Door,08/03/2019,27/07/2020 11:08,08/03/2019 11:08,Mock
3,1,IV,RN,SB,Door,08/03/2019,27/07/2020 11:08,08/03/2019 11:08,Mock
4,1,IV,RN,SB,Other,08/03/2019,27/07/2020 11:08,08/03/2019 11:08,Mock


In [4]:
# Group by 'ActivityID', 'CareType', 'type' and extract the sequences
sequences = df.sort_values(['Dev.Date.Time']).groupby(['ActivityID', 'CareType', 'type'])['Surface'].apply(list)

# Add start and end tokens to each sequence
sequences = [['start'] + sequence + ['end'] for sequence in sequences]
sequences[:2]
# Convert the grouped data back to a list of sequences
#sequences = sequences.tolist()



[['start',
  'GlovesOn',
  'In',
  'Patient',
  'Patient',
  'Sharps',
  'Patient',
  'BloodObsEq',
  'Syringe',
  'Patient',
  'Patient',
  'Patient',
  'Sharps',
  'Syringe',
  'Sharps',
  'Patient',
  'Patient',
  'Patient',
  'Patient',
  'Syringe',
  'Patient',
  'Sharps',
  'Patient',
  'Patient',
  'Sharps',
  'Out',
  'Sharps',
  'end'],
 ['start',
  'AlcOutside',
  'Door',
  'Door',
  'Other',
  'In',
  'Table',
  'Table',
  'Tray',
  'Table',
  'Table',
  'Table',
  'Table',
  'Table',
  'Table',
  'Table',
  'Table',
  'Patient',
  'Table',
  'Sharps',
  'Sharps',
  'Table',
  'Table',
  'Patient',
  'Table',
  'PaperTowel',
  'Waste',
  'Soap',
  'Sink',
  'Waste',
  'Sink',
  'Door',
  'Door',
  'Door',
  'Patient',
  'Door',
  'Door',
  'Table',
  'IV',
  'IV',
  'Waste',
  'IV',
  'Out',
  'Door',
  'Other',
  'Patient',
  'end']]

In [5]:
# We need to encode these string labels into integers for the model
encoder = LabelEncoder()
encoder.fit([surface for sequence in sequences for surface in sequence])
sequences_encoded = [encoder.transform(sequence) for sequence in sequences]

# Since RNNs require input sequences to be of the same length, we'll pad the sequences with zeros
sequences_padded = pad_sequences(sequences_encoded)

# Prepare inputs and targets
X = sequences_padded[:, :-1]  # all but the last surface
y = sequences_padded[:, -1]  # only the last surface

# Convert targets to one-hot encoding
y = np_utils.to_categorical(y)

# Define the model
model = Sequential()
model.add(SimpleRNN(50, input_shape=(X.shape[1], 1)))  # 50 is the number of recurrent units
model.add(Dense(y.shape[1], activation='softmax'))  # output layer

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Reshape input to be [samples, time steps, features] which is required for RNNs
X = np.reshape(X, (X.shape[0], X.shape[1], 1))

# Train the model
model.fit(X, y, epochs=30, batch_size=32)

2023-07-21 09:36:36.313349: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7ff36163dee0>

## Model Prediction Accuracy

In [None]:
y_true[:15]

array([32, 31, 14, 29, 12, 29, 39, 26, 39, 31, 24, 25, 22, 31, 22])

In [6]:
# Use the model to predict the next surface for each sequence
y_pred = model.predict(X)

# Convert the predictions from one-hot encoding to integers
y_pred = np.argmax(y_pred, axis=1)

# Convert the actual targets from one-hot encoding to integers
y_true = np.argmax(y, axis=1)

# Calculate the accuracy of the predictions
accuracy = np.mean(y_pred == y_true)

print(f'Accuracy: {accuracy * 100:.2f}%')

# Print out some of the predicted and actual sequences for comparison
for i in range(5):  # print the first 5 sequences
    print(f'Sequence {i+1}:')
    print('Predicted:', y_pred[i])
    print('Actual:', y_true[i])
    print()

Accuracy: 100.00%
Sequence 1:
Predicted: 44
Actual: 44

Sequence 2:
Predicted: 44
Actual: 44

Sequence 3:
Predicted: 44
Actual: 44

Sequence 4:
Predicted: 44
Actual: 44

Sequence 5:
Predicted: 44
Actual: 44



In [8]:
# Create a dictionary to store the counts of each transition
transition_counts = {}

for sequence in sequences:
    for i in range(len(sequence) - 1):
        # Get the current surface and the next surface
        current_surface = sequence[i]
        next_surface = sequence[i+1]

        # Create a tuple for the transition
        transition = (current_surface, next_surface)

        # Increment the count for this transition
        transition_counts[transition] = transition_counts.get(transition, 0) + 1

# Create lists to store the source, target, and value for each transition
sources = []
targets = []
values = []

for transition, count in transition_counts.items():
    source, target = transition
    sources.append(source)
    targets.append(target)
    values.append(count)

# Create a list of all unique surfaces
surfaces = list(set(sources + targets))

# Convert the sources and targets to indices
source_indices = [surfaces.index(source) for source in sources]
target_indices = [surfaces.index(target) for target in targets]

# Save the counts in a dataframe

# Convert the transition_counts dictionary to a DataFrame
df_transitions = pd.DataFrame.from_records([(source, target, count) for (source, target), count in transition_counts.items()], columns=['Source', 'Target', 'Count'])

# Save the DataFrame to a CSV file
df_transitions.to_csv('../data/transitions.csv', index=False)


## Transition Matrices for Observed and Predicted Data

In [10]:
from collections import defaultdict

# Initialize a dictionary to store the transition counts
transition_counts = defaultdict(int)

# Iterate over the sequences
for sequence in sequences:
    # Iterate over the transitions in the sequence
    for i in range(len(sequence) - 1):
        # Increment the count for this transition
        transition_counts[(sequence[i], sequence[i+1])] += 1

# Initialize a dictionary to store the transition probabilities
transition_probs = defaultdict(float)

# Calculate the total number of transitions from each state
total_transitions = defaultdict(int)
for (state_from, state_to), count in transition_counts.items():
    total_transitions[state_from] += count

# Calculate the transition probabilities
for (state_from, state_to), count in transition_counts.items():
    transition_probs[(state_from, state_to)] = count / total_transitions[state_from]

# Now, transition_probs is a dictionary where the keys are tuples (state_from, state_to)
# and the values are the probabilities of transitioning from state_from to state_to


In [11]:
transition_probs

defaultdict(float,
            {('start', 'GlovesOn'): 0.06097560975609756,
             ('GlovesOn', 'In'): 0.13725490196078433,
             ('In', 'Patient'): 0.07784431137724551,
             ('Patient', 'Patient'): 0.3638211382113821,
             ('Patient', 'Sharps'): 0.016260162601626018,
             ('Sharps', 'Patient'): 0.2962962962962963,
             ('Patient', 'BloodObsEq'): 0.032520325203252036,
             ('BloodObsEq', 'Syringe'): 0.003816793893129771,
             ('Syringe', 'Patient'): 0.3181818181818182,
             ('Sharps', 'Syringe'): 0.14814814814814814,
             ('Syringe', 'Sharps'): 0.045454545454545456,
             ('Patient', 'Syringe'): 0.07723577235772358,
             ('Sharps', 'Out'): 0.07407407407407407,
             ('Out', 'Sharps'): 0.012195121951219513,
             ('Sharps', 'end'): 0.037037037037037035,
             ('start', 'AlcOutside'): 0.04878048780487805,
             ('AlcOutside', 'Door'): 0.13636363636363635,
             (

In [None]:
pip install --upgrade nbformat


Collecting nbformat
  Downloading nbformat-5.9.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.5 MB/s eta 0:00:011
Collecting fastjsonschema
  Downloading fastjsonschema-2.17.1-py3-none-any.whl (23 kB)
Collecting jsonschema>=2.6
  Downloading jsonschema-4.18.4-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 7.6 MB/s  eta 0:00:01
[?25hCollecting jsonschema-specifications>=2023.03.6
  Downloading jsonschema_specifications-2023.7.1-py3-none-any.whl (17 kB)
Collecting attrs>=22.2.0
  Downloading attrs-23.1.0-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 11.3 MB/s eta 0:00:01
[?25hCollecting rpds-py>=0.7.1
  Downloading rpds_py-0.9.2-cp38-cp38-macosx_10_7_x86_64.whl (311 kB)
[K     |████████████████████████████████| 311 kB 32.8 MB/s eta 0:00:01
[?25hCollecting referencing>=0.28.4
  Downloading referencing-0.30.0-py3-none-any.whl (25 kB)
Collecting pkgutil-resolve-name>=1.3.10
  Downloading pkgutil_resolv

In [None]:
import plotly.graph_objects as go

# Create nodes
nodes = list(set([node for edge in transition_counts.keys() for node in edge]))

# Create a dictionary that maps each node to a unique index
node_to_index = {node: i for i, node in enumerate(nodes)}

# Create source, target, and value lists
source = [node_to_index[edge[0]] for edge in transition_counts.keys()]
target = [node_to_index[edge[1]] for edge in transition_counts.keys()]
value = list(transition_counts.values())

# Create the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        color="blue"
    ),
    link=dict(
        source=source,
        target=target,
        value=value
    )
)])

# Show the figure
fig.show()
