<a href="https://colab.research.google.com/github/MarcoFelipeKing/MockVsActualHospitalCare/blob/main/RNN_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
tf.__version__

'2.12.0'

In [1]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import numpy as np
import pandas as pd

In [2]:
# read in the data using pandas ~/Users/marcofking/Documents/GitHub/MockVsActualHospitalCare/MockVsActual_GithubDOI.csv

df = pd.read_csv('MockVsActual_GithubDOI.csv')
df.head()

FileNotFoundError: ignored

In [None]:
# Group by 'ActivityID', 'CareType', 'type' and extract the sequences
sequences = df.sort_values(['Dev.Date.Time']).groupby(['ActivityID', 'CareType', 'type'])['Surface'].apply(list)

# Add start and end tokens to each sequence
sequences = [['start'] + sequence + ['end'] for sequence in sequences]
sequences[:2]
# Convert the grouped data back to a list of sequences
#sequences = sequences.tolist()



In [None]:
# We need to encode these string labels into integers for the model
encoder = LabelEncoder()
encoder.fit([surface for sequence in sequences for surface in sequence])
sequences_encoded = [encoder.transform(sequence) for sequence in sequences]

# Since RNNs require input sequences to be of the same length, we'll pad the sequences with zeros
sequences_padded = pad_sequences(sequences_encoded)

# Prepare inputs and targets
X = sequences_padded[:, :-1]  # all but the last surface
y = sequences_padded[:, -1]  # only the last surface

# Convert targets to one-hot encoding
y = np_utils.to_categorical(y)

# Define the model
model = Sequential()
model.add(SimpleRNN(50, input_shape=(X.shape[1], 1)))  # 50 is the number of recurrent units
model.add(Dense(y.shape[1], activation='softmax'))  # output layer

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Reshape input to be [samples, time steps, features] which is required for RNNs
X = np.reshape(X, (X.shape[0], X.shape[1], 1))

# Train the model
model.fit(X, y, epochs=30, batch_size=32)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x78fe01c0c580>

## Model Prediction Accuracy

In [None]:
y_true[:15]

array([32, 31, 14, 29, 12, 29, 39, 26, 39, 31, 24, 25, 22, 31, 22])

In [None]:
# Use the model to predict the next surface for each sequence
y_pred = model.predict(X)

# Convert the predictions from one-hot encoding to integers
y_pred = np.argmax(y_pred, axis=1)

# Convert the actual targets from one-hot encoding to integers
y_true = np.argmax(y, axis=1)

# Calculate the accuracy of the predictions
accuracy = np.mean(y_pred == y_true)

print(f'Accuracy: {accuracy * 100:.2f}%')

# Print out some of the predicted and actual sequences for comparison
for i in range(5):  # print the first 5 sequences
    print(f'Sequence {i+1}:')
    print('Predicted:', y_pred[i])
    print('Actual:', y_true[i])
    print()

In [None]:
import plotly.graph_objects as go

# Create a dictionary to store the counts of each transition
transition_counts = {}

for sequence in sequences:
    for i in range(len(sequence) - 1):
        # Get the current surface and the next surface
        current_surface = sequence[i]
        next_surface = sequence[i+1]

        # Create a tuple for the transition
        transition = (current_surface, next_surface)

        # Increment the count for this transition
        transition_counts[transition] = transition_counts.get(transition, 0) + 1

# Create lists to store the source, target, and value for each transition
sources = []
targets = []
values = []

for transition, count in transition_counts.items():
    source, target = transition
    sources.append(source)
    targets.append(target)
    values.append(count)

# Create a list of all unique surfaces
surfaces = list(set(sources + targets))

# Convert the sources and targets to indices
source_indices = [surfaces.index(source) for source in sources]
target_indices = [surfaces.index(target) for target in targets]

# Create the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=surfaces,
    ),
    link=dict(
        source=source_indices,
        target=target_indices,
        value=values,
    )
)])

fig.show()
