<a href="https://colab.research.google.com/github/Monica-Lu/Data_Science_Research/blob/main/RNN_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

####UKHSA_region: Geographical Region
####specimen_date: Date when specimens are collected
####n: Count of number of positive Omicron cases on that date
####percent: Percent of tests that were Omicron among all the positive cases
sgtf:
####total: total number of tests given on that date
####conf_low: lower bound of confidence interval for percent
####conf_high: higher bound of confidence interval for percent

data from: https://www.kaggle.com/code/emirhanai/omicron-variant-ml-100-accuracy-second-version/notebook

In [72]:
#upload file
from google.colab import files
uploaded = files.upload()

Saving sgtf_regionepicurve_2021-12-30.csv to sgtf_regionepicurve_2021-12-30 (4).csv


In [73]:
#Parse Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random

df = pd.read_csv("sgtf_regionepicurve_2021-12-30.csv", encoding="utf-8")
print(df.head())

UKHSA_region_group = df.groupby(['UKHSA_region'])
print(UKHSA_region_group.head())

df = df.set_index('specimen_date')
df.index = pd.to_datetime(df.index, unit ='ns')
print(df.index)

df = df.sort_values('specimen_date')


    UKHSA_region specimen_date    n  percent                         sgtf  \
0  East Midlands    01/11/2021  874   100.00  Cases with confirmed S-gene   
1  East Midlands    02/11/2021  724    99.86  Cases with confirmed S-gene   
2  East Midlands    02/11/2021    1     0.14    Cases with confirmed SGTF   
3  East Midlands    03/11/2021  757   100.00  Cases with confirmed S-gene   
4  East Midlands    04/11/2021  598   100.00  Cases with confirmed S-gene   

   total   conf_low  conf_high  
0    874  99.454454  100.00000  
1    725  99.109120   99.99280  
2    725   0.007200    0.89088  
3    757  99.370567  100.00000  
4    598  99.204289  100.00000  
             UKHSA_region specimen_date     n  percent  \
0           East Midlands    01/11/2021   874   100.00   
1           East Midlands    02/11/2021   724    99.86   
2           East Midlands    02/11/2021     1     0.14   
3           East Midlands    03/11/2021   757   100.00   
4           East Midlands    04/11/2021   598   1

In [74]:
#Train, test, split
import numpy as np

T = 14 #input sequence length(how much the RNN is going to remember)
num_pairs = 107 - T
train_size = int(num_pairs*0.7) #70% train 30% test
test_size = num_pairs - train_size

X = []
y = []

total_cases = df['total'].values

for i in range(len(total_cases) - T):
  X.append(total_cases[i:i+T].reshape(-1,1)) # a seq of T values
  y.append(total_cases[i+T]) # target will be right after the sequence

#East_midland Data Manual train test split
X_train = X[:train_size]
y_train = y[:train_size]

X_test = X[train_size:]
y_test = y[train_size:]

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)


In [75]:
#implement rnn
import numpy as np

np.random.seed(0)
class vanilla_many_to_one_RNN(object):
  """
    input_size = 1
    hidden_size = 32
    output_size = 1
  Dimensions
    self.W_hh = (hidden_size, hidden_size) Recurrent Weight
    self.W_xh = (hidden_size, input_size) Maps input to hidden state
    self.W_hy = (output_size, hidden_size) Maps final hidden state to output
    self.b_h = (hidden_size, ) Bias for hidden state
    self.b_y = (output_size, ) Bias for output
  """
  def __init__(self):
    input_size = 1
    hidden_size = 32
    output_size = 1
    self.hidden_state = np.zeros((hidden_size, ))
    self.W_hh = np.random.randn(hidden_size,hidden_size)
    self.W_xh = np.random.randn(hidden_size,input_size)
    self.W_hy = np.random.randn(output_size,hidden_size)
    self.b_h = np.random.randn(hidden_size, )
    self.b_y = np.random.randn(output_size, )

  def forward_prop(self, x_seq):
    h = self.hidden_state
    h_list = []

    for t in range(len(x_seq)):
      x_t = x_seq[t] #x_seq at time t, shape = (input_size, )
      h = np.tanh(self.W_xh @ x_t + self.W_hh @ h + self.b_h) #RNN Forward Props Formula, shape = (hidden_size, )
      h_list.append(h)

    y_hat = self.W_hy @ h + self.b_y #shape = (output_size,)
    return y_hat

  #def back_prop(self):



In [76]:
#implement loss + optimizer
class Loss:
   def mean_squared_error(y_pred, y_true):
      return np.mean((y_pred - y_true) ** 2)

   def mean_avg_error(y_pred, y_true):
      return np.mean(abs(y_pred - y_true))



In [77]:
preds = []
truths = []

for i in range(len(X_train)):
  x_seq = X_train[i]
  y_true = y_train[i]

  model = vanilla_many_to_one_RNN()
  y_pred = model.forward_prop(x_seq)
  preds.append(y_pred)
  truths.append(y_true)

y_preds = np.array(preds)
y_trues = np.array(truths)

mae_loss = Loss.mean_avg_error(y_preds, y_trues)
mse_loss = Loss.mean_squared_error(y_preds, y_trues)
print("MAE:", mae_loss)
print("MSE:", mse_loss)

MAE: 1969.4333117602473
MSE: 4704616.224849808
