<a href="https://colab.research.google.com/github/Joshi-Ketaki/MLPlayground/blob/main/Stateless_Shuffling_History_MLMastery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STUDY STATELESSNESS AND SHUFFLING
# THIS IS ON ONE TO ONE CHAR MAPPING, I.E. PREDICT NEXT CHAR GIVEN CURRENT CHAR
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical

# Fix the random seeds for reproducibility.
# tf.random.set_seed only seeds TensorFlow's global generator; NumPy keeps its
# own generator, so it is seeded as well to make the np.random.randint sampling
# used in the "random patterns" demo further down repeatable between runs.
np.random.seed(7)
tf.random.set_seed(7)

# Raw dataset: the upper-case alphabet, in order.
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Lookup tables: letter -> integer code (0-25) and integer code -> letter.
char_to_int = {letter: code for code, letter in enumerate(alphabet)}
int_to_char = {code: letter for code, letter in enumerate(alphabet)}

# Build the (input window -> next letter) pairs, encoded as integers.
# With seq_length = 1 every input is a single letter, e.g. A -> B.
seq_length = 1
dataX = []
dataY = []
for start in range(len(alphabet) - seq_length):
    stop = start + seq_length
    window, target = alphabet[start:stop], alphabet[stop]
    dataX.append([char_to_int[letter] for letter in window])
    dataY.append(char_to_int[target])
    print(window, '->', target)

# Shape the inputs as [samples, time steps, features].
X = np.asarray(dataX).reshape((len(dataX), seq_length, 1))

# Normalize: the integer letter codes held in dataX (0 for 'A' up to 24 for 'Y')
# are divided by the alphabet size (26), so every input value lies in [0, 1).
# print("before normalization X", X)
X = X / float(len(alphabet))
# print("after normalization X", X)

# One-hot encode the output variable: each row of y is all zeros except for
# a single 1 at the index of the integer code stored in dataY.
y = to_categorical(dataY)
# print(y)

#############################################################
# PREDICT NEXT CHARACTER BASED ON CURRENT CHARACTER
# EACH SAMPLE IS ONE INPUT CHARACTER (the batch size is chosen in the fit call below)
# Create the model as a stack of layers.
model = Sequential([
    # 32 LSTM units; the input shape is (time steps, features), taken from X.
    LSTM(32, input_shape=X.shape[1:]),
    # Fully connected output layer (the counterpart of PyTorch's Linear layer)
    # producing one softmax probability per output class.
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# 1. DATA SHUFFLING:
# By default (shuffle=True) Keras shuffles the training data before each epoch.
# This disturbs learning of the sequential pattern.
# With the default setting, i.e. shuffle=True, accuracy = 80% (batch_size=1):
# model.fit(X, y, epochs=500, batch_size= 1, verbose=2)
# Setting shuffle=False increases accuracy to 88% (still with batch_size=1):
# model.fit(X, y, epochs=500, batch_size= 1, verbose=2, shuffle=False)

# 2. BATCH SIZE, STATEFULNESS, STATELESSNESS
# By default Keras resets the LSTM state after each batch.
# So, if batch_size = 1 and the data set holds more samples than one batch,
# the state is reset after each batch, i.e. after every single sample.
# One way of keeping context while using the default settings is to put the
# whole data set into one batch: batch_size >= length_of_data.
# This required more epochs to achieve 100% accuracy.
# NOTE(review): with the default stateful=False every sample appears to start
# from a fresh state, even inside one batch, so the gain seen here may come from
# full-batch gradient updates rather than from state being carried - confirm
# against the Keras LSTM documentation.

# But note that this hacky statefulness makes context available during training,
# but not during testing.
# This is 'stateful', shuffle=False; this achieved accuracy = 100%.
model.fit(
    X,
    y,
    batch_size=len(dataX),
    epochs=5000,
    shuffle=False,
    verbose=2,
)

# This is 'stateful', shuffle=True; this also achieved accuracy = 100%.
# model.fit(X, y, epochs=5000, batch_size=len(dataX), verbose=2, shuffle=True)

# Summary:
# Not shuffling the data increases accuracy in stateless models.
# If the model maintains state across batches, accuracy is maintained
# irrespective of whether the data is shuffled or not.
# Therefore, the effect of stateful vs. stateless >> the effect of shuffling.
# Summarize performance of the model on the training data.
# scores[1] is read as the accuracy metric requested at compile time.
scores = model.evaluate(X, y, verbose=0)
accuracy_percent = scores[1] * 100
print("Model Accuracy: %.2f%%" % accuracy_percent)

def _show_prediction(pattern):
    """Predict the letter that follows *pattern* and print ``input -> output``.

    pattern is a list of integer letter codes (one entry of dataX). It is
    reshaped to a single sample of shape [1, time steps, 1] and scaled by the
    alphabet size, matching how X was prepared for training.
    """
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    # The most probable class index maps straight back to a letter.
    result = int_to_char[np.argmax(prediction)]
    seq_in = [int_to_char[value] for value in pattern]
    print(seq_in, "->", result)


# demonstrate some model predictions: every training pattern, in order
for pattern in dataX:
    _show_prediction(pattern)


# demonstrate predicting random patterns: 20 training patterns drawn at random
print("Test a Random Pattern:")
for _ in range(20):
    _show_prediction(dataX[np.random.randint(len(dataX))])
#############################################################

In [14]:
# STUDY HISTORY - AS FEATURES OR AS TIMESTEPS
# Naive LSTM to learn a three-char window to one-char mapping
# (AS RUN BELOW: BATCH_SIZE=1 AND SHUFFLE=FALSE)
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical
# Seed TensorFlow's global random generator so repeated runs are reproducible.
tf.random.set_seed(seed=7)
# Raw dataset: the upper-case alphabet, in order.
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Lookup tables: letter -> integer code (0-25) and integer code -> letter.
char_to_int = dict(zip(alphabet, range(len(alphabet))))
int_to_char = dict(enumerate(alphabet))
# Build the (three-letter window -> next letter) pairs, encoded as integers,
# e.g. ABC -> D becomes [0, 1, 2] -> 3.
seq_length = 3
dataX = []
dataY = []
for offset in range(len(alphabet) - seq_length):
    window = alphabet[offset:offset + seq_length]
    following = alphabet[offset + seq_length]
    dataX.append([char_to_int[letter] for letter in window])
    dataY.append(char_to_int[following])
    print(window, '->', following)

# 1. THREE-CHARACTER FEATURE WINDOW (active)
# X is laid out as [samples, time steps, features] with ONE time step that
# carries all three letters as features.
# example of one sample: [[ 3  4  5]]
X = np.asarray(dataX).reshape((len(dataX), 1, seq_length))
#print("CREATES A THREE CHARACHTER FEATURE WINDOW", X)
# Recorded accuracy for this layout: 86.96%.
# Why: the network gets more context, but all of it inside a single time
# step, so it never sees the letters as a sequence; it would also need the
# sequence structure to learn better, and this setup does not provide that.

# 2. THREE-TIMESTEP WINDOW (alternative; swap it in for the reshape above)
# X is laid out as [samples, time steps, features] with THREE time steps of
# one feature each.
# example of one sample:
# [[ 3]
#  [ 4]
#  [ 5]]
#X = np.reshape(dataX, (len(dataX), seq_length, 1))
#print ("2. CREATES A THREE TIMESTEP WINDOW", X)
# Recorded accuracy for this layout: 100%.
# Why: each sample is now a real sequence (see the example above), so the
# network gets both the sequence structure and more context (three time
# steps per sample), and accuracy improves.

# Normalize: divide the integer letter codes by the alphabet size.
X = X / float(len(alphabet))
# One-hot encode the output variable.
y = to_categorical(dataY)
# Create the model: an LSTM layer whose input shape (time steps, features) is
# taken from X, followed by a softmax output layer with one unit per class.
model = Sequential([
    LSTM(32, input_shape=X.shape[1:]),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Recorded results: fitting without the shuffle argument gave accuracy 86.96%;
# shuffle=False (as run here) also gave accuracy 86.96%.
model.fit(X, y, batch_size=1, epochs=500, shuffle=False, verbose=2)
# Summarize performance of the model on the training data.
# scores[1] is read as the accuracy metric requested at compile time.
scores = model.evaluate(X, y, verbose=0)
accuracy_percent = scores[1] * 100
print("Model Accuracy: %.2f%%" % accuracy_percent)
# demonstrate some model predictions
# Each pattern is reshaped to one sample with the same (time steps, features)
# layout as the training data X. Deriving the shape from X keeps this loop
# correct for both the three-features window (1, 1, 3) and the three-timesteps
# window (1, 3, 1), with no second line to toggle by hand.
sample_shape = (1,) + X.shape[1:]
for pattern in dataX:
    x = np.reshape(pattern, sample_shape)
    x = x / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    # The most probable class index maps straight back to a letter.
    index = np.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    print(seq_in, "->", result)

ABC -> D
BCD -> E
CDE -> F
DEF -> G
EFG -> H
FGH -> I
GHI -> J
HIJ -> K
IJK -> L
JKL -> M
KLM -> N
LMN -> O
MNO -> P
NOP -> Q
OPQ -> R
PQR -> S
QRS -> T
RST -> U
STU -> V
TUV -> W
UVW -> X
VWX -> Y
WXY -> Z
Epoch 1/500
23/23 - 3s - loss: 3.2688 - accuracy: 0.0435 - 3s/epoch - 111ms/step
Epoch 2/500
23/23 - 0s - loss: 3.2566 - accuracy: 0.0870 - 47ms/epoch - 2ms/step
Epoch 3/500
23/23 - 0s - loss: 3.2502 - accuracy: 0.0870 - 48ms/epoch - 2ms/step
Epoch 4/500
23/23 - 0s - loss: 3.2440 - accuracy: 0.0870 - 49ms/epoch - 2ms/step
Epoch 5/500
23/23 - 0s - loss: 3.2377 - accuracy: 0.0870 - 44ms/epoch - 2ms/step
Epoch 6/500
23/23 - 0s - loss: 3.2313 - accuracy: 0.0870 - 49ms/epoch - 2ms/step
Epoch 7/500
23/23 - 0s - loss: 3.2246 - accuracy: 0.0435 - 44ms/epoch - 2ms/step
Epoch 8/500
23/23 - 0s - loss: 3.2175 - accuracy: 0.0000e+00 - 45ms/epoch - 2ms/step
Epoch 9/500
23/23 - 0s - loss: 3.2100 - accuracy: 0.0000e+00 - 46ms/epoch - 2ms/step
Epoch 10/500
23/23 - 0s - loss: 3.2019 - accuracy: 0.000