# Formula to calculate the number of parameters of LSTM

The formula for trainable parameter in the LSTM layer is is

$4*(n+m+1)*m$

**n** is the dimension of the input vector

**m** is the number of LSTM units in a layer

**1** is the bias parameter


In [1]:
# type: ignore
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input , Dense , LSTM , Embedding , Dropout , add, Conv2D

inputs = Input(shape=(400, 3))
lstm = LSTM(50, activation='relu')(inputs)
outputs = Dense(1)(lstm)
model = Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 400, 3)]          0         
                                                                 
 lstm (LSTM)                 (None, 50)                10800     
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 10,851
Trainable params: 10,851
Non-trainable params: 0
_________________________________________________________________


In [2]:
inputs = Input(shape=(10, 8))
lstm = LSTM(50, activation='relu')(inputs)
outputs = Dense(1)(lstm)
model = Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 10, 8)]           0         
                                                                 
 lstm_1 (LSTM)               (None, 50)                11800     
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 11,851
Trainable params: 11,851
Non-trainable params: 0
_________________________________________________________________


In [4]:
def gen_lstm_model(lstm_param=LSTM(50)):
    inputs = Input(shape=(10, 3))
    lstm = lstm_param(inputs)
    # outputs = Dense(1)(lstm)
    model = Model(inputs=inputs, outputs=lstm)
    # model.summary()
    print('Output shape', model.output_shape)
    print(model.count_params())
    print('*'*100)

gen_lstm_model(LSTM(50))
# use_bias: wheter the model uses a bias vector b: y = Wx + b
gen_lstm_model(LSTM(50, use_bias=False))
# return_sequences: return shape (time_steps, units), output at each timestamp
gen_lstm_model(LSTM(50, return_sequences=True))
# return_states: return [output (units), state_h (units), state_c (units)]. In this case, state_h == output
gen_lstm_model(LSTM(50, return_state=True))
# return_sequences and return_states: 
# return [output (time_steps, units), states_h (units), state_c (units)]. In this case, state_h == output[-1]
gen_lstm_model(LSTM(50, return_state=True, return_sequences=True))

Output shape (None, 50)
10800
****************************************************************************************************
Output shape (None, 50)
10600
****************************************************************************************************
Output shape (None, 10, 50)
10800
****************************************************************************************************
Output shape [(None, 50), (None, 50), (None, 50)]
10800
****************************************************************************************************
Output shape [(None, 10, 50), (None, 50), (None, 50)]
10800
****************************************************************************************************


In this case, the number of parameters of this LSTM layer can be calculated as: $4*(3+50+1)*50=10800$

In [3]:
# inputs (n_samples, n_time_steps, feature_vector)
inputs = tf.random.normal([32, 10, 8])
model.compile()
LSTM(50)(inputs).shape

TensorShape([32, 50])

In [10]:
model = Sequential()
model.add(tf.keras.layers.TimeDistributed(LSTM(64, return_sequences=True), input_shape=(10, 32)))
model.add(tf.keras.layers.TimeDistributed(Dense(10)))

model.compile(optimizer='adam', loss='mse')

model.summary()
model.output_shape

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

# LSTM output when parameters changed

```
tf.keras.layers.LSTM(
    units,
    activation="tanh",
    recurrent_activation="sigmoid",
    use_bias=True,
    kernel_initializer="glorot_uniform",
    recurrent_initializer="orthogonal",
    bias_initializer="zeros",
    unit_forget_bias=True,
    kernel_regularizer=None,
    recurrent_regularizer=None,
    bias_regularizer=None,
    activity_regularizer=None,
    kernel_constraint=None,
    recurrent_constraint=None,
    bias_constraint=None,
    dropout=0.0,
    recurrent_dropout=0.0,
    return_sequences=False,
    return_state=False,
    go_backwards=False,
    stateful=False,
    time_major=False,
    unroll=False,
    **kwargs
)
```



**units** parameter refers to dimensionality of the output. Each LSTM cell (present at a given time_step) takes in input x and forms a hidden state vector a, the length of this hidden vector is what is called the **units** in LSTM.

When **units** is increased, width of the network increase, increases the number of parameters thus takes longer to train. If the **units** is too large, it might lead to overfitting.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
import numpy as np

# Load the Iris dataset
iris = datasets.load_iris()

# The features are stored in the 'data' attribute
X = iris.data
X = np.expand_dims(X, 1)

# The target values are stored in the 'target' attribute
y = iris.target
y = np.expand_dims(y, 1)

# Split the data into training and testing set  s
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model with 50 units
model1 = Sequential()
model1.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1:])))
model1.add(Dense(1))
model1.compile(optimizer='adam', loss='mse')

# Define the model with 100 units
model2 = Sequential()
model2.add(LSTM(100, activation='relu', input_shape=(X_train.shape[1:])))
model2.add(Dense(1))
model2.compile(optimizer='adam', loss='mse')


# Train the model
model1.fit(X_train, y_train, epochs=10, verbose=1)
print("-"*40)
model2.fit(X_train, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
----------------------------------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a0debf74c8>

**use_bias** parameter in an LSTM layer is a boolean that determines whether the layer uses a bias vector.**use_bias** set to True can help the model fit the data better, but it also increases the number of parameters in the model, which can potentially lead to overfitting.

In [6]:
inputs = Input(shape=(400, 3))
lstm = LSTM(50, activation='relu', use_bias=False)(inputs)
outputs = Dense(1)(lstm)
model = Model(inputs=inputs, outputs=outputs)
# print the summary of the model
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 400, 3)]          0         
                                                                 
 lstm_11 (LSTM)              (None, 50)                10600     
                                                                 
 dense_4 (Dense)             (None, 1)                 51        
                                                                 
Total params: 10,651
Trainable params: 10,651
Non-trainable params: 0
_________________________________________________________________
