In [0]:
from __future__ import print_function
from keras.models import Sequential
from keras import layers
from keras.layers import LSTM,TimeDistributed,Dense,RepeatVector
import numpy as np
import random
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [0]:
# Digit alphabet from which the operands are sampled
num='0123456789'
# Full vocabulary: the padding space, the ten digits and the four operator symbols
# ('-' also serves as the sign of a negative answer)
chars=' '+num+'-+*/'

In [0]:
# Number of distinct symbols, i.e. the width of every one-hot vector
char_len=len(chars)
# Index -> character lookup over the sorted vocabulary (read by decode)
indices_char=dict(enumerate(sorted(chars)))
# Character -> index lookup, the inverse of indices_char (read by encode)
char_indice={ch:idx for idx,ch in indices_char.items()}
# Fixed width of every input string. The longest expression generate_data builds is
# 3 digits + symbol + 3 digits = 7 characters; anything shorter is left-padded with spaces.
Max_length=9
# Fixed width of every answer string. The longest answer is 999*999=998001 (6 characters)
# and the most negative one is 0-999=-999 (4 characters); the rest is space padding.
Max_out=7


In [0]:
def generate_symbol():
  """Return one of the operator characters '+', '-', '*' or '/', chosen at random."""
  operators='+-*/'
  return operators[random.randrange(len(operators))]



In [0]:
#Function that generates train data
def generate_data(count):
  """Generate `count` random arithmetic problems together with their answers.

  Each problem combines two non-negative operands of 1 to 3 digits with an
  operator from generate_symbol(). Division is integer (floor) division; a
  draw that would divide by zero is discarded and redrawn, so exactly
  `count` samples are returned.

  Args:
    count: number of samples to generate; zero or a negative value gives empty lists.

  Returns:
    (X, y): two lists of strings of equal length. X holds the expressions
    (e.g. '  514*943') left-padded with spaces to Max_length characters,
    y holds the matching answers left-padded to Max_out characters.

  Raises:
    ValueError: if generate_symbol() returns an operator this function does not handle.
  """
  print('Generating {} data samples'.format(count))
  X=[]
  y=[]
  while count>0:
    # Draw 1 to 3 random digits from `num` per operand; int() drops any leading zeros
    in1=int(''.join(random.choices(num,k=random.randint(1,3))))
    in2=int(''.join(random.choices(num,k=random.randint(1,3))))

    sym=generate_symbol()
    if sym=='+':
      out=in1+in2
    elif sym=='-':
      out=in1-in2
    elif sym=='*':
      out=in1*in2
    elif sym=='/':
      if in2==0:
        # Division by zero has no answer: redraw without consuming one of the requested samples
        continue
      out=in1//in2
    else:
      raise ValueError('Unsupported operator: {!r}'.format(sym))

    inp=str(in1)+sym+str(in2)
    # Right-justify so every input / answer string has the same fixed width
    X.append(inp.rjust(Max_length))
    y.append(str(out).rjust(Max_out))
    count-=1

  return X,y



In [0]:
# Smoke test: generate a handful of samples and inspect the padded expression / answer strings
generate_data(5)

Generating 5 data samples


(['  514*943', '     99+2', '   57-752', '   307*35', '     49-9'],
 [' 484702', '    101', '   -695', '  10745', '     40'])

In [0]:
# Number of (expression, answer) pairs generated for the whole dataset (train + test)
training_samples=200000
# Seed Python's RNG so that Restart & Run All regenerates exactly the same dataset
random.seed(42)
X,y=generate_data(training_samples)

Generating 200000 data samples


In [0]:
# One-hot encode a string: one row per character, one column per vocabulary symbol.
def encode(data):
  """Return a (len(data), len(chars)) array holding a single 1 in every row.

  Row i has its 1 in column char_indice[data[i]]; a character that is missing
  from char_indice raises KeyError.
  """
  one_hot=np.zeros((len(data),len(chars)))
  columns=np.array([char_indice[ch] for ch in data],dtype=int)
  one_hot[np.arange(len(data)),columns]=1
  return one_hot

In [0]:
# Pre-allocate the one-hot tensors that the encoding loop fills in:
# inputs are (samples, Max_length, char_len), targets are (samples, Max_out, char_len)
X_new=np.zeros(shape=(training_samples,Max_length,char_len))
y_new=np.zeros(shape=(training_samples,Max_out,char_len))

In [0]:
# Peek at the first three padded answer strings
tuple(y[i] for i in range(3))

('      0', '      5', '    749')

In [0]:
# One-hot encode every generated expression and answer into its pre-allocated slot
for i in range(training_samples):
  expression,answer=X[i],y[i]
  X_new[i]=encode(expression)
  y_new[i]=encode(answer)

In [0]:
# X     -- 200k padded expression strings, each Max_length (9) characters long.
# X_new -- the same 200k samples, each a (9, 15) matrix: one 15-dim one-hot vector per character.
X_new.shape

(200000, 9, 15)

In [0]:
# y_new -- 200k answers, each Max_out (7) one-hot vectors of size char_len (15)
y_new.shape

(200000, 7, 15)

In [0]:
# Hold out 20% of the samples for validation, using a fixed random_state for the split
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y_new,
    test_size=0.2,
    random_state=42,
)

In [0]:
# Shapes of the train / test inputs, followed by the train / test targets
for split in (X_train, X_test, y_train, y_test):
  print(split.shape)

(160000, 9, 15)
(40000, 9, 15)
(160000, 7, 15)
(40000, 7, 15)


In [0]:
# Start from an empty Sequential container; the encoder / decoder layers are added in the following cell
model=Sequential()

In [0]:
# Build the encoder-decoder from a fresh Sequential every time this cell runs, so that
# re-running the cell does not keep stacking extra layers onto an existing model.
model=Sequential()

# Encoder: an LSTM with 256 units reads the one-hot input of shape (Max_length, char_len) = (9, 15)
# and returns only its final output, so the output shape is (None, 256).
model.add(LSTM(256,input_shape=(Max_length,char_len)))

# The decoder LSTM expects sequential, 3-dim input, so the encoder vector is repeated
# once per output character: (None, 256) -> (None, Max_out, 256).
model.add(RepeatVector(Max_out))

# Decoder: return_sequences=True makes the LSTM emit an output at every one of the
# Max_out timesteps instead of only the last one, giving (None, Max_out, 256).
model.add(LSTM(256,return_sequences=True))

# Per-position classifier: TimeDistributed applies the same Dense softmax layer to each of the
# Max_out 256-dim decoder outputs, producing one distribution over the char_len characters
# per output position -> (None, Max_out, char_len).
model.add(TimeDistributed(Dense(char_len,activation='softmax')))



In [0]:
# Print the layer-by-layer output shapes and parameter counts
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 256)               278528    
_________________________________________________________________
repeat_vector_5 (RepeatVecto (None, 7, 256)            0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 7, 256)            525312    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 7, 15)             3855      
Total params: 807,695
Trainable params: 807,695
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train with the Adam optimizer on categorical cross-entropy and track accuracy
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [0]:
# Train for up to 100 epochs in mini-batches of 64, validating on the held-out split
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=100,
    validation_data=(X_test, y_test),
)

Train on 160000 samples, validate on 40000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
 32320/160000 [=====>........................] - ETA: 1:19 - loss: 0.0186 - accuracy: 0.9945

KeyboardInterrupt: 

In [0]:
# NOTE(review): exact duplicate of the earlier training call. The recorded NameError suggests it
# was run in a session where one of the names it uses (presumably `model`) was not defined;
# consider removing this cell so the notebook survives Restart & Run All.
model.fit(X_train, y_train,batch_size=64,epochs=100,validation_data=(X_test, y_test))

NameError: ignored

In [0]:
# Turn one-hot (or softmax) vectors back into strings

def decode(data):
  """Map a batch of per-character score vectors back to a list of strings.

  For every position the index of the largest entry along the last axis selects
  the character through indices_char; each row of the batch becomes one string.
  """
  best=data.argmax(axis=-1)
  return [''.join(indices_char[idx] for idx in row) for row in best]

In [0]:
# Decode a few one-hot training inputs back into expression strings as a sanity check
decode(X_train[1:10])

['    1*771',
 '  341*577',
 '    0*598',
 '     9*78',
 '     0+46',
 '   73*805',
 '      5+2',
 '    9/989',
 '    2-695']

In [0]:
# Decode the training targets for the same slice.
# NOTE(review): the recorded output shows 4-character strings although Max_out is 7, and the values
# do not match the expressions decoded from X_train -- it appears to be left over from an earlier
# run; re-run the notebook to refresh it.
decode(y_train[1:10])

['  28', ' -58', '  -8', '-423', ' 680', '  62', ' -17', ' -88', '  33']

In [0]:
# Run the model on the same slice of training inputs; the last layer is a softmax,
# so each prediction holds one distribution over the vocabulary per output position
pred=model.predict(X_train[1:10])


In [0]:
# Turn the predictions into strings (argmax per position) to compare with the decoded targets
decode(pred)

['  28', ' -58', '  -8', '-423', ' 680', '  62', ' -17', ' -88', '  33']

## Problems with this approach:
In the general case, information about the entire input sequence is necessary in order to start generating the target sequence.

In [0]:
# My understanding is that backpropagation (bp) happens at each time step; the network does not wait for the whole input to be given.
# Processing starts as soon as it sees some input.


In [0]:
# But in language prediction tasks we want the decoder to wait until all the input has been given and only then start the decoding process.
# Also, in most cases (e.g. machine translation) we do not have a fixed input length and a fixed output length.