<a href="https://colab.research.google.com/github/Gazda70/DeepLearningInPython/blob/main/recurrent_network_words2numbers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model transforming words into a number
- input: sequence of letters
- output: number


Examples:
- input: sequence 'one hundred' output: 100
- input: sequence 'two hundred fourteen' output: 214

The code in file number2words.py taken from: https://www.codesansar.com/python-programming-examples/number-words-conversion-no-library-used.htm

In [None]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
import numpy as np
#from number2words import getWords

In [None]:
def process(number, index):

    if number=='0':
        return 'zero'

    length = len(number)

    if(length > 3):
        return False

    number = number.zfill(3)
    words = ''

    hdigit = int(number[0])
    tdigit = int(number[1])
    odigit = int(number[2])

    words += '' if number[0] == '0' else ones[hdigit]
    words += ' hundred ' if not words == '' else ''

    if(tdigit > 1):
        words += tens[tdigit - 2]
        words += ' '
        words += ones[odigit]

    elif(tdigit == 1):
        words += twos[(int(tdigit + odigit) % 10) - 1]

    elif(tdigit == 0):
        words += ones[odigit]

    if(words.endswith('zero')):
        words = words[:-len('zero')]
    else:
        words += ' '

    if(not len(words) == 0):
        words += suffixes[index]

    return words;

In [None]:
def getWords(number):
    length = len(str(number))

    if length>12:
        return 'This program supports upto 12 digit numbers.'

    count = length // 3 if length % 3 == 0 else length // 3 + 1
    copy = count
    words = []

    for i in range(length - 1, -1, -3):
        words.append(process(str(number)[0 if i - 2 < 0 else i - 2 : i + 1], copy - count))
        count -= 1;

    final_words = ''
    for s in reversed(words):
        temp = s + ' '
        final_words += temp

    return final_words

In [None]:
class ToRoman(int):
    def __new__(cls, number):
        if number > 3999:
            raise ValueError('Values over 3999 are not allowed: {}'.format(number))
        if number < 0:
            raise ValueError('Negative values are not allowed: {}'.format(number))
        return super().__new__(cls, number)

    def __init__(self, number):
        to_roman = {1: 'I', 2: 'II', 3: 'III', 4: 'IV', 5: 'V',
                6: 'VI', 7: 'VII', 8: 'VIII', 9: 'IX', 10: 'X', 20: 'XX',
                30: 'XXX', 40: 'XL', 50: 'L', 60: 'LX', 70: 'LXX', 80: 'LXXX',
                90: 'XC', 100: 'C', 200: 'CC', 300: 'CCC', 400: 'CD', 500: 'D',
                600: 'DC', 700: 'DCC', 800: 'DCCC', 900: 'CM', 1000: 'M',
                2000: 'MM', 3000: 'MMM'}
        self.roman = ''.join([to_roman.get(num) for num in self][::-1])

    def __iter__(self):
        number = self.__str__()
        count = 1
        for digit in number[::-1]:
            if digit != '0':
                yield int(digit) * count
            count *= 10

class ToArabic(str):
    def __init__(self, roman):
        roman = self.check_valid(roman)
        keys = ['IV', 'IX', 'XL', 'XC', 'CD', 'CM', 'I', 'V', 'X', 'L', 'C', 'D', 'M']
        to_arabic = {'IV': '4', 'IX': '9', 'XL': '40', 'XC': '90', 'CD': '400', 'CM': '900',
                'I': '1', 'V': '5', 'X': '10', 'L': '50', 'C': '100', 'D': '500', 'M': '1000'}
        for key in keys:
            if key in roman:
                roman = roman.replace(key, ' {}'.format(to_arabic.get(key)))
        self.arabic = sum(int(num) for num in roman.split())

    def check_valid(self, roman):
        roman = roman.upper()
        invalid = ['IIII', 'VV', 'XXXX', 'LL', 'CCCC', 'DD', 'MMMM']
        if any(sub in roman for sub in invalid):
            raise ValueError('Numerus invalidus est: {}'.format(roman))
        return roman

def convert(number):
    if isinstance(number, int):
        num = ToRoman(number)
        return num.roman
    num = ToArabic(number)
    return num.arabic

## Model

In [None]:
model = Sequential()
model.add(LSTM(128,input_shape=(None,1),return_sequences=True)) # sequences of singlen numbers
model.add(LSTM(128))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer="adam",metrics=['mae','mse'])
num_epochs = 0
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, None, 128)         66560     
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 198,273
Trainable params: 198,273
Non-trainable params: 0
_________________________________________________________________


## Dataset creation

### Helper methods

In [None]:
# helper method, converts sequence of numbers to text
def to_text(sample):
    return ''.join([idx2char[int(x)] for x in sample])
# helper method, converts text to sequence of numbers
def to_number(words):
    return np.array([char2idx[char] for char in words])

### Dataset - **samples** and **labels**

In [None]:
DATASET_SIZE=200

samples = []
labels = []
all_words = ''
max_len = 0
for i in range(DATASET_SIZE):
    labels.append(i)
    #words = lslownie(i)
    words = convert(i)
    samples.append(words)
    all_words += words
    if len(words)>max_len:
        max_len = len(words)

print('Max len of text',max_len)
vocab = sorted(set(all_words))
vocab_size = len(vocab)
print('vocabulary (used letters)',vocab)
print ('unique characters',vocab_size)

Max len of text 9
vocabulary (used letters) ['C', 'I', 'L', 'V', 'X']
unique characters 5


#### Creating a mapping from unique characters to indices

In [None]:
char2idx = {char:index for index, char in enumerate(vocab)}
print('char2idx:\n',char2idx)
idx2char = np.array(vocab)
print('idx2char\n',idx2char)

char2idx:
 {'C': 0, 'I': 1, 'L': 2, 'V': 3, 'X': 4}
idx2char
 ['C' 'I' 'L' 'V' 'X']


#### Convert letters to numbers using char2idx

In [None]:
samples_int = []
for s in samples:
    v = np.array([char2idx[char] for char in s])
    samples_int.append(v) # different sizes!
print(samples[123],' ->becomes-> ',samples_int[123])

CXXIII  ->becomes->  [0 4 4 1 1 1]


#### From list of lists to numpy - must have a fixed number of characters (30 -> max_len)

In [None]:
samples = np.zeros((DATASET_SIZE,max_len))
for i in range(len(samples_int)):
    for j in range(len(samples_int[i])):
        samples[i,j] = np.array(samples_int[i][j]) # all not used have '0' which is ' '
print('SAMPLES\n\n',samples)
print(samples.shape)

SAMPLES

 [[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [0. 4. 0. ... 0. 0. 0.]
 [0. 4. 0. ... 1. 0. 0.]
 [0. 4. 0. ... 0. 0. 0.]]
(200, 9)


In [None]:
samples = np.expand_dims(samples,axis=2) #add the third dimension
labels = np.array(labels,dtype=float)

print("Sample (for 123):\n",samples[123])
print("Sample decoded",to_text(samples[123]))
print("Label (output):",labels[123])

print('samples shape',samples.shape)
print('labels shape',labels.shape)

Sample (for 123):
 [[0.]
 [4.]
 [4.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]]
Sample decoded CXXIIICCC
Label (output): 123.0
samples shape (200, 9, 1)
labels shape (200,)


In [None]:
TRAINING_SIZE = .5
from sklearn.model_selection import train_test_split
(trainSamples, testSamples, trainLabels, testLabels) = train_test_split(samples, labels,train_size=TRAINING_SIZE, random_state=1)
print('Training samples:',len(trainSamples),' test samples',len(testSamples))

Training samples: 100  test samples 100


In [None]:
import random

def check_model(verbose=0,how_many=10):
    pred = model.predict(samples)
    print('text => [predicted value] error=[error]')
    error = []
    for i in range(len(pred)):
        res = samples[i]
        error.append(abs(i-pred[i]))
        if verbose==1:
            train = ''
            if i in trainLabels: train='[T]'
            print(i,to_text(res),'=> {:.2f} error = {:.2f}'.format(pred[i,0],abs(i-pred[i,0])),train)
    if verbose<1: # if not verbose just display 'how_many' random samples
        for i in range(how_many):
            x = random.randrange(DATASET_SIZE)
            res = samples[x]
            print(to_text(res),'=>  {:.2f} error = {:.2f}'.format(pred[x,0],abs(x-pred[x,0])))
    print('Mean error =',np.mean(error))
    return np.mean(error)
check_model(1)

text => [predicted value] error=[error]
0 CCCCCCCCC => 0.00 error = 0.00 [T]
1 ICCCCCCCC => -0.00 error = 1.00 [T]
2 IICCCCCCC => -0.00 error = 2.00 [T]
3 IIICCCCCC => -0.01 error = 3.01 [T]
4 IVCCCCCCC => -0.01 error = 4.01 
5 VCCCCCCCC => -0.01 error = 5.01 
6 VICCCCCCC => -0.01 error = 6.01 [T]
7 VIICCCCCC => -0.01 error = 7.01 [T]
8 VIIICCCCC => -0.01 error = 8.01 [T]
9 IXCCCCCCC => -0.01 error = 9.01 [T]
10 XCCCCCCCC => -0.01 error = 10.01 [T]
11 XICCCCCCC => -0.01 error = 11.01 
12 XIICCCCCC => -0.01 error = 12.01 
13 XIIICCCCC => -0.02 error = 13.02 
14 XIVCCCCCC => -0.02 error = 14.02 
15 XVCCCCCCC => -0.02 error = 15.02 [T]
16 XVICCCCCC => -0.02 error = 16.02 
17 XVIICCCCC => -0.02 error = 17.02 
18 XVIIICCCC => -0.02 error = 18.02 
19 XIXCCCCCC => -0.02 error = 19.02 
20 XXCCCCCCC => -0.02 error = 20.02 [T]
21 XXICCCCCC => -0.02 error = 21.02 [T]
22 XXIICCCCC => -0.02 error = 22.02 [T]
23 XXIIICCCC => -0.02 error = 23.02 [T]
24 XXIVCCCCC => -0.03 error = 24.03 [T]
25 XXVCCCCC

99.52142

In [None]:
EPOCHS=1000
BATCH_SIZE = int(len(trainSamples)/4)
print('Training with',len(trainSamples),'samples',EPOCHS,'epochs and batch_size=',BATCH_SIZE)
for x in range(10):
    H = model.fit(trainSamples, trainLabels, epochs=EPOCHS,verbose=0,batch_size=BATCH_SIZE)
    num_epochs += EPOCHS
    print("\n{}/10 Epochs: {} - loss={:6.3f}, loss improvement={:6.3f}".
          format(x, num_epochs,H.history['loss'][-1], H.history['loss'][0]-H.history['loss'][-1]))
    check_model()
print("Done")

Training with 100 samples 1000 epochs and batch_size= 25

0/10 Epochs: 2000 - loss=50.967, loss improvement=19.027
text => [predicted value] error=[error]
CLXVIIICC =>  166.86 error = 1.14
CXLIICCCC =>  142.45 error = 0.45
CIIICCCCC =>  100.33 error = 2.67
XCVIIICCC =>  86.30 error = 11.70
XVIICCCCC =>  23.68 error = 6.68
CXCVIICCC =>  194.34 error = 2.66
XCIIICCCC =>  89.48 error = 3.52
XVIIICCCC =>  26.87 error = 8.87
CIIICCCCC =>  100.33 error = 2.67
CLIVCCCCC =>  151.64 error = 2.36
Mean error = 6.932182

1/10 Epochs: 3000 - loss=50.549, loss improvement= 0.477
text => [predicted value] error=[error]
CXXXIIICC =>  132.33 error = 0.67
CXICCCCCC =>  110.89 error = 0.11
VICCCCCCC =>  6.02 error = 0.02
CLXXXVIII =>  188.03 error = 0.03
XCIICCCCC =>  92.27 error = 0.27
CXCICCCCC =>  189.82 error = 1.18
VICCCCCCC =>  6.02 error = 0.02
XXXVIIICC =>  38.49 error = 0.49
CLXXVCCCC =>  175.87 error = 0.87
IVCCCCCCC =>  20.07 error = 16.07
Mean error = 6.0975103

2/10 Epochs: 4000 - loss=51.07

In [None]:

#x = to_number('one hundred thirty nine      ')
#x = to_number('one hundred ninety nine     ')
x = to_number('XX')

#              xxxxxxxxxxxxxxxxxxxxxxxxxxxxx
x = np.expand_dims(x,axis=1)
x = np.expand_dims(x,axis=0)
model.predict(x)



array([[111.03359]], dtype=float32)

In [None]:
model.save('model_words2numbers.h5')