# **Week 14 Assignment 2**
  
Name: Tianrungroj Yossathorn
  
Student ID: 03200437

In [None]:
import numpy as np
import random
import time

## Data Preparation
This assignment uses data from the Penn Treebank, the same as in the week 10 assignment. The previously implemented data preparation has been modified so that words and parts of speech are integer label encoded.  
  
Summary:

- Train data size: 1460 sentences
- Test data size: 273 sentences

### Penn Treebank data preparation
  Using the class previously implemented for the Viterbi algorithm assignment.

In [None]:
# pos: part of speech
class PennTree:
    """Load POS-tagged sentences and build vocabulary and HMM tables.

    The given files are shuffled and split by file into a training part and
    a test part. Tag and word vocabularies are built from the training part
    only; words seen exactly once are left out of the vocabulary and are
    mapped to 'UNK' by `convertIfUNK`.

    Attributes set by the constructor:
        pos2idx / idx2pos: tag <-> integer index ('PAD' is 0, 'BOS' is 1).
        word2idx / idx2word: word <-> integer index
            ('PAD' is 0, 'UNK' is 1, 'NUM' is 2).
        wordCount: occurrences of every training word.
        train_data / test_data: one array of (word, tag) pairs per sentence.
        prob_pos2pos: tag-to-tag table, shape (num_pos, num_pos).
        prob_pos2word: tag-to-word table, shape (num_pos, num_word).
        num_word / num_pos: vocabulary sizes.
    """

    def __init__(self, filenames=None, train_ratio=0.8):
        """Read `filenames`, split them into train/test and build the tables.

        Args:
            filenames: iterable of paths to tagged files. `None` is treated
                as an empty corpus. The caller's list is not modified.
            train_ratio: fraction of the (shuffled) files used for training.
        """
        self.train_ratio = train_ratio
        # 'PAD' and 'BOS' need distinct indices so that
        # idx2pos[pos2idx[tag]] == tag holds for every tag.
        self.pos2idx = {'PAD': 0, 'BOS': 1}
        self.idx2pos = ['PAD', 'BOS']
        self.word2idx = {'PAD': 0, 'UNK': 1, 'NUM': 2}
        self.idx2word = ['PAD', 'UNK', 'NUM']
        self.wordCount = {}
        self.prob_pos2pos = None
        self.prob_pos2word = None
        self.train_data = []
        self.test_data = []
        # Kept for compatibility; this class never fills these four lists.
        self.train_x = []
        self.train_y = []
        self.test_x = []
        self.test_y = []
        # Shuffle a private copy: the same random draws are consumed, but the
        # caller's list keeps its order and a missing argument does not crash.
        filenames = [] if filenames is None else list(filenames)
        random.shuffle(filenames)
        self.initTable(filenames)
        self.num_word = len(self.idx2word)
        self.num_pos = len(self.idx2pos)

    def getSentences(self, filename):
        """Return the sentences of one tagged file as space-joined strings.

        A line containing '===' or two consecutive blank lines end the current
        sentence. Other non-blank lines are appended to the current sentence
        after removing a leading '[ ' and a trailing ' ]'.
        """
        with open(filename, 'r') as file:
            lines = file.readlines()
        sentences = []
        sentence = ""
        last = len(lines) - 1
        for i, line in enumerate(lines):
            double_blank = line == '\n' and i < last and lines[i + 1] == '\n'
            if '===' in line or double_blank:
                # Separator: flush whatever has been collected so far.
                if len(sentence) > 1:
                    sentences.append(sentence)
                    sentence = ""
            elif line == '\n':
                continue
            else:
                text = line
                # startswith/endswith are safe on strings that become empty.
                if text.startswith('['):
                    text = text[2:]  # drop the bracket and the space after it
                if text.endswith('\n'):
                    text = text[:-1]
                if text.endswith(']'):
                    text = text[:-2]  # drop the bracket and the space before it
                sentence += text + ' '
        if len(sentence) > 1:
            sentences.append(sentence)
        return sentences

    def initTable(self, filenames):
        """Fill train/test data, vocabularies and zeroed probability tables.

        The first int(train_ratio * len(filenames)) files are training files;
        the remaining ones are test files. Only training sentences contribute
        to the word counts and to the tag vocabulary.
        """
        num_train_file = int(self.train_ratio * len(filenames))
        for file_idx, filename in enumerate(filenames):
            is_train = file_idx < num_train_file
            for sentence in self.getSentences(filename):
                poses = self.extractPos(sentence)
                if not poses:
                    # Nothing tagged in this chunk; an empty sentence is
                    # useless for training and evaluation alike.
                    continue
                if not is_train:
                    self.test_data.append(np.array(poses))
                    continue
                self.train_data.append(np.array(poses))
                for word, pos in poses:
                    self.wordCount[word] = self.wordCount.get(word, 0) + 1
                    if pos not in self.pos2idx:
                        self.pos2idx[pos] = len(self.idx2pos)
                        self.idx2pos.append(pos)

        for word, count in self.wordCount.items():
            # Singletons become 'UNK'; reserved entries such as 'NUM' already
            # have an index and must not be appended a second time.
            if count == 1 or word in self.word2idx:
                continue
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
        self.prob_pos2pos = np.zeros([len(self.idx2pos), len(self.idx2pos)])
        # Rows are tags and columns are words, matching the indexing in train().
        self.prob_pos2word = np.zeros([len(self.idx2pos), len(self.idx2word)])

    def extractPos(self, sentence):
        """Split a sentence string into a list of (word, tag) tuples.

        Each space-separated token is cut at its last '/'. The word is
        lower-cased and replaced by 'NUM' when it parses as a float. Tokens
        without a '/' or without a tag after it are skipped.
        """
        poses = []
        for token in sentence.split(' '):
            if '/' not in token:
                continue
            word, _, pos = token.rpartition('/')
            if not pos:
                continue
            poses.append((self.convertIfFloat(word.lower()), pos))
        return poses

    def convertIfFloat(self, w):
        """Return 'NUM' if `w` can be parsed by float(), otherwise `w`."""
        try:
            float(w)
        except ValueError:
            return w
        return 'NUM'

    def convertIfUNK(self, word):
        """Return `word` if it is in the vocabulary, otherwise 'UNK'."""
        # word2idx has the same keys as idx2word but gives O(1) membership.
        return word if word in self.word2idx else 'UNK'

    def train(self):
        """Estimate tag-transition and tag-emission tables from train_data.

        Counts are accumulated in fresh matrices, so calling this method more
        than once yields the same result. Each row is divided by its sum;
        the 1e-15 term keeps rows without any count from dividing by zero.
        """
        print(f"Training with {str(len(self.train_data))} sentences.")
        start_time = time.time()
        pos2pos = np.zeros([len(self.idx2pos), len(self.idx2pos)])
        pos2word = np.zeros([len(self.idx2pos), len(self.idx2word)])
        for poses in self.train_data:
            prev = self.pos2idx['BOS']
            for word, cur_pos in poses:
                cur = self.pos2idx[cur_pos]
                pos2pos[prev][cur] += 1
                pos2word[cur][self.word2idx[self.convertIfUNK(word)]] += 1
                prev = cur

        self.prob_pos2pos = pos2pos / (np.sum(pos2pos, axis=1).reshape((-1, 1)) + 1e-15)
        self.prob_pos2word = pos2word / (np.sum(pos2word, axis=1).reshape((-1, 1)) + 1e-15)
        elapsed_time = time.time() - start_time
        print(f"Elapsed time {str(elapsed_time)}s")
        
        

In [None]:
# Fix the seed so the file shuffle inside PennTree is reproducible.
random.seed(1234)
# Tagged files wsj_0001.pos ... wsj_0199.pos
filenames = [
    f'/content/drive/MyDrive/UTokyo/3A/IntelligenceSystem/treebank/tagged/wsj_{i:04d}.pos'
    for i in range(1, 200)
]
P = PennTree(filenames=filenames, train_ratio=0.8)

### Integer Label Encoding

In [None]:
# Turn every (word, tag) sentence into two parallel lists of integer ids.
train_x, train_y, test_x, test_y = [], [], [], []
for dataset, xs, ys in ((P.train_data, train_x, train_y),
                        (P.test_data, test_x, test_y)):
  for tagged in dataset:
    words, tags = zip(*tagged)
    # Out-of-vocabulary words are encoded with the 'UNK' id.
    xs.append([P.word2idx[P.convertIfUNK(w)] for w in words])
    ys.append([P.pos2idx[t] for t in tags])

In [None]:
# Length of the longest training sentence; used as the padded length.
MAX_LEN = max(len(seq) for seq in train_x)
MAX_LEN

430

In [None]:
# Pad every sentence at the end ('post') to MAX_LEN so that all sequences
# have the same length before they are fed to the Keras RNN model.
from keras.preprocessing.sequence import pad_sequences

train_x, train_y, test_x, test_y = (
    pad_sequences(seqs, maxlen=MAX_LEN, padding='post')
    for seqs in (train_x, train_y, test_x, test_y)
)

### Onehot converter

In [None]:
def to_onehot(sequences, categories):
  """One-hot encode integer label sequences.

  Args:
    sequences: iterable of 1-D integer label sequences.
    categories: number of classes, i.e. the width of each one-hot row.

  Returns:
    A NumPy array holding one (len(sequence), categories) float matrix
    per input sequence.
  """
  # Row k of the identity matrix is the one-hot vector of label k.
  identity = np.eye(categories)
  encoded = [identity[np.array(labels)] for labels in sequences]
  return np.array(encoded)
 

## RNN model implementation training and discussion

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

### Approach 1
- Normal LSTM
- Integer Label Encoding for words
- padding with 'PAD': 0
  
Test accuracy: 90.74%

In [None]:
# Approach 1: embedding -> LSTM -> per-timestep dense layer -> softmax.
model = Sequential([
    Embedding(input_dim=len(P.idx2word), output_dim=64),
    LSTM(128, return_sequences=True),  # one output vector per timestep
    TimeDistributed(Dense(len(P.idx2pos))),
    Activation('softmax'),
])

model.compile(
    optimizer=Adam(0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          296384    
_________________________________________________________________
lstm (LSTM)                  (None, None, 128)         98816     
_________________________________________________________________
time_distributed (TimeDistri (None, None, 48)          6192      
_________________________________________________________________
activation (Activation)      (None, None, 48)          0         
Total params: 401,392
Trainable params: 401,392
Non-trainable params: 0
_________________________________________________________________


In [None]:
# One-hot targets: one class per entry of the tag vocabulary.
train_y_oh = to_onehot(train_y, categories=len(P.pos2idx))

In [None]:
# Train for 40 epochs, holding out 20% of the training data for validation.
model.fit(
    x=train_x,
    y=train_y_oh,
    batch_size=128,
    epochs=40,
    validation_split=0.2,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7fbf70725898>

In [None]:
# Evaluate on the held-out test sentences; scores[1] is the compiled metric.
test_y_oh = to_onehot(test_y, len(P.pos2idx))
scores = model.evaluate(x=test_x, y=test_y_oh)
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

accuracy: 90.73600769042969


### Approach2
Some articles I read used a bidirectional LSTM, so I tried it as well.
- Bidirectional LSTM
- Integer Label Encoding for words
- padding with 'PAD':0
  
Test accuracy: 94.30%

In [None]:
# Approach 2: embedding -> bidirectional LSTM -> per-timestep dense -> softmax.
model2 = Sequential()
# The input layer must be added to model2 itself (not to `model`) so that
# this network declares its own padded input length of MAX_LEN tokens.
model2.add(InputLayer(input_shape=(MAX_LEN, )))
model2.add(Embedding(input_dim=len(P.idx2word), output_dim=64))
# return_sequences=True keeps one output vector per timestep.
model2.add(Bidirectional(LSTM(128, return_sequences=True)))
model2.add(TimeDistributed(Dense(len(P.pos2idx))))
model2.add(Activation('softmax'))
 
model2.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
 
model2.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 64)          296384    
_________________________________________________________________
bidirectional_6 (Bidirection (None, None, 256)         197632    
_________________________________________________________________
time_distributed_7 (TimeDist (None, None, 48)          12336     
_________________________________________________________________
activation_7 (Activation)    (None, None, 48)          0         
Total params: 506,352
Trainable params: 506,352
Non-trainable params: 0
_________________________________________________________________


In [None]:
# One-hot targets: one class per entry of the tag vocabulary.
train_y_oh = to_onehot(train_y, categories=len(P.pos2idx))

In [None]:
# Train for 40 epochs, holding out 20% of the training data for validation.
model2.fit(
    x=train_x,
    y=train_y_oh,
    batch_size=128,
    epochs=40,
    validation_split=0.2,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7fbf651127f0>

In [None]:
# Evaluate on the held-out test sentences; scores[1] is the compiled metric.
test_y_oh = to_onehot(test_y, len(P.pos2idx))
scores = model2.evaluate(x=test_x, y=test_y_oh)
print(f"{model2.metrics_names[1]}: {scores[1] * 100}")

accuracy: 94.29678916931152


### Discussion
As shown in the sections above, the LSTM performs better than the Viterbi algorithm (around 86% accuracy). The reason is that the RNN structure takes the context of all previous words into account when making a prediction, whereas the Viterbi algorithm is based on an HMM, which considers only the adjacent state. 
   
Moreover, the bidirectional LSTM outperformed the normal LSTM by almost 4 percentage points. The reason is that, unlike the normal LSTM, which considers only the context of previous words, the bidirectional LSTM also considers the context of the words that come after, because it propagates through the time steps in both the forward and backward directions. 