In [13]:
import numpy as np
import EvalScript.evalResult as evalResult
import heapq
import math

BASE = './Data/'
START = 'START'
STOP = 'STOP'
labels = ['O', 'B-negative', 'I-negative', 'B-neutral', 'I-neutral', 'B-positive', 'I-positive']


In [14]:
%config Completer.use_jedi = False

In [15]:
class UnlabelledData:
    def __init__(self, path):
        self.sentences = []
        with open(path, 'r', encoding='utf8') as f:
            current_sentence = []
            for line in f:
                line = line.strip()
                if line == '':
                    self.sentences.append(current_sentence)
                    current_sentence = []
                else:
                    current_sentence.append(line)
            if len(current_sentence):
                self.sentences.append(current_sentence)

In [16]:
class LabelledData:
    def __init__(self, labels, path = None, train=True):
        self.sentences = []
        self.label_to_idx = {labels[i]: i for i in range(len(labels))}
        self.word_to_idx = {"": 0} # for zero-padding variable lengths later

        if path == None:
            return
        with open(path, 'r', encoding='utf8') as f:
            current_sentence = []
            for line in f:
                line = line.strip()
                if line == '':
                    self.sentences.append(current_sentence)
                    current_sentence = []
                else:
                    current_sentence.append(tuple(line.rsplit(maxsplit=1)))
            if len(current_sentence):
                self.sentences.append(current_sentence)
        
        if train:
            self.data = self.get_training_data()
            #self.embeddings = np.random.randn(len(self.word_to_idx), 7)
            self.embeddings = self.get_embeddings(labels)
    
    def get_embeddings(self, labels, k=100):
        # attempt to get a good estimate on embeddings here using emission probabilities
        state_counts = {label: 0 for label in labels}
        word_counts = {}
        emission_counts = {}
        embeddings = np.zeros((len(self.word_to_idx), len(labels)))
        
        for sentence in self.sentences:
            for word, label in sentence:
                word = word.lower()
                emission_counts.setdefault((label, word), 0)
                word_counts.setdefault(word, 0)
                
                emission_counts[(label, word)] += 1
                word_counts[word] += 1
                state_counts[label] += 1
        
        for word, word_idx in self.word_to_idx.items():
            for label, label_idx in self.label_to_idx.items():
                state_count = state_counts[label]
                word_count = word_counts.get(word, k)
                emission_count = emission_counts.get((label, word), k)
                
                embeddings[word_idx][label_idx] = np.exp(np.log(emission_count) - np.log(word_count + k)) 
        
        return embeddings



    
    def get_training_data(self):
        seq = []
        labels = []
        current_idx = 0
        training_data = []
        for sentence in self.sentences:
            for word, label in sentence:
                # make everything lowercase
                word = word.lower()
                if word not in self.word_to_idx:
                    self.word_to_idx[word] = current_idx
                    current_idx += 1

                seq.append(self.word_to_idx[word])
                labels.append(self.label_to_idx[label])
            
            training_data.append((seq.copy(), labels.copy()))
            seq = []
            labels = []

        return training_data 
    
    def write_to_file(self, path):
        with open(path, 'w', encoding='utf8') as f:
            for sentence in self.sentences:
                for data in sentence:
                    print(*data, file=f)
                print(file=f)

In [17]:
class RNN:
    def __init__(self, data: LabelledData, hidden_dim=4, learning_rate=0.05, num_epochs=10):
        self.word2idx = data.word_to_idx
        self.label2idx = data.label_to_idx
        self.embeddings = data.embeddings
        self.data = data.data
        self.labels = list(self.label2idx.keys())
        
        # dimensions to keep track
        self.vocab_size, self.embedding_dim = self.embeddings.shape
        self.output_dim = len(self.label2idx)
        self.hidden_dim = hidden_dim
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.num_samples = len(self.data)
        
        # weights
        self.wxh = np.random.rand(self.embedding_dim, self.hidden_dim)
        self.whh = np.random.rand(self.hidden_dim, self.hidden_dim) 
        self.why = np.random.rand(self.hidden_dim, self.output_dim) 
        self.bh = np.random.rand(self.hidden_dim) 
        self.by = np.random.rand(self.output_dim) 
    
    def pad_sequences(self):
        max_sequence_len = 0
        for sequence, label in self.data:
            curr_len = len(sequence)
            if curr_len > max_sequence_len:
                max_sequence_len = curr_len
        
        for i in range(len(self.data)):
            self.data[i] = ((self.data[i][0] + [0] * (max_sequence_len - len(self.data[i][0]))), self.data[i][1])
        
        return max_sequence_len
    
    def forward(self, sequence, sequence_length):
        embedded = self.embeddings[sequence]
        hidden_states = np.zeros((sequence_length, self.hidden_dim))

        for t in range(sequence_length):
            if t == 0:
                hidden_states[t] = np.tanh(np.dot(embedded[t], self.wxh) + self.bh)
            else:
                hidden_states[t] = np.tanh(np.dot(embedded[t], self.wxh) + np.dot(hidden_states[t-1], self.whh) + self.bh)
            
        output = np.dot(hidden_states, self.why) + self.by
        return self.softmax(output), hidden_states

    def softmax(self, x):
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        output = e_x / e_x.sum(axis=-1, keepdims=True)
        return output
        
    
    def train(self):

        for epoch in range(self.num_epochs):
            total_loss = 0
            
            for sequence, label in self.data:
                sequence_length = len(sequence)
                output, hidden_states = self.forward(np.array(sequence), sequence_length)
                label = np.array([np.eye(self.output_dim)[label] for label in label])
                
                episilon = 1e-10
                output = np.clip(output, episilon, 1 - episilon)
                loss = -np.sum(label * np.log(output))
                total_loss += loss
                
                gradient_output = (output - label) 
                gradient_hidden_states = np.dot(gradient_output, self.why.T)
                gradient_why = np.dot(hidden_states.T, gradient_output)
                gradient_by = np.sum(gradient_output, axis=0)
                
                gradient_bh = np.zeros_like(self.bh)
                gradient_wxh = np.zeros_like(self.wxh)
                gradient_whh = np.zeros_like(self.whh)
                
                for t in range(sequence_length - 1, -1, -1):
                    gradient_bh += gradient_hidden_states[t] * (1 - hidden_states[t]**2)
                    gradient_wxh += np.outer(self.embeddings[sequence[t]], gradient_hidden_states[t] * (1 - hidden_states[t]**2))
                
                    if t > 0:
                        gradient_whh += np.outer(hidden_states[t - 1], gradient_hidden_states[t] * (1 - hidden_states[t]**2))
                        gradient_hidden_states[t - 1] += np.dot(gradient_hidden_states[t] * (1 - hidden_states[t]**2), self.whh.T)

                max_gradient_value = 5.0  
                gradient_wxh = np.clip(gradient_wxh, -max_gradient_value, max_gradient_value)
                gradient_whh = np.clip(gradient_whh, -max_gradient_value, max_gradient_value)
                gradient_by = np.clip(gradient_by, -max_gradient_value, max_gradient_value)
                gradient_bh = np.clip(gradient_bh, -max_gradient_value, max_gradient_value)
                gradient_why = np.clip(gradient_why, -max_gradient_value, max_gradient_value)

                self.why -= self.learning_rate * gradient_why
                self.by -= self.learning_rate * gradient_by
                self.wxh -= self.learning_rate * gradient_wxh
                self.whh -= self.learning_rate * gradient_whh
                self.bh -= self.learning_rate * gradient_bh
                    
                    
            avg_loss = total_loss / len(self.data)
            print(f"Epoch {epoch+1}/{self.num_epochs}, Loss: {avg_loss:.4f}")
    
    def predict(self, data):
        res = LabelledData(self.labels, train=False)

        for sentence in data.sentences:
            sequence = []
            for word in sentence:
                embedded = self.word2idx.get(word.lower())
                sequence.append(embedded) if embedded else sequence.append(0)
            
            output = self.forward(sequence, len(sequence))[0]
            labels = []
            for prediction in output:
                labels.append(np.argmax(prediction))
            
            labels = [self.labels[label] for label in labels]
            
            res.sentences.append([(word, label) for word, label in zip(sentence, labels)])
        
        return res

            

In [18]:
for dataset in ['ES', 'RU']:
    train = LabelledData(labels=labels, path=BASE + dataset + '/train')
    dev_in = UnlabelledData(BASE + dataset + '/dev.in')
    #predicted.write_to_file(BASE + dataset + '/dev.p4.out')
    
    model = RNN(train)
    model.train()
    predicted = model.predict(dev_in)
    predicted.write_to_file(BASE + dataset + '/dev.p4.out')
    print(f'{f" {dataset} ":=^30}')
    evalResult.evaluate(BASE + dataset + '/dev.out', BASE + dataset + '/dev.p4.out')
    print('='*30)

Epoch 1/10, Loss: 4.6531
Epoch 2/10, Loss: 3.8594
Epoch 3/10, Loss: 3.4757
Epoch 4/10, Loss: 3.3336
Epoch 5/10, Loss: 3.2462
Epoch 6/10, Loss: 3.1798
Epoch 7/10, Loss: 3.1261
Epoch 8/10, Loss: 3.0879
Epoch 9/10, Loss: 3.0625
Epoch 10/10, Loss: 3.0429
#Entity in gold data: 229
#Entity in prediction: 344

#Correct Entity : 164
Entity  precision: 0.4767
Entity  recall: 0.7162
Entity  F: 0.5724

#Correct Sentiment : 125
Sentiment  precision: 0.3634
Sentiment  recall: 0.5459
Sentiment  F: 0.4363
Epoch 1/10, Loss: 3.8368
Epoch 2/10, Loss: 2.7203
Epoch 3/10, Loss: 2.4646
Epoch 4/10, Loss: 2.3620
Epoch 5/10, Loss: 2.3117
Epoch 6/10, Loss: 2.2542
Epoch 7/10, Loss: 2.2185
Epoch 8/10, Loss: 2.1888
Epoch 9/10, Loss: 2.1748
Epoch 10/10, Loss: 2.1616
#Entity in gold data: 389
#Entity in prediction: 374

#Correct Entity : 124
Entity  precision: 0.3316
Entity  recall: 0.3188
Entity  F: 0.3250

#Correct Sentiment : 87
Sentiment  precision: 0.2326
Sentiment  recall: 0.2237
Sentiment  F: 0.2280
