In [1]:
import numpy as np
import EvalScript.evalResult as evalResult
import heapq
import math

BASE = './Data/'
START = 'START'
STOP = 'STOP'

In [2]:
%config Completer.use_jedi = False

In [3]:
class UnlabelledData:
    def __init__(self, path):
        self.sentences = []
        with open(path, 'r', encoding='utf8') as f:
            current_sentence = []
            for line in f:
                line = line.strip()
                if line == '':
                    self.sentences.append(current_sentence)
                    current_sentence = []
                else:
                    current_sentence.append(line)
            if len(current_sentence):
                self.sentences.append(current_sentence)

In [4]:
class LabelledData:
    def __init__(self, path = None):
        self.sentences = []
        if path == None:
            return
        with open(path, 'r', encoding='utf8') as f:
            current_sentence = []
            for line in f:
                line = line.strip()
                if line == '':
                    self.sentences.append(current_sentence)
                    current_sentence = []
                else:
                    current_sentence.append(tuple(line.rsplit(maxsplit=1)))
            if len(current_sentence):
                self.sentences.append(current_sentence)
    
    def write_to_file(self, path):
        with open(path, 'w', encoding='utf8') as f:
            for sentence in self.sentences:
                for data in sentence:
                    print(*data, file=f)
                print(file=f)

In [5]:
START = 'START'
class LinearChainCRF:
    def __init__(self):
        self.labels = ['O', 'B-negative', 'I-negative', 'B-neutral', 'I-neutral', 'B-positive', 'I-positive', START]
        self.transition_weights = {}
        self.emission_weights = {}
        self.features = [self.transition_weights, self.emission_weights]
        self.k = 1
        self.state_counts = {START: 0}
    
    def learn(self, data: LabelledData, alpha=0.1):
        # linear chain crf model involves multiple feature functions
        
        transition_counts = {}
        
        emission_counts = {}

        # we model transition probabilities by MLE, Count(y1, y2) / Count(y1)
        for sentence in data.sentences:
            prev = START
            for x, y in sentence:
                self.state_counts.setdefault(y, 0)
                self.state_counts[y] += 1
                emission_counts.setdefault((y, x), 0)
                emission_counts[(y, x)] += 1
                if prev:
                    transition_counts.setdefault((prev, y), 0)
                    transition_counts[(prev, y)] += 1
                prev = y

            self.state_counts[START] += 1
        
        for (y1, y2), count in transition_counts.items():
            self.transition_weights[(y1, y2)] = np.log(count / self.state_counts[y1])
        
        for (y, x), count in emission_counts.items():
            self.emission_weights[(y, x)] = np.log(count / (self.state_counts[y] + self.k) )

        # training transition and emission features
        for sentence in data.sentences:
            prev = START
            for x, y in sentence:
                self.transition_weights.setdefault((prev, y), -math.inf)
                trans_gradient = alpha * (1 - self.transition_weights.get((prev, y)))
                self.transition_weights[(prev, y)] += trans_gradient
                
                self.emission_weights.setdefault((y, x), np.log(self.k / (self.state_counts.get(y) + self.k)) )
                emiss_gradient = alpha * (1 - self.emission_weights.get((y, x)))
                self.emission_weights[(y, x)] += emiss_gradient
    
        pass
    
    def viterbi(self, sentence):
        pi = {(-1, START): 0.0}
        n = len(sentence)

        for i in range(n):
            x = sentence[i]
            for y2 in self.labels:
                self.emission_weights.setdefault((y2, x), np.log(self.k / (self.state_counts.get(y2) + self.k)) )
                maximum = -math.inf
                for y1 in self.labels:
                    current_pi = pi.get((i-1, y1), -math.inf) + self.emission_weights.get((y2, x)) + self.transition_weights.get((y1, y2), -math.inf)
                    if current_pi > maximum:
                        maximum = current_pi

                pi[(i, y2)] = maximum
        
        final_states = []

        for i in range(n-1, -1, -1):
            maximum = -math.inf
            best_label = None
            for label in self.labels:
                if pi.get((i, label)) > maximum:
                    best_label = label
                    maximum = pi.get((i, label))

            final_states.append(best_label)
            

        return list(zip(sentence, final_states[::-1]))

    def label(self, data):
        labelled = LabelledData()
        
        for unlabelled_sentence in data.sentences:
            labelled.sentences.append(self.viterbi(unlabelled_sentence))
        return labelled



In [6]:
for dataset in ['RU', 'ES']:
    train = LabelledData(BASE + dataset + '/train')
    dev_in = UnlabelledData(BASE + dataset + '/dev.in')
    model = LinearChainCRF()
    model.learn(train)
    predicted = model.label(dev_in)
    print(model.transition_weights)
    predicted.write_to_file(BASE + dataset + '/dev.p4.out')
    
    print(f'{f" {dataset} ":=^30}')
    evalResult.evaluate(BASE + dataset + '/dev.out', BASE + dataset + '/dev.p4.out')
    print('='*30)

{('START', 'B-positive'): 0.9999999999999996, ('B-positive', 'O'): -0.21149917159973128, ('O', 'O'): -0.13415905270725983, ('O', 'B-positive'): -3.2904965636574257, ('B-positive', 'I-positive'): -1.6687844068692466, ('I-positive', 'O'): -0.5397146336181278, ('START', 'O'): 0.9999999999999996, ('I-positive', 'I-positive'): -0.8825287767646249, ('O', 'B-negative'): -4.674804826805687, ('B-negative', 'I-negative'): -1.6891053151679716, ('I-negative', 'I-negative'): -0.8712224464724488, ('I-negative', 'O'): -0.5543107357057294, ('START', 'B-neutral'): 0.999999998362583, ('B-neutral', 'O'): -0.14533298742461412, ('B-negative', 'O'): -0.2041803506064537, ('O', 'B-neutral'): -5.653871964824014, ('START', 'B-negative'): 0.9999999999999996, ('B-neutral', 'I-neutral'): -2.0005142830901654, ('I-neutral', 'I-neutral'): -0.5306282510621704, ('I-neutral', 'O'): -0.8873031950009028, ('B-positive', 'B-positive'): -6.428105272684596, ('I-positive', 'B-positive'): -6.391917113392602, ('START', 'I-positi