In [4]:
from unidecode import unidecode
from collections import defaultdict
from math import sqrt

In [5]:
def normalize(text):
    """Return ``text`` lower-cased and then passed through ``unidecode``.

    This is the form used as the lookup key for candidate spellings.
    """
    lowered = text.lower()
    return unidecode(lowered)

def strip_non_alphanum(text):
    """Remove every character that is neither a space nor alphanumeric.

    Trailing spaces are trimmed from the result; leading and inner spaces
    are kept.
    """
    kept = [ch for ch in text if ch.isalnum() or ch == ' ']
    return ''.join(kept).rstrip(' ')

In [6]:
def load_bigrams_norm(filename, k=10):
    """Read a bigram-count file and build the two lookup tables used later.

    Each non-blank line of the file must have the form
    ``<count> <word1> <word2>`` with the three fields separated by single
    spaces.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded bigram file.
    k : int, optional
        Minimum count; lines whose count is below ``k`` are ignored.

    Returns
    -------
    tuple
        ``(bigrams, normalized_bigrams)`` where

        * ``bigrams[word1][word2]`` is the count read from the file, and
        * ``normalized_bigrams[normalize(word)][strip_non_alphanum(word)]``
          is the number of kept lines in which that spelling occurred
          (as either the first or the second word).

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly three fields or its
        count is not an integer.
    """
    bigrams = defaultdict(dict)
    normalized_bigrams = defaultdict(lambda: defaultdict(int))
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip('\n')

            # A blank line (typically the last one in the file) carries no
            # bigram; skip it instead of failing on the unpack below.
            if not line:
                continue

            n, word1, word2 = line.split(' ')
            n = int(n)

            if n < k:
                continue

            bigrams[word1][word2] = n

            # Register both words as possible spellings of their normalized form.
            for word in (word1, word2):
                normalized_bigrams[normalize(word)][strip_non_alphanum(word)] += 1
    return bigrams, normalized_bigrams

In [7]:
# Load the bigram tables. With k=0 no bigram is dropped for being too rare.
# Alternative corpus, keeping only bigrams with a count of at least 5:
# bigrams, normalized_bigrams = load_bigrams_norm('./data/poleval_2grams.txt', 5)
bigrams, normalized_bigrams = load_bigrams_norm('./data/teaching_2grams.txt', 0)

In [8]:
def bigram_to_unigram(bigram):
    """Collapse a nested bigram-count table into per-word totals.

    Every bigram count is credited to both of its words, so a word's total
    is the sum of the counts of all bigrams in which it appears first plus
    all bigrams in which it appears second.

    Returns a ``defaultdict`` whose missing keys read as 0.
    """
    totals = defaultdict(int)
    for first, followers in bigram.items():
        for second, occurrences in followers.items():
            totals[first] += occurrences
            totals[second] += occurrences
    return totals

In [9]:
# Per-word frequency table derived from the bigram counts loaded above.
unigrams = bigram_to_unigram(bigrams)

In [10]:
def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list."""
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat


def get_word_alts(word, normalized_bigrams):
    """Return the candidate spellings recorded for ``word``.

    When ``word`` has no entry in ``normalized_bigrams`` the word itself is
    returned as its only candidate (``{word: 1}``). The membership test is
    used on purpose so that a ``defaultdict`` table is not grown by lookups.
    """
    if word not in normalized_bigrams:
        return {word: 1}
    return normalized_bigrams[word]


def get_alts(words, normalized_bigrams):
    """Return, for each word in ``words``, its candidate spellings.

    The result is a list with one ``get_word_alts`` result per input word,
    in the same order.
    """
    return [get_word_alts(word, normalized_bigrams) for word in words]


def best_predecessor_score(word, predecessors, bigrams):
    """Find the predecessor that forms the most frequent bigram with ``word``.

    Returns ``(count, predecessor)`` for the best ``predecessor -> word``
    bigram among ``predecessors``, or ``(-1, None)`` when none of them is
    followed by ``word`` in ``bigrams``. Pairs are compared as tuples, so
    equal counts are resolved by the greater predecessor string.
    """
    scored = [
        (bigrams[candidate][word], candidate)
        for candidate in predecessors
        if candidate in bigrams and word in bigrams[candidate]
    ]
    # The sentinel goes first so the comparison order matches a running max.
    return max([(-1, None)] + scored)


def best_successor_score(word, successors, bigrams):
    """Find the successor that forms the most frequent bigram with ``word``.

    Returns ``(count, successor)`` for the best ``word -> successor`` bigram
    among ``successors``, or ``(-1, None)`` when ``word`` has no entry in
    ``bigrams`` or none of the successors follows it. Pairs are compared as
    tuples, so equal counts are resolved by the greater successor string.
    """
    nothing_found = (-1, None)
    if word not in bigrams:
        return nothing_found

    followers = bigrams[word]
    scored = [
        (followers[candidate], candidate)
        for candidate in successors
        if candidate in followers
    ]
    # The sentinel goes first so the comparison order matches a running max.
    return max([nothing_found] + scored)

def best_unigram_score(words, unigrams):
    """Pick the word from ``words`` with the highest unigram score.

    Returns ``(score, word)``, or ``(-1, None)`` when ``words`` is empty.
    Pairs are compared as tuples, so equal scores are resolved by the
    greater word string.
    """
    scored = [(unigram_score(candidate, unigrams), candidate) for candidate in words]
    # The sentinel goes first so the comparison order matches a running max.
    return max([(-1, None)] + scored)


def unigram_score(word, unigrams):
    """Return the count stored for ``word`` in ``unigrams``, or 0 if absent.

    ``get`` is used so that a ``defaultdict`` table is not grown by lookups.
    """
    return unigrams.get(word, 0)

In [11]:
from random import choice


def restore_diactrics(sentence, bigrams, normalized_bigrams, unigrams):
    """Rebuild a sentence by choosing one candidate spelling per token.

    ``sentence`` is split on single spaces and every token is replaced by
    its candidate spellings (see ``get_alts``). For each candidate at
    position ``i`` the function records one predecessor from position
    ``i - 1``:

    * the predecessor with the highest bigram count in front of the
      candidate, scored as that count plus the predecessor's unigram score;
    * or, when no predecessor forms a known bigram with the candidate, the
      predecessor with the highest unigram score (scored as that value).

    The highest-scoring candidate of the last position is selected and the
    recorded predecessors are followed back to the first token. Scores are
    not accumulated across positions.

    Returns the chosen words joined by single spaces, with the first word
    passed through ``str.capitalize`` and a trailing ``'.'`` appended.
    """
    candidates = get_alts(sentence.split(' '), normalized_bigrams)

    # One dict per token position: candidate -> score / chosen predecessor.
    layer_scores = [dict.fromkeys(candidates[0], 1)]
    back_pointers = [dict.fromkeys(candidates[0], '<START>')]

    for previous_forms, current_forms in zip(candidates, candidates[1:]):
        current_scores = {}
        current_back = {}
        for form in current_forms:
            points, chosen = best_predecessor_score(form, previous_forms, bigrams)
            if chosen is not None:
                points += unigram_score(chosen, unigrams)
            else:
                # No known bigram ends in this form: fall back to frequency.
                points, chosen = best_unigram_score(previous_forms, unigrams)
            current_scores[form] = points
            current_back[form] = chosen
        layer_scores.append(current_scores)
        back_pointers.append(current_back)

    # Tuples compare by score first, then by the spelling itself.
    _, final_form = max((points, form) for form, points in layer_scores[-1].items())

    # Walk the back pointers from the last position down to position 1.
    path = [final_form]
    for layer in reversed(back_pointers[1:]):
        path.append(layer[path[-1]])
    path.reverse()

    # NOTE(review): str.capitalize() also lower-cases the rest of the first
    # word, so an all-caps first token comes out as "Xxx" - confirm intended.
    return ' '.join([path[0].capitalize()] + path[1:]) + '.'


In [12]:
# Smoke test: normalize a known sentence, then try to restore the original
# spelling; the last expression is displayed as the cell output.
sentence = "Uderzył go w żebra"
normalized_sentence = normalize(sentence)
restore_diactrics(normalized_sentence, bigrams, normalized_bigrams, unigrams)

'Uderzył go w żebra.'

In [13]:
def get_correctness(original, attempt):
    """Count the positions at which ``original`` and ``attempt`` agree.

    Parameters
    ----------
    original, attempt : sequence
        Token sequences to compare position by position.

    Returns
    -------
    int
        Number of indices ``i`` with ``original[i] == attempt[i]``.

    Only the common prefix length is compared, so a shorter ``attempt``
    simply earns no credit for the missing positions instead of raising
    ``IndexError``.
    """
    return sum(1 for expected, produced in zip(original, attempt) if expected == produced)


def diacritic_correctness(original, attempt):
    """Count matching words while ignoring letter case.

    Both sentences have trailing ``'.'`` characters removed, are lower-cased
    and split on single spaces before being compared position by position.
    """
    reference_tokens, attempt_tokens = (
        text.rstrip('.').lower().split(' ') for text in (original, attempt)
    )
    return get_correctness(reference_tokens, attempt_tokens)


def full_correctness(original, attempt):
    """Count matching words, treating letter case as significant.

    Both sentences have trailing ``'.'`` characters removed and are split on
    single spaces before being compared position by position.
    """
    reference_tokens, attempt_tokens = (
        text.rstrip('.').split(' ') for text in (original, attempt)
    )
    return get_correctness(reference_tokens, attempt_tokens)

In [14]:
def check_score(filename):
    """Evaluate ``restore_diactrics`` on a file of reference sentences.

    Every non-empty line of ``filename`` (UTF-8, one sentence per line) is
    normalized, restored using the module-level ``bigrams``,
    ``normalized_bigrams`` and ``unigrams`` tables, and compared word by
    word with the reference line.

    The printed score is the square root of the product of two ratios: the
    fraction of words correct when case is ignored and the fraction of words
    correct including case. When the file contains no words the score is
    reported as 0.0.

    Parameters
    ----------
    filename : str
        Path of the reference sentence file.
    """
    diacritically_correct = 0
    fully_correct = 0
    total = 0
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip('.\n')

            # A blank line would otherwise be counted as one empty word that
            # trivially matches itself and inflates both ratios.
            if not line:
                continue

            normalized = normalize(line)
            attempt = restore_diactrics(normalized, bigrams, normalized_bigrams, unigrams)
            total += len(normalized.split(' '))
            diacritically_correct += diacritic_correctness(line, attempt)
            fully_correct += full_correctness(line, attempt)

    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        score = 0.0
    else:
        score = sqrt(diacritically_correct / total * fully_correct / total)
    print(score)

In [15]:
# Evaluate on the test-set file; check_score prints the resulting score.
check_score('./data/test_set.txt')

0.9340711392609874
