In [403]:
import numpy as np 
from collections import defaultdict
import random
import statistics as st
import itertools as iter

learningSet = []
validationSet = []

with open('../../NLP_Resources/polish_corpora.txt') as f:
    lineCounter = 0
    learningSetSize = 1000000
    validationSetSize = 200000
    for line in f:
        if lineCounter < learningSetSize:
            learningSet.append(line)
        else:
            validationSet.append(line)

        lineCounter += 1
        if lineCounter > (learningSetSize + validationSetSize):
            break



In [404]:
def deletePolishCharacters(word):
    polishToEnglish = {'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n', 'ó': 'o', 'ś': 's', 'ż': 'z', 'ź': 'z'}

    word = word.lower()

    resultWord = ''
    
    for character in word:
        resultWord += polishToEnglish.get(character, character)

    return resultWord

In [522]:
#Setting up most of the data structures that I am going to need

wordUpperVsLower = defaultdict(lambda: [0,0]) # upper, lower
cleanReverseOptions = defaultdict(lambda: defaultdict(int)) # dict of words -> probability
bigramUpperVsLower = defaultdict(lambda: [0,0]) # upper, lower
bigramSuffix = defaultdict(int)

for line in learningSet:
    line = line.split()
    wordIndexInLine = 0
    
    #unigram statistics
    for word in line:
        wordCleaned = deletePolishCharacters(word)

        if wordCleaned == 'sie':
            cleanReverseOptions[wordCleaned]['się'] +=1
        else:
            cleanReverseOptions[wordCleaned][word.lower()] += 1
        

        #capital letters
        if word[0].islower():
            wordUpperVsLower[wordCleaned][1] +=1
        if word[0].isupper() and wordIndexInLine != 0:
            wordUpperVsLower[wordCleaned][0] +=1


        wordIndexInLine += 1
        
    #bigram statistics
    for i in range(len(line)-1):
        if i == 0:
            continue

        word1 = line[i]
        word2 = line[i+1]

        hash = word1.lower()[-3:] + '#' + word2.lower()[-3:]

        bigramSuffix[hash] +=1
        
        word1Cleaned = deletePolishCharacters(word1)
        word2Cleaned = deletePolishCharacters(word2)
        
        hash = word1Cleaned + '#' + word2Cleaned

        if word1[0].isupper() and word2[0].isupper():
            bigramUpperVsLower[hash][0] += 1
        else:
            bigramUpperVsLower[hash][1] += 1





In [523]:
for key in wordUpperVsLower:
    a, b = wordUpperVsLower[key]
    if a < b and a / b < 0.07:
        wordUpperVsLower[key][0] = 0

for key in cleanReverseOptions:
    options = []
    for word in cleanReverseOptions[key]:
        if cleanReverseOptions[key][word] > 0:
            options.append((cleanReverseOptions[key][word], word))
    options = sorted(options, reverse=True)
    if len(options) == 0:
        continue
    maks = options[0][0]
    for option in options:
        if option[0]/maks < 0.03:
            cleanReverseOptions[key].pop(option[1])




In [535]:
with open('../../NLP_Resources/poleval_2grams.txt') as f:
    for line in f:
        v, w1, w2 = line.split()
        w1Cleaned = deletePolishCharacters(w1)
        w2Cleaned = deletePolishCharacters(w2)

        sz = len(cleanReverseOptions[w1Cleaned])
        if sz == 0:
            cleanReverseOptions[w1Cleaned][w1] = 1
        if sz <= 3:
            temp = 0
            for key in cleanReverseOptions[w1Cleaned]:
                    temp += cleanReverseOptions[w1Cleaned][key]
            if temp == 0:
                cleanReverseOptions[w1Cleaned][w1] = 1
        
        sz = len(cleanReverseOptions[w2Cleaned])
        if sz == 0:
            cleanReverseOptions[w2Cleaned][w2] = 1
        if sz <= 3:
            temp = 0
            for key in cleanReverseOptions[w2Cleaned]:
                    temp += cleanReverseOptions[w2Cleaned][key]
            if temp == 0:
                cleanReverseOptions[w2Cleaned][w2] = 1


In [536]:
def selectWordFromUnigrams(wordCleaned):
    potentialWords = []
    probabilities = []
    for candidate in cleanReverseOptions[wordCleaned]:
        potentialWords.append(candidate)
        probabilities.append(cleanReverseOptions[wordCleaned][candidate])

    if len(potentialWords) < 1:
        return wordCleaned
    probabilities = np.array(probabilities)
    potentialWords = np.array(potentialWords)
    sum = np.sum(probabilities)

    selectedWord = np.random.choice(potentialWords, p= probabilities/sum)
    return selectedWord

def capitalizeUnigram(selectedWord):
    probs = np.array(wordUpperVsLower.get(selectedWord, [0,0]))
    options = np.array([True, False])
    sum = np.sum(probs)
    if sum == 0:
        return False
    decision = np.random.choice(options, p = probs/sum)
    return decision
    
def selectWordsFromBigrams(word1Cleaned, word2Cleaned):
    options1, options2 = ([],[])
    
    for word in cleanReverseOptions[word1Cleaned]:
        options1.append(word)
    
    for word in cleanReverseOptions[word2Cleaned]:
        options2.append(word)

    if len(cleanReverseOptions[word2Cleaned]) == 0:
        options2.append(word2Cleaned)

    resultCandidates = []
    
    for word1 in options1:
        for word2 in options2:
            h = word1[-3:] + '#' + word2[-3:]
            resultCandidates.append((bigramSuffix[h], (word1, word2)))
    
    resultCandidates = sorted(resultCandidates, reverse = True)

    if len(resultCandidates) == 0:
        return ('-1', '-1')
        #return (word1Cleaned, word2Cleaned)

    return resultCandidates[0][1]

    

In [544]:

score = 0
wordCount = 0
correctWordsCount = 0
correctWordsCountWithCapitalization = 0

for line in validationSet:
    line = line.split()
    resultLine = []
    skip = False

    for i in range(len(line)):
        word = line[i]
        wordCleaned = deletePolishCharacters(word)
        if i == 0:
            selectedWord = selectWordFromUnigrams(wordCleaned)
        else:
            previousWord = deletePolishCharacters(resultLine[i-1])
            word2 = wordCleaned

            selectedWord = selectWordsFromBigrams(previousWord, word2)[1]
            if selectedWord == '-1':
                selectedWord = selectWordFromUnigrams(wordCleaned)

            # print(selectedWord)
        
        resultLine.append(selectedWord)


    #Handle capital letters
    for i in range(len(line)):
        if skip:
            skip = False
            continue

        word1 = resultLine[i]
        word1Cleaned = deletePolishCharacters(word1)
        if i != len(line)-1:
            #handle bigrams
            word2 = resultLine[i+1]

            hash = word1Cleaned + '#' + deletePolishCharacters(word2)

            if bigramUpperVsLower[hash][1] > 0 and bigramUpperVsLower[hash][1] == 0:
                resultLine[i] = resultLine[i].capitalize()
                resultLine[i+1] = resultLine[i+1].capitalize()
                skip = True
                continue

        if capitalizeUnigram(word1Cleaned):
            resultLine[i] = resultLine[i].capitalize()

        if i == 0:
            resultLine[i] = resultLine[i].capitalize()

   
    wordCount += len(line)

    wrote = False
    for i in range(len(line)):

        if line[i].lower() == resultLine[i].lower():
            correctWordsCount +=1
            if line[i][0] == resultLine[i][0]:
                correctWordsCountWithCapitalization += 1

dokladnoscPolskawa = correctWordsCount / wordCount
dokladnoscPelna = correctWordsCountWithCapitalization / wordCount
score = np.sqrt(dokladnoscPelna * dokladnoscPolskawa)

In [545]:
print(score)

0.9577681695088869
