In [355]:
import re

def load_data(file):
    res = []
    with open(file) as f:
        for line in f:
            if len(line.strip()) == 0:
                continue
            line = line.rstrip()
            for sentence in re.split('\.;', line):
                res.append(sentence.lower())
    return res

korpusPrus = load_data("../../NLP_Resources/dane_pozytywistyczne/korpus_prusa.txt")
korpusSienkiewicz = load_data("../../NLP_Resources/dane_pozytywistyczne/korpus_sienkiewicza.txt")
korpusOrzeszkowej = load_data("../../NLP_Resources/dane_pozytywistyczne/korpus_orzeszkowej.txt")

In [356]:
import collections 
import numpy as np 

countPrus = collections.defaultdict(int)
countSienkiewicz = collections.defaultdict(int)
countOrzeszkowej = collections.defaultdict(int)

def fill(dictionary, korpus):
    for line in korpus:
        for word in line.split():
            dictionary[word] += 1


In [357]:
fill(countPrus, korpusPrus)
fill(countSienkiewicz, korpusSienkiewicz)
fill(countOrzeszkowej, korpusOrzeszkowej)

prusSum = 0
sienkiewiczSum = 0
orzeszkowaSum = 0

def fillSum(dictionary):
    sumka = 0
    for word in dictionary:
        sumka += dictionary[word]
    return sumka

prusSum = fillSum(countPrus)
sienkiewiczSum = fillSum(countSienkiewicz)
orzeszkowaSum = fillSum(countOrzeszkowej)



In [358]:
sentenceCountPrus = collections.defaultdict(int)
sentenceCountSienkiewicz = collections.defaultdict(int)
sentenceCountOrzeszkowa = collections.defaultdict(int)

def fillSentence(dictionary, korpus):
    sumka = len(korpus)
    for line in korpus:
        dictionary[len(line.split())] += 1
    return sumka    
sumSentencePrus = fillSentence(sentenceCountPrus, korpusPrus)
sumSentenceSienkiewicz = fillSentence(sentenceCountSienkiewicz, korpusSienkiewicz)
sumSentenceOrzeszkowa = fillSentence(sentenceCountOrzeszkowa, korpusOrzeszkowej)


In [359]:
bigramPrus = collections.defaultdict(int)
bigramSienkiewicz = collections.defaultdict(int)
bigramOrzeszkowa = collections.defaultdict(int)

def fillBigram(dictionary, korpus):
    sumka = 0
    for line in korpus:
        for i in range(len(line.split()) - 1):
            w1, w2 = line [i:i+2]
            sumka += 1
            dictionary[w1+'#'+w2] += 1
    return sumka

bigramSumPrus = fillBigram(bigramPrus, korpusPrus)
bigramSumSienkiewicz = fillBigram(bigramSienkiewicz, korpusSienkiewicz)
bigramSumOrzeszkowa = fillBigram(bigramOrzeszkowa, korpusOrzeszkowej)

    

In [424]:
def NB(test):
    tab = [] 
    res = 0
    alfa = 0.01
    for line in test:
        res += np.log(sentenceCountPrus.get(len(line.split()), 1) / sumSentencePrus)
        for i in range(len(line.split()) - 1):
            w1, w2 = line[i:i+2]
            res += 2*np.log(bigramPrus.get(w1+'#'+w2, alfa) / bigramSumPrus)
        for word in line.split(): 
            if countPrus[word] != 0:
                res += np.log(countPrus[word] / prusSum)
            else:
                res += np.log(alfa / (prusSum + alfa*len(countPrus.keys())))
            #res += np.log(countPrus.get(word, 0.01) / prusSum)

    tab.append((res / 4, "P"))
    
    res = 0
    for line in test:
        res += 1 * np.log(sentenceCountOrzeszkowa.get(len(line.split()), 1) / sumSentenceOrzeszkowa)
        for i in range(len(line.split()) - 1):
            w1, w2 = line[i:i+2]
            res += 2*np.log(bigramOrzeszkowa.get(w1+'#'+w2, alfa) / bigramSumOrzeszkowa)
        for word in line.split():
            if countOrzeszkowej[word] != 0:
                res += np.log(countOrzeszkowej[word] / orzeszkowaSum)
            else:
                res += np.log(alfa / (orzeszkowaSum + alfa*len(countOrzeszkowej.keys()) ))
    tab.append((res / 4, "O"))    
    
    res = 0
    for line in test:
        res += np.log(sentenceCountSienkiewicz.get(len(line.split()), 1) / sumSentenceSienkiewicz)
        for i in range(len(line.split()) - 1):
            w1, w2 = line[i:i+2]
            res += 2*np.log(bigramSienkiewicz.get(w1+'#'+w2, alfa) / bigramSumSienkiewicz)
        for word in line.split():
            if countSienkiewicz[word] != 0:
                res += np.log(countSienkiewicz[word] / sienkiewiczSum)
            else:
                res += np.log(alfa / (sienkiewiczSum + alfa*len(countSienkiewicz.keys())))
    tab.append((res / 4, "S"))

    tab = sorted(tab, reverse=True)
    return tab

In [425]:
succesful = 0
tried = 0
for i in [''] + list(range(1,23,2)):
    test = load_data(f"../../NLP_Resources/dane_pozytywistyczne/testy1/test_orzeszkowej{i}.txt")
    res = NB(test)
    value, klasa = res[0]
    print(klasa)
    if klasa != "O":
        print(res)
    else:
        succesful += 1
    tried += 1

O
O
O
O
O
S
[(-5690.021100913766, 'S'), (-5722.049562050602, 'O'), (-5770.096227073962, 'P')]
S
[(-5609.473617001939, 'S'), (-5620.558637407791, 'O'), (-5643.282185956437, 'P')]
S
[(-5648.113809162898, 'S'), (-5673.83735581986, 'O'), (-5742.1969275967285, 'P')]
S
[(-5839.499166511268, 'S'), (-5872.662422996963, 'O'), (-5895.668031038219, 'P')]
O
O
O


In [426]:
for i in list(range(1,55,2)):
    test = load_data(f"../../NLP_Resources/dane_pozytywistyczne/testy1/test_sienkiewicza{i}.txt")
    res = NB(test)
    value, klasa = res[0]
    print(klasa)
    if klasa != "S":
        print(res)
    else:
        succesful += 1
    tried += 1

S
S
S
S
S
S
S
S
P
[(-5248.567025989496, 'P'), (-5334.627861674823, 'S'), (-5415.116921027384, 'O')]
S
P
[(-5257.055136317661, 'P'), (-5349.936919028845, 'S'), (-5423.751704477032, 'O')]
P
[(-5829.1779575433, 'P'), (-5857.093290082364, 'S'), (-5922.015564229545, 'O')]
S
S
P
[(-5112.853426263846, 'P'), (-5195.154600694583, 'S'), (-5215.71345169659, 'O')]
S
P
[(-5093.081312472214, 'P'), (-5120.3288725140455, 'S'), (-5186.289501746098, 'O')]
P
[(-4816.201290002637, 'P'), (-4927.014415061255, 'S'), (-4990.158271855596, 'O')]
S
P
[(-5218.192156827537, 'P'), (-5264.8894095945625, 'S'), (-5308.383827311888, 'O')]
S
P
[(-7044.025758081558, 'P'), (-7087.185217191468, 'S'), (-7119.675058503091, 'O')]
P
[(-4954.161636752762, 'P'), (-4998.370869154292, 'S'), (-5045.050799098045, 'O')]
P
[(-5003.230203377446, 'P'), (-5052.684869662245, 'S'), (-5102.85263683008, 'O')]
S
S
S


In [427]:
for i in list(range(0,42,2)):
    test = load_data(f"../../NLP_Resources/dane_pozytywistyczne/testy1/test_prusa{i}.txt")
    res = NB(test)
    value, klasa = res[0]
    print(klasa)
    if klasa != "P":
        print(res)
    else:
        succesful += 1
    tried += 1

P
P
P
P
P
P
P
P
P
P
P
P
P
P
P
P
P
P
P
P
P


In [428]:
print("skutecznosc: ", succesful / tried * 100)

skutecznosc:  76.66666666666667
