## Validation

In [1]:
from naive_bayes_v2 import naive_bayes, AuthorshipClass
from file_utils import count_words_in_file, count_sentence_lengths_in_file, get_sentences_from_text

# File-name suffixes of the three author corpora. The validation helpers
# further down reuse them to build the validation file paths.
orzeszkowa_suffix = 'korpus_orzeszkowej.txt'
prus_suffix = 'korpus_prusa.txt'
sienkiewicz_suffix = 'korpus_sienkiewicza.txt'

# Teaching files are named '<teaching_set_prefix>_<suffix>'.
teaching_set_prefix = './dane/teaching'
orzeszkowa_teaching_file = teaching_set_prefix + '_' + orzeszkowa_suffix
prus_teaching_file = teaching_set_prefix + '_' + prus_suffix
sienkiewicz_teaching_file = teaching_set_prefix + '_' + sienkiewicz_suffix

# Word statistics and sentence-length statistics for each teaching file.
# Tuple elements are evaluated left to right, so the files are processed in
# the order: Orzeszkowa, Prus, Sienkiewicz (words first, then lengths).
orzeszkowa_words, orzeszkowa_lengths = (
    count_words_in_file(orzeszkowa_teaching_file),
    count_sentence_lengths_in_file(orzeszkowa_teaching_file),
)
prus_words, prus_lengths = (
    count_words_in_file(prus_teaching_file),
    count_sentence_lengths_in_file(prus_teaching_file),
)
sienkiewicz_words, sienkiewicz_lengths = (
    count_words_in_file(sienkiewicz_teaching_file),
    count_sentence_lengths_in_file(sienkiewicz_teaching_file),
)

# One AuthorshipClass per author, identified by a one-letter symbol.
orzeszkowa = AuthorshipClass('o', orzeszkowa_words, orzeszkowa_lengths)
prus = AuthorshipClass('p', prus_words, prus_lengths)
sienkiewicz = AuthorshipClass('s', sienkiewicz_words, sienkiewicz_lengths)


In [6]:
def validate(expected_symbol, filename):
    """Count how many sentences of a file are attributed to the expected author.

    The file is read as UTF-8 and split into sentences with
    ``get_sentences_from_text``. Each sentence is split on whitespace and
    passed to ``naive_bayes`` together with the three author classes.

    A sentence counts as correct when the first field of the first entry
    returned by ``naive_bayes`` equals ``expected_symbol``.
    NOTE(review): presumably that entry is the best-scoring class -- confirm
    against ``naive_bayes_v2``.

    Args:
        expected_symbol: Symbol of the author the file is known to belong to.
        filename: Path of the text file to classify.

    Returns:
        A ``(correct, total)`` tuple: the number of matching sentences and
        the number of sentences examined.
    """
    with open(filename, 'r', encoding='utf-8') as corpus:
        text = corpus.read()

    hits = 0
    examined = 0
    for sentence in get_sentences_from_text(text):
        examined += 1
        # A fresh candidate list per call, exactly as many lists as sentences.
        ranking = naive_bayes(sentence.split(), [orzeszkowa, prus, sienkiewicz])
        top_symbol = ranking[0][0]
        if top_symbol == expected_symbol:
            hits += 1
    return hits, examined


def validate_orzeszkowa():
    """Return the fraction of Orzeszkowa validation sentences counted as correct.

    Runs ``validate`` on './dane/validation_<orzeszkowa_suffix>' with
    Orzeszkowa's symbol and divides the correct count by the total.
    """
    validation_path = f'./dane/validation_{orzeszkowa_suffix}'
    hits, examined = validate(orzeszkowa.symbol, validation_path)
    return hits / examined


def validate_prus():
    """Return the fraction of Prus validation sentences counted as correct.

    Runs ``validate`` on './dane/validation_<prus_suffix>' with Prus's symbol
    and divides the correct count by the total.
    """
    validation_path = f'./dane/validation_{prus_suffix}'
    hits, examined = validate(prus.symbol, validation_path)
    return hits / examined


def validate_sienkiewicz():
    """Return the fraction of Sienkiewicz validation sentences counted as correct.

    Runs ``validate`` on './dane/validation_<sienkiewicz_suffix>' with
    Sienkiewicz's symbol and divides the correct count by the total.
    """
    validation_path = f'./dane/validation_{sienkiewicz_suffix}'
    hits, examined = validate(sienkiewicz.symbol, validation_path)
    return hits / examined

In [7]:
# Share of Orzeszkowa validation sentences that validate() counted as correct.
print('orzeszkowa')
print(validate_orzeszkowa())

orzeszkowa
0.4711033274956217


In [8]:
# Share of Prus validation sentences that validate() counted as correct.
print('prus')
print(validate_prus())

prus
0.4949685534591195


In [9]:
# Share of Sienkiewicz validation sentences that validate() counted as correct.
print('sienkiewicz')
print(validate_sienkiewicz())

sienkiewicz
0.7849462365591398


## Tests

In [33]:
def test_authorship(filenames, expected_symbol):
    correct = 0
    total = 0
    for filename in filenames:
            c, t = validate(expected_symbol, filename)
            print(c / t)
            correct += c
            total += t
    if total != 0:
        print(f'Total stats: {correct / total}')

In [34]:
def test_orzeszkowa():
    filenames = [
        f'./dane/testy1/test_orzeszkowej{i}.txt' for i in range(1, 22, 2)]
    test_authorship(filenames, orzeszkowa.symbol)

In [35]:
# Print the per-file accuracies and the overall accuracy for the Orzeszkowa test files.
test_orzeszkowa()

0.27380952380952384
0.3548387096774194
0.19834710743801653
0.2727272727272727
0.2033898305084746
0.2
0.1896551724137931
0.061224489795918366
0.42424242424242425
0.48333333333333334
0.32786885245901637
Total stats: 0.2777777777777778


In [38]:
def test_prus():
    filenames = [f'./dane/testy1/test_prusa{i}.txt' for i in range(0, 40, 2)]
    test_authorship(filenames, prus.symbol)

In [39]:
# Print the per-file accuracies and the overall accuracy for the Prus test files.
test_prus()

0.313953488372093
0.4482758620689655
0.4578313253012048
0.42424242424242425
0.45161290322580644
0.40476190476190477
0.5283018867924528
0.46551724137931033
0.43795620437956206
0.35374149659863946
0.5092592592592593
0.3146067415730337
0.5535714285714286
0.5633802816901409
0.49137931034482757
0.44954128440366975
0.4609375
0.5042016806722689
0.4752475247524752
0.5277777777777778
Total stats: 0.45378973105134474


In [42]:
def test_sienkiewicz():
    filenames = [
        f'./dane/testy1/test_sienkiewicza{i}.txt' for i in range(1, 53, 2)]
    test_authorship(filenames, sienkiewicz.symbol)

In [43]:
# Print the per-file accuracies and the overall accuracy for the Sienkiewicz test files.
test_sienkiewicz()

0.7611940298507462
0.75
0.7073170731707317
0.691358024691358
0.7333333333333333
0.6554621848739496
0.6862745098039216
0.6142857142857143
0.6923076923076923
0.5867768595041323
0.6446280991735537
0.6643356643356644
0.7362637362637363
0.7922077922077922
0.40404040404040403
0.4696969696969697
0.66
0.6830985915492958
0.5813953488372093
0.4470588235294118
0.6170212765957447
0.5114503816793893
0.6
0.6132075471698113
0.5614035087719298
0.7719298245614035
Total stats: 0.6395161290322581
