## Validation

In [4]:
from naive_bayes_v2 import naive_bayes, AuthorshipClass
from file_utils import count_words_in_file, count_sentence_lengths_in_file, get_sentences_from_text

# Common prefix of every teaching-corpus path; the per-author suffix is appended after '_'.
teaching_set_prefix = './dane/teaching'

# --- Orzeszkowa -------------------------------------------------------------
orzeszkowa_suffix = 'korpus_orzeszkowej.txt'
orzeszkowa_teaching_file = teaching_set_prefix + '_' + orzeszkowa_suffix
orzeszkowa_words = count_words_in_file(orzeszkowa_teaching_file)
orzeszkowa_lengths = count_sentence_lengths_in_file(orzeszkowa_teaching_file)
# Authorship class identified by the symbol 'o'.
orzeszkowa = AuthorshipClass('o', orzeszkowa_words, orzeszkowa_lengths)

# --- Prus -------------------------------------------------------------------
prus_suffix = 'korpus_prusa.txt'
prus_teaching_file = teaching_set_prefix + '_' + prus_suffix
prus_words = count_words_in_file(prus_teaching_file)
prus_lengths = count_sentence_lengths_in_file(prus_teaching_file)
# Authorship class identified by the symbol 'p'.
prus = AuthorshipClass('p', prus_words, prus_lengths)

# --- Sienkiewicz ------------------------------------------------------------
sienkiewicz_suffix = 'korpus_sienkiewicza.txt'
sienkiewicz_teaching_file = teaching_set_prefix + '_' + sienkiewicz_suffix
sienkiewicz_words = count_words_in_file(sienkiewicz_teaching_file)
sienkiewicz_lengths = count_sentence_lengths_in_file(sienkiewicz_teaching_file)
# Authorship class identified by the symbol 's'.
sienkiewicz = AuthorshipClass('s', sienkiewicz_words, sienkiewicz_lengths)


In [5]:
def validate(expected_symbol, filename, mult):
    """Classify every sentence of a file and count how many match the expected author.

    The file is read as UTF-8 and split into sentences with
    ``get_sentences_from_text``. Each sentence is split on whitespace and
    passed to ``naive_bayes`` together with the three authorship classes and
    ``mult``.

    Args:
        expected_symbol: Value compared against ``result[0][0]`` of each
            ``naive_bayes`` call (presumably the symbol of the top-ranked
            class -- confirm against ``naive_bayes_v2``).
        filename: Path of the text file to evaluate.
        mult: Forwarded unchanged to ``naive_bayes`` as its third argument.

    Returns:
        A ``(correct, total)`` tuple: the number of matching sentences and the
        number of sentences processed. Both are 0 when no sentences are found.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        text = handle.read()

    hits = 0
    seen = 0  # stays 0 when the file yields no sentences
    for seen, sentence in enumerate(get_sentences_from_text(text), start=1):
        ranking = naive_bayes(sentence.split(), [orzeszkowa, prus, sienkiewicz], mult)
        if ranking[0][0] == expected_symbol:
            hits += 1
    return hits, seen


def validate_orzeszkowa(mult):
    """Return the fraction of Orzeszkowa validation sentences classified as Orzeszkowa.

    Args:
        mult: Forwarded to ``validate``.

    Raises:
        ZeroDivisionError: If ``validate`` reports zero sentences for the file.
    """
    validation_path = f'./dane/validation_{orzeszkowa_suffix}'
    hits, seen = validate(orzeszkowa.symbol, validation_path, mult)
    return hits / seen


def validate_prus(mult):
    """Return the fraction of Prus validation sentences classified as Prus.

    Args:
        mult: Forwarded to ``validate``.

    Raises:
        ZeroDivisionError: If ``validate`` reports zero sentences for the file.
    """
    validation_path = f'./dane/validation_{prus_suffix}'
    hits, seen = validate(prus.symbol, validation_path, mult)
    return hits / seen


def validate_sienkiewicz(mult):
    """Return the fraction of Sienkiewicz validation sentences classified as Sienkiewicz.

    Args:
        mult: Forwarded to ``validate``.

    Raises:
        ZeroDivisionError: If ``validate`` reports zero sentences for the file.
    """
    validation_path = f'./dane/validation_{sienkiewicz_suffix}'
    hits, seen = validate(sienkiewicz.symbol, validation_path, mult)
    return hits / seen

#### Testing multipliers

In [7]:
# Constant multiplier of 1.0; the argument is ignored.
def mult(_):
    return 1.0


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.4711033274956217
prus 0.4949685534591195
sienkiewicz 0.7849462365591398


In [9]:
# Constant multiplier of 5.0; the argument is ignored.
def mult(_):
    return 5.0


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.4366608289550496
prus 0.5276729559748428
sienkiewicz 0.7526881720430108


In [8]:
# Constant multiplier of 10.0; the argument is ignored.
def mult(_):
    return 10.0


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.4127262113251605
prus 0.5641509433962264
sienkiewicz 0.6469534050179212


In [32]:
# Multiplier equal to the number of items in `words`.
def mult(words):
    return len(words)


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.46351430239346175
prus 0.5886792452830188
sienkiewicz 0.5824372759856631


In [27]:
# Multiplier equal to 1.5 times the number of items in `words`.
def mult(words):
    return 1.5 * len(words)


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.4349095154699358
prus 0.6050314465408805
sienkiewicz 0.507168458781362


In [81]:
# Step multiplier: 10 when `words` has more than 20 items, otherwise 1.
def mult(words):
    if len(words) > 20:
        return 10
    return 1


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.47752481027437244
prus 0.5056603773584906
sienkiewicz 0.7634408602150538


In [82]:
# Step multiplier: 10 when `words` has more than 15 items, otherwise 0.
def mult(words):
    if len(words) > 15:
        return 10
    return 0


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.50379451255108
prus 0.5144654088050314
sienkiewicz 0.7329749103942652


In [83]:
# Step multiplier: 10 when `words` has more than 10 items, otherwise 0.
def mult(words):
    if len(words) > 10:
        return 10
    return 0


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.5078809106830122
prus 0.5364779874213836
sienkiewicz 0.6792114695340502


In [84]:
# Step multiplier: 40 when `words` has more than 10 items, otherwise 0.
def mult(words):
    if len(words) > 10:
        return 40
    return 0


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.500875656742557
prus 0.5547169811320755
sienkiewicz 0.5376344086021505


In [85]:
# Half the number of items in `words` when there are more than 15, otherwise 1.
def mult(words):
    n_words = len(words)
    if n_words > 15:
        return n_words * 0.5
    return 1


print('orzeszkowa', validate_orzeszkowa(mult))
print('prus', validate_prus(mult))
print('sienkiewicz', validate_sienkiewicz(mult))

orzeszkowa 0.4868651488616462
prus 0.5125786163522013
sienkiewicz 0.7329749103942652


## Tests

In [47]:
def test_authorship(filenames, expected_symbol, mult):
    """Run ``validate`` on each file and print the pooled accuracy.

    Args:
        filenames: Iterable of paths, each passed to ``validate``.
        expected_symbol: Forwarded to ``validate`` for every file.
        mult: Forwarded to ``validate`` for every file.

    Prints ``Total stats: <correct / total>`` over all files combined.
    Nothing is printed when the combined sentence count is zero.
    """
    per_file = [validate(expected_symbol, path, mult) for path in filenames]
    correct = sum(file_correct for file_correct, _ in per_file)
    total = sum(file_total for _, file_total in per_file)
    if total == 0:
        # Avoid dividing by zero when no sentences were processed.
        return
    print(f'Total stats: {correct / total}')

In [48]:
def test_orzeszkowa(mult):
    """Evaluate the Orzeszkowa test files with odd indices 1, 3, ..., 21.

    Args:
        mult: Forwarded to ``test_authorship``.
    """
    filenames = []
    for i in range(1, 22, 2):
        filenames.append(f'./dane/testy1/test_orzeszkowej{i}.txt')
    test_authorship(filenames, orzeszkowa.symbol, mult)

In [49]:
def test_prus(mult):
    """Evaluate the Prus test files with even indices 0, 2, ..., 38.

    Args:
        mult: Forwarded to ``test_authorship``.
    """
    filenames = []
    for i in range(0, 40, 2):
        filenames.append(f'./dane/testy1/test_prusa{i}.txt')
    test_authorship(filenames, prus.symbol, mult)

In [50]:
def test_sienkiewicz(mult):
    """Evaluate the Sienkiewicz test files with odd indices 1, 3, ..., 51.

    Args:
        mult: Forwarded to ``test_authorship``.
    """
    filenames = []
    for i in range(1, 53, 2):
        filenames.append(f'./dane/testy1/test_sienkiewicza{i}.txt')
    test_authorship(filenames, sienkiewicz.symbol, mult)

In [51]:
# Constant multiplier of 1.0; the argument is ignored.
def mult(_):
    return 1.0


# One "Total stats" line per author, separated by blank lines.
test_orzeszkowa(mult)
print()
test_prus(mult)
print()
test_sienkiewicz(mult)

Total stats: 0.2777777777777778

Total stats: 0.45378973105134474

Total stats: 0.6395161290322581


#### Best result?

In [54]:
# Multiplier equal to the number of items in `words`.
def mult(words):
    return len(words)


# One "Total stats" line per author, separated by blank lines.
test_orzeszkowa(mult)
print()
test_prus(mult)
print()
test_sienkiewicz(mult)

Total stats: 0.3242894056847545

Total stats: 0.5662591687041565

Total stats: 0.5112903225806451


In [66]:
# Step multiplier: 10 when `words` has more than 10 items, otherwise 0.
def mult(words):
    if len(words) > 10:
        return 10
    return 0


# One "Total stats" line per author, separated by blank lines.
test_orzeszkowa(mult)
print()
test_prus(mult)
print()
test_sienkiewicz(mult)

Total stats: 0.32041343669250644

Total stats: 0.5105134474327628

Total stats: 0.5665322580645161
