## Validation

In [1]:
from naive_bayes_v2 import naive_bayes, AuthorshipClass
from file_utils import count_words_in_file, count_sentence_lengths_in_file, get_sentences_from_text

# Corpus file-name suffixes; reused for both the teaching and validation paths.
orzeszkowa_suffix = 'korpus_orzeszkowej.txt'
prus_suffix = 'korpus_prusa.txt'
sienkiewicz_suffix = 'korpus_sienkiewicza.txt'

teaching_set_prefix = './dane/teaching'

# Orzeszkowa: teaching file -> word and sentence-length counts -> class 'o'.
orzeszkowa_teaching_file = f'{teaching_set_prefix}_{orzeszkowa_suffix}'
orzeszkowa_words = count_words_in_file(orzeszkowa_teaching_file)
orzeszkowa_lengths = count_sentence_lengths_in_file(orzeszkowa_teaching_file)
orzeszkowa = AuthorshipClass('o', orzeszkowa_words, orzeszkowa_lengths)

# Prus: teaching file -> word and sentence-length counts -> class 'p'.
prus_teaching_file = f'{teaching_set_prefix}_{prus_suffix}'
prus_words = count_words_in_file(prus_teaching_file)
prus_lengths = count_sentence_lengths_in_file(prus_teaching_file)
prus = AuthorshipClass('p', prus_words, prus_lengths)

# Sienkiewicz: teaching file -> word and sentence-length counts -> class 's'.
sienkiewicz_teaching_file = f'{teaching_set_prefix}_{sienkiewicz_suffix}'
sienkiewicz_words = count_words_in_file(sienkiewicz_teaching_file)
sienkiewicz_lengths = count_sentence_lengths_in_file(sienkiewicz_teaching_file)
sienkiewicz = AuthorshipClass('s', sienkiewicz_words, sienkiewicz_lengths)


In [2]:
def validate(expected_symbol, filename, mult):
    """Classify every sentence of a text file and count the expected verdicts.

    Args:
        expected_symbol: Value compared with the first element of the first
            entry returned by ``naive_bayes`` for each sentence.
        filename: Path of the text file to read (decoded as UTF-8).
        mult: Forwarded unchanged as the third argument of ``naive_bayes``.

    Returns:
        A ``(correct, total)`` tuple: the number of sentences whose top
        result matched ``expected_symbol`` and the number of sentences seen.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        text = source.read()

    hits = 0
    seen = 0
    for sentence in get_sentences_from_text(text):
        ranking = naive_bayes(sentence.split(), [orzeszkowa, prus, sienkiewicz], mult)
        # ranking[0][0] is treated as the predicted symbol; presumably the
        # best-scoring class comes first - confirm against naive_bayes.
        hits += 1 if ranking[0][0] == expected_symbol else 0
        seen += 1
    return hits, seen


def validate_orzeszkowa(mult):
    """Return the share of Orzeszkowa validation sentences classified as hers.

    ``mult`` is passed straight through to ``validate``. Raises
    ``ZeroDivisionError`` when the validation file yields no sentences.
    """
    validation_file = f'./dane/validation_{orzeszkowa_suffix}'
    hits, seen = validate(orzeszkowa.symbol, validation_file, mult)
    return hits / seen


def validate_prus(mult):
    """Return the share of Prus validation sentences classified as his.

    ``mult`` is passed straight through to ``validate``. Raises
    ``ZeroDivisionError`` when the validation file yields no sentences.
    """
    validation_file = f'./dane/validation_{prus_suffix}'
    hits, seen = validate(prus.symbol, validation_file, mult)
    return hits / seen


def validate_sienkiewicz(mult):
    """Return the share of Sienkiewicz validation sentences classified as his.

    ``mult`` is passed straight through to ``validate``. Raises
    ``ZeroDivisionError`` when the validation file yields no sentences.
    """
    validation_file = f'./dane/validation_{sienkiewicz_suffix}'
    hits, seen = validate(sienkiewicz.symbol, validation_file, mult)
    return hits / seen

#### Testing multipliers

In [4]:
def mult(_):
    """Constant multiplier: ignore the argument and return 1.0."""
    return 1.0


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.5134488942020323
prus 0.5309734513274337
sienkiewicz 0.795751633986928


In [5]:
def mult(_):
    """Constant multiplier: ignore the argument and return 5.0."""
    return 5.0


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.5002988643156007
prus 0.5699115044247788
sienkiewicz 0.7483660130718954


In [6]:
def mult(_):
    """Constant multiplier: ignore the argument and return 10.0."""
    return 10.0


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.44172145845786015
prus 0.6489675516224189
sienkiewicz 0.6797385620915033


In [7]:
def mult(words):
    """Multiplier equal to the number of items in ``words``."""
    return len(words)


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.4255827854154214
prus 0.5309734513274337
sienkiewicz 0.7843137254901961


In [8]:
def mult(words):
    """Multiplier equal to 1.5 times the number of items in ``words``."""
    return 1.5 * len(words)


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.3723849372384937
prus 0.5356932153392331
sienkiewicz 0.7663398692810458


In [9]:
def mult(words):
    """Return 10 when ``words`` has more than 20 items, otherwise 1."""
    if len(words) > 20:
        return 10
    return 1


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.5218170950388523
prus 0.51976401179941
sienkiewicz 0.7941176470588235


In [10]:
def mult(words):
    """Return 10 when ``words`` has more than 15 items, otherwise 0."""
    if len(words) > 15:
        return 10
    return 0


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.5260011954572624
prus 0.5073746312684366
sienkiewicz 0.8022875816993464


In [11]:
def mult(words):
    """Return 10 when ``words`` has more than 10 items, otherwise 0."""
    if len(words) > 10:
        return 10
    return 0


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.4967124925283921
prus 0.5073746312684366
sienkiewicz 0.8120915032679739


In [84]:
# NOTE(review): this cell's execution count (84) is out of sequence with the
# neighbouring cells, so its recorded output may come from a different kernel
# state - re-run the notebook top to bottom to confirm the numbers.
def mult(words):
    """Return 40 when ``words`` has more than 10 items, otherwise 0."""
    if len(words) > 10:
        return 40
    return 0


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.500875656742557
prus 0.5547169811320755
sienkiewicz 0.5376344086021505


In [12]:
def mult(words):
    """Return half the length of ``words`` when it exceeds 15 items, else 1."""
    if len(words) > 15:
        return len(words) * 0.5
    return 1


# Report the validation accuracy for each author with this multiplier.
for author, validator in (
    ('orzeszkowa', validate_orzeszkowa),
    ('prus', validate_prus),
    ('sienkiewicz', validate_sienkiewicz),
):
    print(author, validator(mult))

orzeszkowa 0.5152420800956365
prus 0.5067846607669616
sienkiewicz 0.8022875816993464


## Tests

In [27]:
def test_authorship(filenames, expected_symbol, mult):
    """Print the pooled accuracy of ``validate`` over several files.

    Args:
        filenames: Iterable of file paths, each handed to ``validate``.
        expected_symbol: Symbol every sentence is expected to be classified as.
        mult: Forwarded unchanged to ``validate``.

    The per-file ``(correct, total)`` counts are summed, so the printed ratio
    is over all sentences rather than an average of per-file ratios. Nothing
    is printed when no sentence was counted.
    """
    correct = 0
    total = 0
    for path in filenames:
        file_correct, file_total = validate(expected_symbol, path, mult)
        correct += file_correct
        total += file_total

    if total == 0:
        # No sentences at all: skip the report instead of dividing by zero.
        return
    print(f'{expected_symbol} stats: {correct / total}')

In [15]:
def test_orzeszkowa(mult):
    """Run ``test_authorship`` over the Orzeszkowa test files.

    Uses ``test_orzeszkowej{i}.txt`` for the odd indices 1, 3, ..., 21.
    """
    indices = range(1, 22, 2)
    paths = [f'./dane/testy1/test_orzeszkowej{i}.txt' for i in indices]
    test_authorship(paths, orzeszkowa.symbol, mult)

In [16]:
def test_prus(mult):
    """Run ``test_authorship`` over the Prus test files.

    Uses ``test_prusa{i}.txt`` for the even indices 0, 2, ..., 38.
    """
    # NOTE(review): even indices starting at 0, whereas the Orzeszkowa and
    # Sienkiewicz helpers use odd indices - confirm this matches the files.
    indices = range(0, 40, 2)
    paths = [f'./dane/testy1/test_prusa{i}.txt' for i in indices]
    test_authorship(paths, prus.symbol, mult)

In [17]:
def test_sienkiewicz(mult):
    """Run ``test_authorship`` over the Sienkiewicz test files.

    Uses ``test_sienkiewicza{i}.txt`` for the odd indices 1, 3, ..., 51.
    """
    indices = range(1, 53, 2)
    paths = [f'./dane/testy1/test_sienkiewicza{i}.txt' for i in indices]
    test_authorship(paths, sienkiewicz.symbol, mult)

#### Best result?

In [28]:
def mult(words):
    """Return half the length of ``words`` when it exceeds 15 items, else 1."""
    if len(words) > 15:
        return len(words) * 0.5
    return 1


# Run the per-author tests, in the same order as before, with this multiplier.
for run_test in (test_orzeszkowa, test_prus, test_sienkiewicz):
    run_test(mult)

o stats: 0.30103359173126615
p stats: 0.4308068459657702
s stats: 0.6479838709677419
