In [6]:
from poetguesser import poetguesser 

---
### Load input document
> print the beginning of the document

In [7]:
# Read the whole test document into memory. The text is Czech (non-ASCII), so
# the encoding is stated explicitly instead of relying on the platform default,
# which is not UTF-8 on every system.
# NOTE(review): assumes testdoc.txt is stored as UTF-8 — confirm.
with open('testdoc.txt', encoding='utf-8') as f:
    doc = f.read()
# Preview only the opening characters rather than dumping the full text
print(doc[0:209])

Ve Valencii Cid meškal
znaven prací, tíhou běd,
znaven válek lopocením,
vždyť jich tolik různých sved.

Když jej právě došly zprávy,
Bucar, velký Maurský král,
ku Valencii že táhne,
z toho vznik mu trud i žal.


---
### Analyze input document with UDPipe and Ingram + print summary

In [8]:
# Create the guesser and run the analysis on the loaded text
# (per the section heading, the analysis uses UDPipe and Ingram).
pg = poetguesser.PoetGuesser()
pg.analyze_doc(doc)
# Print the overview shown below: number of lines, number of tokens and the
# count of lines per detected meter.
pg.doc_summary()

# of lines  : 623
# of tokens : 3424
  meters    : {'t8': 416, 't7': 195, 't9': 9, 't6': 3}


---
### Check available candidates for the most frequent meter

In [9]:
# List the candidate authors available for meter 't8', the most frequent meter
# in the document summary. The output reports the number of candidates and a
# sample count for each of them.
# NOTE(review): sample_size=200 presumably sets the size of one sample — confirm
# the unit against the poetguesser documentation.
pg.available_candidates(sample_size=200, meter='t8')

# of candidates : 34
# of samples    : {'Chalupa, František_Q12016955': 15, 'Crha, Václav Antonín_Q11985379': 13, 'Dostál-Lutinov, Karel_Q12028170': 13, 'Dvořák, Xaver_Q5626105': 10, 'Machar, Josef Svatopluk_Q713271': 50, 'Furch, Vincenc_Q11901668': 15, 'Heyduk, Adolf_Q362154': 69, 'Hněvkovský, Šebestián_Q391373': 10, 'Kalus, Josef_Q12026190': 17, 'Klášterský, Antonín_Q326446': 17, 'Kulda, Beneš Metod_Q4246529': 19, 'Leger, Karel_Q12028367': 14, 'Lošťák, Ludvík_Q12034244': 20, 'Pokorný, Rudolf_Q12050236': 15, 'Nejedlý, Vojtěch_Q3500252': 45, 'Zeyer, Julius_Q942980': 11, 'Čech, Svatopluk_Q745363': 20, 'Sládek, Josef Václav_Q1705960': 19, 'Vrchlický, Jaroslav_Q461104': 152, 'Nečas, Jan Evangelista_Q12022502': 11, 'Paroubek, Otakar G._Q12043175': 21, 'Picek, Václav Jaromír_Q15831353': 11, 'Procházka, František Serafínský_Q12017483': 29, 'Tablic, Bohuslav_Q833623': 11, 'Rubeš, František Jaromír_Q3507724': 15, 'Vinařický, Karel Alois_Q4111467': 13, 'Stašek, Antal_Q3564937': 24, 'Táborský, F

---
### Perform single classification 
> with equal number of samples per author (fixed set)

In [10]:
# One classification run restricted to five hand-picked candidate authors.
# NOTE(review): level_authors=True presumably equalizes the number of samples
# per author and random_seed=42 fixes which samples are drawn (see the
# heading of this section) — confirm against the poetguesser documentation.
verdict = pg.guess_author(
    sample_size=200,
    meter='t8',
    v_ngram=3,
    w_ngram=1,
    candidate_set=[
        'Machar, Josef Svatopluk_Q713271',
        'Heyduk, Adolf_Q362154',
        'Vrchlický, Jaroslav_Q461104',
        'Nejedlý, Vojtěch_Q3500252',
        'Wenzig, Josef_Q1706029',
    ],
    level_authors=True,
    random_seed=42,
    zscores=True,
)
print(f'Predicted author: {verdict}')

⬇️ Downloading candidate dataset...
✅ Downloading complete
Predicted author: Vrchlický, Jaroslav_Q461104


> with a custom classifier (Random Forest)

> with equal number of samples per author (fixed set)

In [11]:
from sklearn.ensemble import RandomForestClassifier
# Same five-candidate run as before, but with a scikit-learn Random Forest
# handed in through `custom_classifier`. The forest gets its own fixed
# random_state so the run is repeatable.
verdict = pg.guess_author(
    sample_size=200,
    meter='t8',
    v_ngram=3,
    w_ngram=1,
    candidate_set=[
        'Machar, Josef Svatopluk_Q713271',
        'Heyduk, Adolf_Q362154',
        'Vrchlický, Jaroslav_Q461104',
        'Nejedlý, Vojtěch_Q3500252',
        'Wenzig, Josef_Q1706029',
    ],
    level_authors=True,
    random_seed=42,
    zscores=True,
    custom_classifier=RandomForestClassifier(n_estimators=100, random_state=42),
)
print(f'Predicted author (Random Forest): {verdict}')

Predicted author (Random Forest): Vrchlický, Jaroslav_Q461104


> with all available candidates

> with equal number of samples per author (fixed set)

In [12]:
# Classification against every available candidate: per the heading of this
# cell, the empty candidate_set is what selects all available candidates.
verdict = pg.guess_author(
    sample_size=200,
    meter='t8',
    v_ngram=3,
    w_ngram=1,
    candidate_set=[],
    level_authors=True,
    random_seed=42,
    zscores=True,
)
print(f'Predicted author: {verdict}')

Predicted author: Vrchlický, Jaroslav_Q461104


---
### Perform 1000 classifications 
> with equal number of samples per author (picked at random in each iteration)

In [13]:
# Repeat the classification many times and tally how often each candidate is
# predicted. random_seed=None is used so that, per the section heading, the
# samples are picked at random in every iteration.
N_ITERATIONS = 1000
candidate_set = [
    'Machar, Josef Svatopluk_Q713271',
    'Heyduk, Adolf_Q362154',
    'Vrchlický, Jaroslav_Q461104',
    'Nejedlý, Vojtěch_Q3500252',
    'Wenzig, Josef_Q1706029',
]
# Start every candidate at zero so authors that are never predicted still
# appear in the final table.
verdicts = {author: 0 for author in candidate_set}
for i in range(N_ITERATIONS):
    # '\r' keeps the progress counter on one continuously rewritten line
    print(f'iteration: {i+1}', end='\r')
    verdict = pg.guess_author(
        sample_size   = 200,
        meter         = 't8',
        v_ngram       = 3,
        w_ngram       = 1,
        # Reuse the list defined above instead of a second literal, so the
        # tally keys and the classified candidates cannot drift apart. A copy
        # is passed so every call still receives its own list object.
        candidate_set = list(candidate_set),
        level_authors = True,
        random_seed   = None,
        zscores       = True,
    )
    verdicts[verdict] += 1
# The trailing spaces overwrite what is left of the progress line
print('PREDICTIONS:     \n------------')
# Pad author names to the longest one so the counts line up in a column
max_len = max(len(author) for author in candidate_set)
for author, count in verdicts.items():
    print(f'{author.ljust(max_len)} : {count}')


PREDICTIONS:     
------------
Machar, Josef Svatopluk_Q713271 : 0
Heyduk, Adolf_Q362154           : 0
Vrchlický, Jaroslav_Q461104     : 1000
Nejedlý, Vojtěch_Q3500252       : 0
Wenzig, Josef_Q1706029          : 0
