# Creating the dataset of stanzas

Creates `all_poems.csv` with all poems used and `all_stanzas.csv` with all quatrains used (annotated with rhyme scheme and meter), then splits `all_stanzas.csv` into test, train and pretrain parts. This notebook also creates the stress, rhyme and stress_rhyme markups.

## Downloading poetree dataset

In [None]:
%pip install poetree

In [None]:
import poetree
import pandas as pd
from tqdm import tqdm

In [None]:
corpus = poetree.Corpus('ru')
print('number of authors:', corpus.n_authors)
print('number of poems:', corpus.n_poems)

number of authors: 371
number of poems: 45563


In [None]:
corpus.get_authors()
df_ = corpus.metadata(target='authors', output='pandas')
df_

Unnamed: 0,id_,name,viaf,wiki,country,born,died,n_poems,corpus
0,81,Ablesimov A.A.,22565877,Q2997224,,1742.0,1783.0,12,ru
1,52,Adalis A.E.,4778618,Q4057234,,1900.0,1969.0,132,ru
2,9,Adamovich G.V.,54329901,Q2623542,,1892.0,1972.0,174,ru
3,4,Ahmatova A.A.,49220707,Q80440,,1889.0,1966.0,945,ru
4,358,Almazov B.N.,60059868,Q4062730,,1827.0,1876.0,11,ru
...,...,...,...,...,...,...,...,...,...
366,126,Zhukov V.M.,,,,1764.0,1799.0,1,ru
367,97,Zhukova A.S.,,,,,1799.0,1,ru
368,347,Zhukovskij V.A.,5050093,Q157623,,1783.0,1852.0,673,ru
369,138,Zhulev G.N.,100144648463231364487,Q4181616,,1836.0,1878.0,7,ru


In [None]:
# Download every poem of every author and dump them to disk in chunks of
# 1000 poems (poems0.csv, poems1.csv, ...) so that a crash does not lose
# everything downloaded so far.
poems, authors = [], []
i = 0          # index of the chunk file currently being filled
n_failed = 0   # poems whose body could not be fetched or parsed
for author in tqdm(corpus.get_authors()):
    for poem in author.get_poems():
        try:
            lines = list(pd.DataFrame(poem.get_body())['text'].values)
        except Exception:
            # Best effort: skip poems that cannot be loaded, but do not
            # swallow KeyboardInterrupt/SystemExit like a bare `except:` does.
            n_failed += 1
            continue
        # Append to both lists only after the body was read successfully,
        # so `poems` and `authors` can never get out of step.
        poems.append(lines)
        authors.append(author.name)
        if len(poems) >= 1000:
            df = pd.DataFrame({'poems': poems, 'authors': authors})
            df.to_csv(f'poems{i}.csv')
            poems, authors = [], []
            i += 1
# The remaining (possibly empty) tail goes to the last chunk file, poems{i}.csv.
df = pd.DataFrame({'poems': poems, 'authors': authors})
df.to_csv(f'poems{i}.csv')
print('skipped poems:', n_failed)
df

In [None]:
# Merge the chunk files written by the download cell into all_poems.csv.
# The download cell writes its final chunk to poems{i}.csv, so the range must
# include `i` itself; range(i) silently dropped that last chunk.
dfs = [pd.read_csv(f'poems{j}.csv') for j in range(i + 1)]
# ignore_index gives every poem a unique row label instead of repeating the
# per-chunk labels 0..999.
df = pd.concat(dfs, ignore_index=True)
df.to_csv('all_poems.csv')

In [None]:
df

Unnamed: 0,poems,authors
0,"['Лисица и Колдун в лесу сошлися', 'И так, как...",Ablesimov A.A.
1,"['Читал печатное, не помню, где-то я,', 'А пов...",Ablesimov A.A.
2,"['Приказчик в деревнях, иль в доме управитель,...",Ablesimov A.A.
3,"['Подьячий здесь зарыт, нашел который клад,', ...",Ablesimov A.A.
4,"['При реке', 'Невдалеке', 'От той дубровы,', '...",Ablesimov A.A.
...,...,...
47277,"['Кто знатен и силен,', 'Да не умен,', 'Так ху...",Krylov I.A.
47278,"['По улицам Слона водили,', 'Как видно напоказ...",Krylov I.A.
47279,"['Почто, мой друг, кричишь ты так на страсти',...",Krylov I.A.
47280,"['Стрелок весной малиновку убил.', 'Уж пусть б...",Krylov I.A.


## Selection of quatrains

In [None]:
%pip install rhymetagger
%apt-get install espeak -y

In [None]:
from rhymetagger import RhymeTagger
import nltk
import pandas as pd
from tqdm import tqdm
import ast

nltk.download('punkt_tab')

rt = RhymeTagger()
rt.load_model(model='ru')  # Load the Russian rhyme model

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Model loaded with following settings:
  frequency_min: 3
           lang: ru
       max_iter: 20
          ngram: 3
   ngram_length: 3
   prob_ipa_min: 0.9
 prob_ngram_min: 0.9
     same_words: False
   stanza_limit: True
         stress: True
       syll_max: 2
    t_score_min: 3.078
   vowel_length: True
         window: 5


In [None]:
# Canonical rhyme patterns of the quatrains we keep. Rhyme groups are
# numbered in order of first appearance inside the quatrain.
schemes = {
    (0, 0, 1, 1): 'AABB',
    (0, 1, 0, 1): 'ABAB',
    (0, 1, 1, 0): 'ABBA',
}


def get_stanzas(lines):
    """Select non-overlapping rhymed quatrains from a poem.

    A 4-line window slides over the poem. When all four lines carry a rhyme
    id and the pattern is one of ``schemes``, the window is taken as a
    quatrain and the scan continues after it; otherwise the window moves
    forward by one line.

    Args:
        lines: list of the poem's lines (strings).

    Returns:
        (stanzas, rh_schemes): ``stanzas`` is a list of 4-line lists and
        ``rh_schemes`` holds the matching scheme name ('AABB', 'ABAB' or
        'ABBA') for each of them.
    """
    # One rhyme id per line; lines without a rhyme partner are expected to
    # come back as None (hence the None check below).
    rhymes = rt.tag(lines, output_format=3)
    stanzas = []
    rh_schemes = []
    start = 0
    while start + 4 <= len(rhymes):
        window = rhymes[start:start + 4]
        if None in window:
            start += 1
            continue
        # Relabel rhyme ids by order of first appearance. Subtracting the
        # minimum only worked for consecutive, ascending ids and missed
        # valid quatrains such as (5, 7, 5, 7) or (3, 1, 1, 3).
        first_seen = {}
        pattern = tuple(first_seen.setdefault(r, len(first_seen)) for r in window)
        if pattern in schemes:
            stanzas.append(lines[start:start + 4])
            rh_schemes.append(schemes[pattern])
            start += 4
        else:
            start += 1
    return stanzas, rh_schemes

In [None]:
# Run the quatrain selection over every poem and dump the results to disk in
# chunks (stanzas0.csv, stanzas1.csv, ...) of a bit more than 1000 quatrains.
all_df = pd.read_csv('all_poems.csv')
stanzas, rh_schemes, indexes = [], [], []
i = 0  # index of the chunk file currently being filled
for index, poem in tqdm(enumerate(all_df['poems'])):
    # poems are stored in the CSV as the repr of a list of lines
    poem_stanzas, poem_schemes = get_stanzas(ast.literal_eval(poem))
    stanzas += poem_stanzas
    rh_schemes += poem_schemes
    # remember which poem (row position in all_poems.csv) each quatrain came from
    indexes += [index] * len(poem_stanzas)
    if len(stanzas) <= 1000:
        continue
    # chunk is full: flush it and start a new one
    df = pd.DataFrame({'stanzas': stanzas, 'rhyme_scheme': rh_schemes, 'index_poem': indexes})
    df.to_csv(f'stanzas{i}.csv')
    print(i)
    i += 1
    stanzas, rh_schemes, indexes = [], [], []

# the remaining tail goes to the last chunk file, stanzas{i}.csv
df = pd.DataFrame({'stanzas': stanzas, 'rhyme_scheme': rh_schemes, 'index_poem': indexes})
df.to_csv(f'stanzas{i}.csv')
df

In [None]:
# Merge the chunk files written by the extraction cell into all_stanzas.csv.
# The extraction cell writes its final chunk to stanzas{i}.csv, so the range
# must include `i` itself; range(i) silently dropped that last chunk.
dfs = [pd.read_csv(f'stanzas{j}.csv') for j in range(i + 1)]
df = pd.concat(dfs).reset_index(drop=True)[['stanzas', 'rhyme_scheme', 'index_poem']]
# stable id of the quatrain before shuffling
df['index_stanzas'] = df.index
# Shuffle with a fixed seed: the test/train/pretrain split below is taken by
# position, so an unseeded shuffle would give a different split on every run.
df = df.sample(frac=1., random_state=42).reset_index(drop=True)
df.to_csv('all_stanzas.csv', index=False)
df

Compute the lengths (in characters) of the quatrains to determine the minimum and maximum lengths of the prose samples for the test dataset.

In [None]:
# After the CSV round trip `stanzas` holds the *repr* of a list of lines, and
# joining a string with '\n' would join its characters. Parse it first so the
# length is that of the quatrain text itself; real lists are used as they are.
df['lens'] = [
    len('\n'.join(ast.literal_eval(x) if isinstance(x, str) else x))
    for x in df['stanzas']
]

In [None]:
df['lens'].describe()

Unnamed: 0,lens
count,332.0
mean,112.186747
std,29.656056
min,30.0
25%,87.0
50%,112.0
75%,137.0
max,184.0


## Annotation of poems with meter

For this section you need the code from prose-to-poetry directory so use:

In [None]:
%cd ../prose-to-poetry

In [None]:
import ast
import pandas as pd
from tqdm import tqdm
from metrics import get_meter

Also you need `all_poems.csv` and `all_stanzas.csv` files.

In [None]:
# Load the poems and add an empty `meter` column to be filled in below.
all_poems = pd.read_csv('all_poems.csv').assign(meter=None)

In [None]:
# Detect the meter of every poem and store it in the `meter` column.
for index, poem_repr in tqdm(all_poems['poems'].items()):
    # poems are stored as the repr of a list of lines; get_meter is given the plain text
    poem_text = '\n'.join(ast.literal_eval(poem_repr))
    all_poems.loc[index, 'meter'] = get_meter(poem_text)

all_poems

In [None]:
all_poems.to_csv('all_poems.csv')

Filling stanzas dataset with meter.

In [None]:
all_df = pd.read_csv('../dataset/all_stanzas.csv')
# Drop a `meter` column left over from a previous run so the cell can be
# re-run without a column-overlap error from join().
all_df = all_df.drop(columns=['meter'], errors='ignore')
# `index_poem` is the row position of the source poem in all_poems.csv, which
# matches the index of `all_poems`. The join result has to be kept and written
# back: previously it was only displayed, so the file never received `meter`.
all_df = all_df.join(all_poems[['meter']], on='index_poem')
all_df.to_csv('../dataset/all_stanzas.csv', index=False)
all_df

## Statistics of the stanzas dataset

In [None]:
df = pd.read_csv('all_stanzas.csv')
df['rhyme_scheme'].value_counts()

Unnamed: 0_level_0,count
rhyme_scheme,Unnamed: 1_level_1
ABAB,112206
AABB,44918
ABBA,25779


In [None]:
df['meter'].value_counts()

Unnamed: 0_level_0,count
meter,Unnamed: 1_level_1
iambos,118315
choreios,28635
dolnik3,13789
amphibrachys,8607
anapaistos,7905
daktylos,5168
dolnik2,484


## Creating the train, test and pretrain subsets

In [None]:
import pandas as pd

df = pd.read_csv('all_stanzas.csv')
df

Unnamed: 0,index_stanzas,stanzas,rhyme_scheme,index_poem,meter
0,105822,"['Бегите жесткого стечения согласных', 'И мере...",AABB,28320,iambos
1,106490,"['И пал раскат их, аки тлен;', 'Румянцев громы...",ABBA,28417,iambos
2,174563,"['Ведь после мучительных вычислений', 'В итоге...",ABAB,44239,dolnik3
3,102275,"['Пока под снегом дремлют зерна,', 'Земля до п...",ABBA,27989,iambos
4,53763,"['И счастие еще, коль, голос он подавши', 'И г...",AABB,16100,iambos
...,...,...,...,...,...
182898,42626,"['Но ты красою ненаглядной', 'Еще томиться мне...",ABAB,13024,iambos
182899,63631,"['В палящий зной, в песке сыпучем по колени,',...",ABBA,18837,iambos
182900,100982,"['Сливаясь в густых облаках', 'В немолчное, гр...",ABAB,27629,amphibrachys
182901,52553,"['А у него мечта одна --', 'Убить огромного сл...",AABB,15749,iambos


In [None]:
# The quatrains are already shuffled: the first 1000 rows become the test set,
# the next 10000 rows the train set.
test_part, train_part = df.iloc[:1000], df.iloc[1000:11000]
test_part.to_csv('testset.csv')
train_part.to_csv('trainset.csv')

In [None]:
# Everything after the test and train rows is the pretrain data, written as
# n files of (almost) equal size. A separate name is used instead of
# overwriting `df`, so re-running this cell does not cut off another 11000 rows.
pretrain_df = df.iloc[11000:]
n = 5
sample_len = len(pretrain_df) // n
for i in range(n):
    start = i * sample_len
    # the last part also takes the remainder of the integer division
    stop = len(pretrain_df) if i == n - 1 else start + sample_len
    print(start, stop)
    pretrain_df.iloc[start:stop].to_csv(f'trainset_pretrain{i}.csv')
print(len(pretrain_df))

0 34380
34380 68760
68760 103140
103140 137520
137520 171903
171903


## Create stress markup

For the stress markup I used RussianPoetryScansionTool. This section creates the `stress_markup` column with the stress markup and cleans the stanzas of special symbols.

In [None]:
%git clone https://github.com/Koziev/RussianPoetryScansionTool
%cd RussianPoetryScansionTool
%pip install .
%cd ..

In [None]:
import pandas as pd
import ast
from tqdm.auto import tqdm
import re

import russian_scansion

rpst = russian_scansion.create_rpst_instance('/kaggle/input/poetrystantiontool/models')
rpst.max_words_per_line = 100

VOWELS = "аеёиоуыэюяАЕЁИОУЫЭЮЯ"
STRESS_PRIMARY = '\u0301'  # acute
STRESS_SECONDARY = '\u0300'  # grave

  self.model.load_state_dict(torch.load(model_filepath, map_location=self.device))


In [None]:
def strip_diacritics(text):
    """Return ``text`` with the primary and secondary stress marks removed."""
    for mark in (STRESS_PRIMARY, STRESS_SECONDARY):
        text = text.replace(mark, '')
    return text

def find_stress_positions(word):
    """Locate the stress marks of a word in terms of vowel ordinals.

    A stress mark is attributed to the number of vowels seen before it, so a
    mark that directly follows the first vowel is reported as position 1.

    Returns:
        (primary, secondary, clean_word, n_vowels): the positions of the
        primary and of the secondary stress marks, the word without stress
        marks, and the number of vowels in the word. A word without vowels is
        returned unchanged as ``([], [], word, 0)``.
    """
    if all(ch not in VOWELS for ch in word):
        return [], [], word, 0

    primary, secondary = [], []
    vowels_seen = 0
    for ch in word:
        if ch == STRESS_PRIMARY:
            primary.append(vowels_seen)
        elif ch == STRESS_SECONDARY:
            secondary.append(vowels_seen)
        elif ch in VOWELS:
            vowels_seen += 1

    return primary, secondary, strip_diacritics(word), vowels_seen

def annotate_text_with_stress(text, add_count):
    """Prefix every word of one line with its stress markers.

    The line is split on single spaces and every token on '-'. Each part gets
    the tags ``<S{k}>`` (primary stress), ``<s{k}>`` (secondary stress) and,
    when ``add_count`` is true and the part has vowels, ``<count{n}>`` (number
    of vowels), glued together and followed by a space and the part without
    stress marks. Parts of a hyphenated token are re-joined with ' - '.

    Returns:
        (annotated_line, max_vowels): the annotated line and the largest vowel
        count found in any part of the line.
    """
    annotated_tokens = []
    longest = 0

    for token in text.split(' '):
        if not token.strip():
            # keep empty tokens so that runs of spaces survive the re-join
            annotated_tokens.append(token)
            continue

        parts = []
        for sub in token.split('-'):
            primary, secondary, clean, count = find_stress_positions(sub)

            if max(len(primary), len(secondary)) > 2:
                print(f"Слово '{sub}' имеет >2 ударения одного типа: primary={primary}, secondary={secondary}")

            prefix = ''.join(
                [f"<S{pos}>" for pos in sorted(primary)]
                + [f"<s{pos}>" for pos in sorted(secondary)]
                + ([f"<count{count}>"] if add_count and count != 0 else [])
            )
            longest = max(longest, count)
            parts.append(f"{prefix} {clean}" if prefix else clean)

        annotated_tokens.append(' - '.join(parts))

    return ' '.join(annotated_tokens), longest

def get_fotratted_stressed_lines(line):
    """Scan the given text with ``rpst`` and annotate each resulting line.

    ``line`` is passed to ``rpst.align`` as is; the stressed text it produces
    is split on newlines and every line is annotated with stress and
    vowel-count tags.

    Returns:
        (annotated_lines, max_vowels): the annotated lines and the largest
        vowel count reported for any of them.
    """
    scansion = rpst.align(line)
    stressed_text = scansion.get_stressed_lines(show_secondary_accentuation=True)
    annotated = [
        annotate_text_with_stress(stressed_line, True)
        for stressed_line in stressed_text.split('\n')
    ]
    result = [markup for markup, _ in annotated]
    all_max_i = max([0, *(max_i for _, max_i in annotated)])
    return result, all_max_i

def get_dataset_stress_markup(df):
    """Build the stress markup for every row of ``df``.

    ``df['stanzas']`` must hold the repr of a list of lines. The largest vowel
    count seen in the whole dataset is printed at the end.

    Returns:
        A list with one entry per row: the list of annotated lines.
    """
    result = []
    all_max_i = 0
    for stanza_repr in tqdm(df['stanzas'].tolist()):
        markup, max_i = get_fotratted_stressed_lines(ast.literal_eval(stanza_repr))
        result.append(markup)
        if max_i > all_max_i:
            all_max_i = max_i
    print(f'maximun stressed i = {all_max_i}')
    return result

In [None]:
def clean_text(text):
    """Drop every character that is not a word character, whitespace or one
    of the punctuation marks listed in the pattern below."""
    disallowed = re.compile(r"[^\w\s.,;:!?\"'«»()\[\]\-–—…]")
    return disallowed.sub("", text)

In [None]:
df = pd.read_csv('testset.csv', index_col=0)
# `stanzas` holds the repr of a list of lines. Clean every line separately and
# re-serialise: cleaning the repr itself strips the backslashes of escape
# sequences (e.g. '\xa0' turns into the text 'xa0', and an escaped quote makes
# the repr unparsable).
df['stanzas'] = df['stanzas'].apply(
    lambda s: str([clean_text(line) for line in ast.literal_eval(s)])
)
df['stress_markup'] = get_dataset_stress_markup(df)
df.to_csv('testset.csv')
df

In [None]:
df = pd.read_csv('trainset.csv', index_col=0)
# `stanzas` holds the repr of a list of lines. Clean every line separately and
# re-serialise: cleaning the repr itself strips the backslashes of escape
# sequences (e.g. '\xa0' turns into the text 'xa0', and an escaped quote makes
# the repr unparsable).
df['stanzas'] = df['stanzas'].apply(
    lambda s: str([clean_text(line) for line in ast.literal_eval(s)])
)
df['stress_markup'] = get_dataset_stress_markup(df)
df.to_csv('trainset.csv')
df

In [None]:
# The 5 pretrain parts are read from the Kaggle input dataset and written,
# with the stress markup added, to the working directory.
for i in range(5):
    df = pd.read_csv(f'/kaggle/input/project-poetry-dataset/trainset_pretrain{i}.csv', index_col=0)
    # `stanzas` holds the repr of a list of lines. Clean every line separately
    # and re-serialise: cleaning the repr itself strips the backslashes of
    # escape sequences (e.g. '\xa0' turns into the text 'xa0', and an escaped
    # quote makes the repr unparsable).
    df['stanzas'] = df['stanzas'].apply(
        lambda s: str([clean_text(line) for line in ast.literal_eval(s)])
    )
    df['stress_markup'] = get_dataset_stress_markup(df)
    df.to_csv(f'trainset_pretrain{i}.csv')
df

## Adding rhyme markup

This section adds rhyme and stress_rhyme markups in columns `rhyme_markup` and `rhyme_stress_markup`.

In [None]:
%pip install pyphen

In [None]:
import nltk
import pyphen
import ast
import re
import string
import pandas as pd

nltk.download('punkt')
nltk.download('punkt_tab')

dic = pyphen.Pyphen(lang='ru')

def extract_rhyme_key(word, min_syllables=2, use_all=False):
    """Return the lower-cased rhyme key of ``word``.

    The key is the last ``min_syllables`` syllables according to the ``dic``
    hyphenation; the whole word is used when ``use_all`` is true or when the
    word has fewer syllables than ``min_syllables``.
    """
    lowered = word.lower()
    if use_all:
        return lowered
    syllables = dic.inserted(word).split('-')
    if len(syllables) < min_syllables:
        return lowered
    return ''.join(syllables[-min_syllables:]).lower()

def remove_punctuation(text):
    """Delete ASCII punctuation and the characters « » — … from ``text``."""
    unwanted = str.maketrans('', '', string.punctuation + '«»—…')
    return text.translate(unwanted)

def rhyme_markup(lines, rhyme_scheme, use_all):
    """Prefix every line with a tag holding its rhyme label and rhyme key.

    lines: list[str]
    rhyme_scheme: str - one label per line, e.g. 'ABAB'; reused cyclically
        when there are more lines than labels
    use_all: bool - if true the whole last word is the rhyme key, otherwise
        only its last syllables
    return: list[str] - lines with rhyme markup; a line in which no word is
        found is reported on stdout and left out of the result
    """
    tagged_lines = []
    n_labels = len(rhyme_scheme)
    for position, line in enumerate(lines):
        words = nltk.word_tokenize(remove_punctuation(line), language="russian")
        if len(words) == 0:
            print('!!! not nltk words')
            continue

        # the label depends on the position of the line, skipped lines included
        label = rhyme_scheme[position % n_labels]
        key = extract_rhyme_key(words[-1], use_all=use_all)
        tagged_lines.append(f"<rhyme{label}>{key}</rhyme{label}> {line}")
    return tagged_lines

def get_dataset_rhyme_markup(df, use_all=False, col='stanzas'):
    """Build the rhyme markup for every row of ``df``.

    ``df[col]`` must hold the repr of a list of lines and
    ``df['rhyme_scheme']`` the rhyme scheme of the row.

    Returns:
        A list with one entry per row: the list of tagged lines.
    """
    return [
        rhyme_markup(ast.literal_eval(lines_repr), scheme, use_all)
        for lines_repr, scheme in zip(df[col], df['rhyme_scheme'])
    ]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Add both rhyme markups to the test set: one built on top of the stress
# markup, one built on the plain quatrain lines.
df = pd.read_csv('testset.csv', index_col=0)
for target_col, source_col in (
    ('rhyme_stress_markup', 'stress_markup'),
    ('rhyme_markup', 'stanzas'),
):
    df[target_col] = get_dataset_rhyme_markup(df, col=source_col)
df.to_csv('testset.csv')
df

In [None]:
# Add both rhyme markups to the train set: one built on top of the stress
# markup, one built on the plain quatrain lines.
df = pd.read_csv('trainset.csv', index_col=0)
for target_col, source_col in (
    ('rhyme_stress_markup', 'stress_markup'),
    ('rhyme_markup', 'stanzas'),
):
    df[target_col] = get_dataset_rhyme_markup(df, col=source_col)
df.to_csv('trainset.csv')
df

In [None]:
# Add both rhyme markups to each of the 5 pretrain parts, in place.
for i in range(5):
    part_path = f'trainset_pretrain{i}.csv'
    df = pd.read_csv(part_path, index_col=0)
    for target_col, source_col in (
        ('rhyme_stress_markup', 'stress_markup'),
        ('rhyme_markup', 'stanzas'),
    ):
        df[target_col] = get_dataset_rhyme_markup(df, col=source_col)
    df.to_csv(part_path)
df