In [32]:
import pandas as pd
import plotly.express as px

data = pd.read_csv("../dataset/output.csv")
poems_by_author = data.groupby(data["label"]).count().reset_index()
poems_by_author.columns = ["author", "text"]

px.pie(poems_by_author, values="text", names="author")

In [2]:
data["text_length"] = data["text"].str.split().str.len()

In [None]:
px.histogram(data["text_length"], color=data["label"])

In [4]:
data = data[data["text_length"] < 1000]

In [5]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
import string


nltk.download("popular")
nltk.download("punkt")
nltk.download("punkt_tab")

stop_words = set(stopwords.words("russian"))
stop_words

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/infirriar/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/infirriar/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/infirriar/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/infirriar/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/infirriar/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/infirriar/nltk_data...
[nltk_data]    |   Package movie_review

{'а',
 'без',
 'более',
 'больше',
 'будет',
 'будто',
 'бы',
 'был',
 'была',
 'были',
 'было',
 'быть',
 'в',
 'вам',
 'вас',
 'вдруг',
 'ведь',
 'во',
 'вот',
 'впрочем',
 'все',
 'всегда',
 'всего',
 'всех',
 'всю',
 'вы',
 'где',
 'да',
 'даже',
 'два',
 'для',
 'до',
 'другой',
 'его',
 'ее',
 'ей',
 'ему',
 'если',
 'есть',
 'еще',
 'ж',
 'же',
 'за',
 'зачем',
 'здесь',
 'и',
 'из',
 'или',
 'им',
 'иногда',
 'их',
 'к',
 'как',
 'какая',
 'какой',
 'когда',
 'конечно',
 'кто',
 'куда',
 'ли',
 'лучше',
 'между',
 'меня',
 'мне',
 'много',
 'может',
 'можно',
 'мой',
 'моя',
 'мы',
 'на',
 'над',
 'надо',
 'наконец',
 'нас',
 'не',
 'него',
 'нее',
 'ней',
 'нельзя',
 'нет',
 'ни',
 'нибудь',
 'никогда',
 'ним',
 'них',
 'ничего',
 'но',
 'ну',
 'о',
 'об',
 'один',
 'он',
 'она',
 'они',
 'опять',
 'от',
 'перед',
 'по',
 'под',
 'после',
 'потом',
 'потому',
 'почти',
 'при',
 'про',
 'раз',
 'разве',
 'с',
 'сам',
 'свою',
 'себе',
 'себя',
 'сейчас',
 'со',
 'совсем',
 'так

In [6]:
from collections import Counter
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
stemmer = SnowballStemmer("russian")
tokenizer = RegexpTokenizer(r'\w+')
lemmas = {
    "blok": Counter(),
    "cvetaeva": Counter(),
    "pasternak": Counter()
}

def normalize(text: str, label: str):
    global lemmas
    tokens = tokenizer.tokenize(text.lower())
    tokens = [token for token in tokens if token not in string.punctuation]
    
    for token in tokens:
        if token in stop_words:
            continue
        
        normal_form = morph.parse(token)[0].normal_form
        lemmas[label][normal_form] += 1
        
for index, row in data.iterrows():
    normalize(row["text"], row["label"])


pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.



In [17]:
from plotly.subplots import make_subplots

fig = make_subplots(rows=3, cols=1)

def convert_to_df(author_word_frew: dict):
    word_counts_df = pd.DataFrame({
        "word": author_word_frew.keys(),
        "count": author_word_frew.values()
    })
    return word_counts_df

word_counts_blok = convert_to_df(lemmas["blok"]).sort_values(by="count", key=lambda x: -x)
word_counts_pasternak = convert_to_df(lemmas["pasternak"]).sort_values(by="count", key=lambda x: -x)
word_counts_cvetaeva = convert_to_df(lemmas["cvetaeva"]).sort_values(by="count", key=lambda x: -x)


blok = px.histogram(word_counts_blok, x="word", y="count", title="blok")
pasternak = px.histogram(word_counts_pasternak, x="word", y="count", title="pasternak")
cvetaeva = px.histogram(word_counts_cvetaeva, x="word", y="count", title="cvetaeva")


In [20]:
blok.show()
cvetaeva.show()
pasternak.show()