In [None]:
import pandas as pd
import plotly.express as px

# Load the labelled poems; later cells read the "label" and "text" columns.
data = pd.read_csv("../dataset/output.csv")

# Count non-null texts per label. Selecting "text" before counting pins the
# result to exactly two columns after reset_index(), so the rename below
# cannot fail with a length mismatch if the CSV carries extra columns.
poems_by_author = data.groupby("label")["text"].count().reset_index()
poems_by_author.columns = ["author", "text"]

# Share of poems per author; "text" holds the per-author count.
px.pie(poems_by_author, values="text", names="author")

In [None]:
# Word count per poem: split each text on whitespace, then count the pieces.
words_per_poem = data["text"].str.split()
data["text_length"] = words_per_poem.str.len()

In [None]:
# Long-form call (frame + column names) so the x axis is titled after the
# column instead of the generic wide-form "value"; labels and title make the
# figure readable on its own.
px.histogram(
    data,
    x="text_length",
    color="label",
    labels={"text_length": "words per poem", "label": "author"},
    title="Poem length distribution by author",
)

In [None]:
# Keep only poems strictly shorter than the word-count cutoff.
MAX_WORDS_EXCLUSIVE = 1000
is_short_enough = data["text_length"] < MAX_WORDS_EXCLUSIVE
data = data[is_short_enough]

In [None]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
import string


# Fetch the NLTK resources this notebook asks for, in the same order.
for resource in ("popular", "punkt", "punkt_tab"):
    nltk.download(resource)

# Russian stop words as a set for O(1) membership checks; shown as cell output.
stop_words = set(stopwords.words("russian"))
stop_words

In [None]:
from collections import Counter
import pymorphy2

# Text-processing helpers shared by the cells below.
tokenizer = RegexpTokenizer(r'\w+')
stemmer = SnowballStemmer("russian")
morph = pymorphy2.MorphAnalyzer()

# One lemma-frequency counter per author label.
lemmas = {author: Counter() for author in ("blok", "cvetaeva", "pasternak")}

def normalize(text: str, label: str) -> None:
    """Add the lemma counts of one text to the per-author counter in ``lemmas``.

    The text is lower-cased and split by the module-level ``tokenizer``.
    Tokens made up only of punctuation characters and tokens found in
    ``stop_words`` are skipped. Every other token is reduced to the normal
    form of its first ``morph.parse`` result and counted under ``label``.

    Args:
        text: Raw text to tokenize and lemmatize.
        label: Key into ``lemmas``; a new ``Counter`` is created on first use
            when the key is not present yet.

    Returns:
        None. ``lemmas`` is updated in place.
    """
    # The counters are mutated, never rebound, so no ``global`` is needed.
    author_counts = lemmas.setdefault(label, Counter())

    for token in tokenizer.tokenize(text.lower()):
        # \w also matches "_", so underscore runs such as "___" arrive as
        # tokens. Check every character: a plain `token in string.punctuation`
        # is a substring test and would only catch a single "_".
        if all(char in string.punctuation for char in token):
            continue
        if token in stop_words:
            continue

        normal_form = morph.parse(token)[0].normal_form
        author_counts[normal_form] += 1
        
# Feed every (text, label) pair of the filtered frame into the lemma counters.
for poem_text, poem_label in zip(data["text"], data["label"]):
    normalize(poem_text, poem_label)

In [None]:
from plotly.subplots import make_subplots

# Three stacked subplot rows in a single column.
# NOTE(review): `fig` is not referenced by the other cells visible here.
fig = make_subplots(cols=1, rows=3)

def convert_to_df(author_word_frew: dict):
    """Turn a word -> frequency mapping into a two-column DataFrame.

    Args:
        author_word_frew: Mapping from word to its count.

    Returns:
        DataFrame with a "word" column (the keys) and a "count" column
        (the values), one row per entry in the mapping's iteration order.
    """
    rows = list(author_word_frew.items())
    return pd.DataFrame(rows, columns=["word", "count"])

def _word_frequency_figure(author: str):
    """Build the sorted word-count frame and its histogram for one author.

    Returns a ``(counts, figure)`` pair: the lemma counts of ``author`` sorted
    by descending count, and a histogram of them titled with the author key.
    """
    # Sorting on the negated column puts the most frequent lemmas first.
    counts = convert_to_df(lemmas[author]).sort_values(by="count", key=lambda col: -col)
    figure = px.histogram(counts, x="word", y="count", title=author)
    return counts, figure


word_counts_blok, blok = _word_frequency_figure("blok")
word_counts_pasternak, pasternak = _word_frequency_figure("pasternak")
word_counts_cvetaeva, cvetaeva = _word_frequency_figure("cvetaeva")


In [None]:
# Render the three per-author word-frequency figures in order.
for author_figure in (blok, cvetaeva, pasternak):
    author_figure.show()