In [None]:
import typing

In [None]:
from pathlib import Path, PurePath
import re

import pandas as pd
import nltk

nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from pymorphy2 import MorphAnalyzer

In [None]:
# Input CSV; passed to load_data() by the driver cell.
DATASET_RAW_PATH = "./assets/data/raw-data/labeled.csv"
# Root directory handed to process_data_annotation_pipeline() as `root_path`.
# NOTE(review): "ANNOTATEO" / "annotaed" look like typos of "ANNOTATED" / "annotated";
# both the constant name and the path are used at runtime, so they are left as-is here.
DATASET_DIR_ANNOTATEO_PATH = "./assets/data/annotaed-tsv/"
# Column whose value is tokenized and annotated for each row.
LABEL_X = "comment"
# Column whose value, cast to int, indexes the list of class-label directory names.
LABEL_Y = "toxic"
# When truthy, load_data() and save_document_annotation() print progress messages.
LOG = True

In [None]:
def clear_documents(collection: pd.DataFrame | pd.Series) -> pd.DataFrame | pd.Series:
    collection.replace(r"([^ А-ЯЁа-яё])|(\d)", " ", regex=True, inplace=True)
    collection.replace(r"\s{2,}"," ", regex=True, inplace=True)
    collection.replace(r"(^ )|( $)","", regex=True, inplace=True)
    return collection


# Shared Russian-language resources, created once and reused by the helpers in this file.
stopwords_list = stopwords.words("russian")  # consulted by preprocess_document() when filtering
morph_analyzer = MorphAnalyzer()  # bound as the default analyzer of norm_documents()
stemmer = SnowballStemmer("russian")  # bound as the default stemmer of stem_documents()


def preprocess_document(phrase: str, do: typing.Callable) -> str:
    phrase = " ".join([
        do(word)
        for word in phrase.split()
        if phrase not in stopwords_list
    ])
    return phrase


def preprocess_documents(documents: str | typing.Iterable[str], do: typing.Callable) -> typing.Iterable[str]:
    if isinstance(documents, str):
        ret = preprocess_document(documents, do)

    elif isinstance(documents, typing.Iterable):
        for i in range(len(documents)):
            documents[i] = preprocess_document(documents[i], do)
        ret = documents

    else:
        raise TypeError("documents must be str or Iterable[str]")

    return ret

def norm_documents(documents: str | typing.Iterable[str], *, morph_analyzer=morph_analyzer) -> typing.Iterable[str]:
    """Run ``preprocess_documents`` with a lemmatizing per-word callback.

    Args:
        documents: A single string or an iterable of strings; passed through
            to ``preprocess_documents`` unchanged.
        morph_analyzer: Object exposing ``normal_forms(word)``; the first
            normal form it returns replaces each word. Defaults to the
            module-level analyzer.

    Returns:
        Whatever ``preprocess_documents`` returns for the given input.
    """
    return preprocess_documents(
        documents,
        lambda word: morph_analyzer.normal_forms(word)[0],
    )


def stem_documents(documents: str | typing.Iterable[str], *, stemmer=stemmer) -> typing.Iterable[str]:
    """Run ``preprocess_documents`` with a stemming per-word callback.

    Args:
        documents: A single string or an iterable of strings; passed through
            to ``preprocess_documents`` unchanged.
        stemmer: Object exposing ``stem(word)``; its result replaces each
            word. Defaults to the module-level stemmer.

    Returns:
        Whatever ``preprocess_documents`` returns for the given input.
    """
    # The bound method already has the (word) -> str shape the helper expects.
    return preprocess_documents(documents, stemmer.stem)

In [None]:
def load_data(data_path: str, extract_method: typing.Callable[..., pd.DataFrame] = pd.read_csv, **pandas_kwargs: typing.Any) -> pd.DataFrame:
    """Read a dataset with ``extract_method`` and optionally log a preview.

    Args:
        data_path: Location of the data, passed as the first argument to
            ``extract_method``.
        extract_method: Loader to call; defaults to ``pandas.read_csv``.
        **pandas_kwargs: Extra keyword arguments forwarded to
            ``extract_method`` verbatim.

    Returns:
        The object produced by ``extract_method``.
    """
    # A **kwargs annotation describes each *value*, and the builtin `any` is a
    # function rather than a type, hence `typing.Any` in the signature.
    df = extract_method(data_path, **pandas_kwargs)
    if LOG:
        print("Data loaded!   Shape is:", df.shape)
        print(df.head(), "\n\n")
    return df

In [None]:
def prepare_annotation_dirs(root_path: str, labels: typing.Collection[str]) -> None:
    """Make sure ``root_path/<label>`` exists for every label.

    Missing parent directories are created as well, and directories that are
    already present are left alone.

    Args:
        root_path: Directory under which the per-label folders live.
        labels: Names of the sub-directories to create.
    """
    base_dir = PurePath(root_path)
    for label in labels:
        label_dir = Path(base_dir / label)
        label_dir.mkdir(parents=True, exist_ok=True)

def save_document_annotation(root_path: str, documents: typing.Iterable[typing.Iterable[str]] | pd.DataFrame, class_label: str, doc_id: int) -> None:
    """Write one document's annotation table to ``root_path/class_label/<id>.tsv``.

    The file is tab-separated, has a header row and no index column. The file
    name is ``doc_id`` zero-padded to six digits.

    Args:
        root_path: Root directory of the annotation output.
        documents: Annotation rows, either a DataFrame or nested iterables
            that ``pandas.DataFrame`` can be built from.
        class_label: Name of the sub-directory to write into; it must already
            exist.
        doc_id: Integer identifier of the document (formatted with ``d``).
    """
    # The signature accepts plain nested iterables too, but only a DataFrame
    # has .to_csv(), so wrap anything else first.
    if not isinstance(documents, pd.DataFrame):
        documents = pd.DataFrame(documents)

    # "{:6d}" pads with spaces and produced names such as "     7.tsv";
    # zero padding keeps the fixed width without whitespace in the file name.
    file_path = PurePath(root_path) / class_label / "{:06d}.tsv".format(doc_id)
    documents.to_csv(file_path, sep="\t", index=False)

    if LOG:
        print("Document with id", doc_id, "has saved")

def make_annotations_dataframe(document: str, token_extractor_pattern: re.Pattern, lemmatize: typing.Callable[[str], str], stemmatize: typing.Callable[[str], str], stopwords_list: typing.Collection[str] = stopwords) -> pd.DataFrame:    
    words = []

    EMPTY_LINE = ("", "", "")

    sentence_end_ch = [".", "?", "!", "\n"]
    endline_ch = ["\n"]
    is_last_symb_sentence_end = False
    for token in token_extractor_pattern.finditer(document):
        token = token.group().strip(" ")
        
        if token and token not in stopwords_list:          
            if token[-1] in sentence_end_ch:
                is_last_symb_sentence_end = True
            elif is_last_symb_sentence_end:
                is_last_symb_sentence_end = False
                words.append(EMPTY_LINE)
            
            if token not in endline_ch:
                words.append((token, stemmatize(token), lemmatize(token)))


    if words[-1] != EMPTY_LINE:
        words.append(EMPTY_LINE)
    
    ret = pd.DataFrame(data=words, columns=["token", "stem", "lemma"])
    return ret


def process_data_annotation_pipeline(data: pd.DataFrame, root_path: str, labels: list[str] = ["non-toxic", "toxic"], language: str = "russian") -> None:
    """Annotate every row of ``data`` and save one TSV file per document.

    For each row, the text in column ``LABEL_X`` is tokenized, stemmed and
    lemmatized, and the resulting table is written under
    ``root_path/<label>/``. The label is ``labels[int(row[LABEL_Y])]`` and
    the row index is used as the document id.

    Args:
        data: Frame containing the ``LABEL_X`` and ``LABEL_Y`` columns.
        root_path: Root directory of the annotation output; the per-label
            sub-directories are created if missing.
        labels: Directory names, indexed by the integer class value. The
            default list is only read, never modified.
        language: Language name passed to the NLTK stop-word list and the
            Snowball stemmer.
    """
    stopwords_list = stopwords.words(language)
    morph_analyzer = MorphAnalyzer()
    stemmer = SnowballStemmer(language)

    def lemmatize(token: str) -> str:
        return morph_analyzer.normal_forms(token)[0]

    stemmatize = stemmer.stem

    phone_number_regex = r"(\+\d{1,3})?\s?\(?\d{1,4}\)?[\s.-]?\d{3}[\s.-]?\d{4}"
    # The TLD class used to be `[A-Z|a-z]`; inside a character class `|` is a
    # literal pipe, so "user@host.c|m" was accepted as an email address.
    email_regex = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}"
    word_regex = r"([А-Яа-яЁёA-Za-z0-9-]+)"
    sent_end_combinations = r"(!+\?+)|(\?+!+)|(\.{2,})|(\?{2,})|(!{2,})"
    punct = r"([,\.;:\?!\n-])"

    # Alternatives are tried left to right, so the multi-character forms
    # (emails, phone numbers, "?!", "...") must precede the single-word and
    # single-punctuation patterns that would otherwise split them.
    token_pattern = re.compile("|".join([
        email_regex,
        phone_number_regex,
        word_regex,
        sent_end_combinations,
        punct,
    ]))

    prepare_annotation_dirs(root_path, labels)

    for idx, doc in data.iterrows():
        annotation_table = make_annotations_dataframe(
            doc[LABEL_X],
            token_extractor_pattern=token_pattern,
            lemmatize=lemmatize,
            stemmatize=stemmatize,
            stopwords_list=stopwords_list,
        )
        class_label = labels[int(doc[LABEL_Y])]
        save_document_annotation(root_path, annotation_table, class_label, idx)


In [None]:
# Driver: read the raw labelled CSV, then annotate every row and write the
# resulting TSV files under the annotation output directory.
text_documents = load_data(DATASET_RAW_PATH)
process_data_annotation_pipeline(text_documents, DATASET_DIR_ANNOTATEO_PATH)