# Подготовка

In [None]:
import pandas as pd
from pathlib import Path
import docx
from pypdf import PdfReader
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import pymorphy3

In [None]:
# Hand-picked Russian function words that are dropped during token filtering.
# Every word is listed exactly once (the earlier version repeated 15 of them,
# which a set silently collapses). The grouping below is only a rough reading
# aid; order has no effect because this is a set.
additional_stopwords = {
    # prepositions
    "без", "безо", "в", "во", "для", "до", "за", "из", "изо", "к", "ко",
    "на", "над", "надо", "о", "об", "обо", "от", "ото", "по", "под", "подо",
    "при", "про", "с", "со", "у", "через", "чрез",
    # conjunctions
    "а", "и", "тоже", "также", "но", "однако", "зато", "же", "или", "либо",
    "то", "что", "чтобы", "как", "будто", "когда", "пока", "едва", "потому",
    "так", "ибо", "оттого", "если", "бы", "раз", "коли", "хотя", "хоть",
    "пускай", "словно", "точно",
    # particles
    "давай", "давайте", "пусть", "б", "даже", "именно", "только", "лишь",
    "исключительно", "единственно", "просто", "прямо", "вот", "вон", "это",
    "ли", "ль", "разве", "неужели", "не", "ни", "конечно", "всего",
    # interrogative / relative words
    "где", "куда", "откуда", "зачем", "почему", "отчего", "сколько",
    "насколько", "кто", "какой", "каков", "который", "чей",
}

In [None]:
# Download the NLTK resources named 'punkt', 'punkt_tab' and 'stopwords'.
# The tokenizers used later are called with language="russian".
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Часть 1

In [None]:


## Часть 1

def load_csv(path: Path, sep: str = ";") -> pd.DataFrame:
    """Read a delimited text file into a DataFrame.

    Args:
        path: Location of the CSV file.
        sep: Field delimiter; defaults to a semicolon.

    Returns:
        The table exactly as ``pandas.read_csv`` parses it.
    """
    return pd.read_csv(path, sep=sep)

def load_txt(path: Path, sep: str = "SEP") -> pd.Series:
    """Load a plain-text corpus and split it into individual documents.

    Args:
        path: Location of the text file; it is decoded as UTF-8.
        sep: Marker string that separates documents inside the file.

    Returns:
        A ``string``-dtype Series with one stripped document per row.
        Chunks that are empty or whitespace-only are dropped.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, which
    # would garble or reject Cyrillic text.
    text = path.read_text(encoding="utf-8")
    # Strip first, then filter, so whitespace-only chunks (e.g. the newline
    # after a trailing separator) do not survive as empty rows.
    parts = [stripped for chunk in text.split(sep) if (stripped := chunk.strip())]
    return pd.Series(parts, dtype="string")

def load_docx(path: Path, sep: str = "SEP") -> pd.Series:
    """Load a .docx corpus and split it into individual documents.

    Args:
        path: Location of the Word document.
        sep: Marker string that separates documents inside the text.

    Returns:
        A ``string``-dtype Series with one stripped document per row.
        Chunks that are empty or whitespace-only are dropped.
    """
    doc = docx.Document(path)
    # Join paragraphs with a newline: joining with "" would glue the last word
    # of one paragraph to the first word of the next.
    text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
    parts = [stripped for chunk in text.split(sep) if (stripped := chunk.strip())]
    return pd.Series(parts, dtype="string")

def load_pdf(path: Path, sep: str = "SEP") -> pd.Series:
    """Load a PDF corpus and split it into individual documents.

    Args:
        path: Location of the PDF file.
        sep: Marker string that separates documents inside the text.

    Returns:
        A ``string``-dtype Series with one stripped document per row.
        Chunks that are empty or whitespace-only are dropped.
    """
    reader = PdfReader(path)
    # Join pages with a newline: joining with "" would glue the last word of
    # one page to the first word of the next.
    text = "\n".join(page.extract_text() for page in reader.pages)
    parts = [stripped for chunk in text.split(sep) if (stripped := chunk.strip())]
    return pd.Series(parts, dtype="string")



def load(path: Path) -> pd.DataFrame | pd.Series:
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    match path.suffix:
        case ".csv":
            return load_csv(path)
        case ".txt":
            return load_txt(path)
        case ".docx":
            return load_docx(path)
        case ".pdf":
            return load_pdf(path)
        case _:
            raise ValueError
        

# Часть 2

In [None]:
def _clean(text: str) -> str:
    """Normalise one document.

    Lower-cases the text, deletes ASCII digits and the punctuation marks
    ``, . ; :``, then collapses each run of spaces into a single space.
    """
    lowered = text.lower()
    without_digits = re.sub('[0-9]', "", lowered)
    without_punct = re.sub('[,.;:]', '', without_digits)
    return re.sub(' +', ' ', without_punct)


def clean(text_series: pd.Series) -> pd.Series:
    """Return a new Series with ``_clean`` applied to every element.

    The input Series is left unmodified.
    """
    return text_series.apply(_clean)


# Часть 3

In [None]:

# Shared NLP resources, created once and reused by the helper functions below.
stemmer = SnowballStemmer("russian")
morph = pymorphy3.MorphAnalyzer()

# NOTE: only the hand-written list is used here; NLTK's stock Russian list
# (stopwords.words('russian')) is currently not merged in.
all_stopwords = set(additional_stopwords)


def flatten(x):
    """Lazily yield the leaves of an arbitrarily nested list/tuple structure.

    Only ``list`` and ``tuple`` items are descended into; every other item
    (including strings) is yielded as-is, in depth-first, left-to-right order.
    """
    # Explicit stack of iterators instead of recursion.
    pending = [iter(x)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, (list, tuple)):
                # Descend; the partially consumed parent stays on the stack
                # and is resumed once the child is exhausted.
                pending.append(iter(item))
                break
            yield item
        else:
            pending.pop()

def _token_word(text: str) -> list[str]:
    """Split *text* into word tokens with NLTK's Russian settings."""
    words = word_tokenize(text, language="russian")
    return words


def _token_sent(text: str) -> list[str]:
    """Split *text* into sentences with NLTK's Russian settings."""
    sentences = sent_tokenize(text, language="russian")
    return sentences


def get_tokens_sent(series: pd.Series) -> pd.Series:
    """Sentence-tokenise every document and return all sentences as one flat Series.

    Args:
        series: One document per element.

    Returns:
        A ``string``-dtype Series of sentences in document order.
    """
    sentences = [sentence for document in series for sentence in _token_sent(document)]
    return pd.Series(sentences, dtype="string")


def get_tokens_word(series: pd.Series) -> pd.Series:
    """Word-tokenise every document and return all tokens as one flat Series.

    Args:
        series: One document per element.

    Returns:
        A ``string``-dtype Series of tokens in document order.
    """
    words = [word for document in series for word in _token_word(document)]
    return pd.Series(words, dtype="string")

def stem_text(tokens) -> pd.Series:
    """Stem every token with the module-level Snowball stemmer.

    Returns:
        A ``string``-dtype Series of stems, in the same order as *tokens*.
    """
    stems = list(map(stemmer.stem, tokens))
    return pd.Series(stems, dtype="string")


def lemmatize_text(tokens) -> pd.Series:
    """Replace every token with its pymorphy3 normal form.

    Returns:
        A ``string``-dtype Series of lemmas, in the same order as *tokens*.
    """
    # morph.parse() returns candidate analyses; the first one is taken as the
    # most probable reading of the word.
    lemmas = [morph.parse(word)[0].normal_form for word in tokens]
    return pd.Series(lemmas, dtype="string")


def delete_stop(series: pd.Series) -> pd.Series:
    """Drop every token that appears in ``all_stopwords``.

    Args:
        series: Tokens to filter.

    Returns:
        A ``string``-dtype Series of the remaining tokens, original order kept.
    """
    kept = [token for token in series if token not in all_stopwords]
    return pd.Series(kept, dtype="string")

def _tokenise(series: pd.Series) -> pd.Series:
    """Run the full token pipeline over a Series of documents.

    Steps, in order: word tokenisation, stop-word removal, stemming,
    lemmatisation.

    NOTE(review): the lemmatizer receives stems rather than original word
    forms, and stop words are removed before any normalisation, so inflected
    stop words are not filtered. Confirm this order is intended.
    """
    words = get_tokens_word(series)
    content_words = delete_stop(words)
    stems = stem_text(content_words)
    return lemmatize_text(stems)

In [None]:
# Corpus file used by the demo run in the next cell (plain-text variant).
path = "news_5k.txt"

In [None]:
# load() calls path.exists(), so the string set above is converted to a Path.
path = Path(path)


# Tokenizer resources (also downloaded in the setup cell).
nltk.download('punkt')
nltk.download('punkt_tab')
# The 'stopwords' download is disabled in this cell.
# nltk.download('stopwords')

# Demo of the three stages: load -> clean -> tokenise, printing each result.
serega = load(path)
print(serega)
sergei = clean(serega)
print(sergei)
sergei_sergeevich = _tokenise(sergei)
print(sergei_sergeevich)


In [None]:
def _print_summary(values) -> None:
    """Print a 40-dash rule, ``values.info()``, another rule, then ``values.describe()``."""
    rule = "-" * 40
    print(rule)
    # info() writes its report to stdout and returns None, so it must not be
    # wrapped in print() -- that would emit a stray "None" line.
    values.info()
    print(rule)
    print(values.describe())


def statistics(path):
    """Print descriptive statistics for a corpus file.

    Five reports are printed in order: the cleaned documents, the word
    tokens, the word-token lengths, the sentences, and the sentence lengths.

    NOTE(review): sentences are tokenised from the *cleaned* text, in which
    ``clean`` has already deleted ``.``; sentence boundaries marked by periods
    are therefore gone. Confirm this is intended.

    Args:
        path: Corpus location, as a ``Path`` or a plain string.
    """
    # Callers in this file pass plain strings; load() needs Path methods.
    data = clean(load(Path(path)))
    _print_summary(data)

    tokens_word = get_tokens_word(data)
    _print_summary(tokens_word)
    _print_summary(tokens_word.apply(len))

    tokens_sent = get_tokens_sent(data)
    _print_summary(tokens_sent)
    _print_summary(tokens_sent.apply(len))

In [None]:

# The four corpus files, one per format handled by load().
p1 = "news_5k.txt"
p2 = "news_5k.docx"
p3 = "news_5k.pdf"
p4 = "news_5k.csv"

In [None]:
# Statistics for the .txt corpus.
statistics(p1)

In [None]:
# Statistics for the .docx corpus.
statistics(p2)

In [None]:
# Statistics for the .pdf corpus.
statistics(p3)

In [None]:
# Statistics for the .csv corpus.
# NOTE(review): load_csv returns a DataFrame while clean() applies a per-string
# function -- verify this call works for the CSV input.
statistics(p4)