<a href="https://colab.research.google.com/github/MrsIgnis/MOCI/blob/main/MOCI_task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [248]:
!pip install nltk pymorphy3



In [249]:
import nltk
import pymorphy3
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [250]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [251]:
stop_words_ru = set(stopwords.words("russian"))
stop_words_en = set(stopwords.words("english"))

In [253]:
with open('/content/test.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [254]:
stemmer_ru = SnowballStemmer("russian")
stemmer_en = SnowballStemmer("english")
lemma_ru = pymorphy3.MorphAnalyzer()

In [255]:
def preprocess_text(text: str) -> tuple[list[str], list[str]]:
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]

    lemmas = [lemma_ru.parse(word)[0].normal_form for word in words if word not in stop_words_ru]
    stems = [stemmer_ru.stem(word) for word in words if word not in stop_words_ru]

    return lemmas, stems

In [256]:
def tokenize_ascii(text: str) -> list[str]:
    return [char for char in text if ord(char) <= 255]

In [257]:
def vectorize_ascii(text: list[str]) -> list[int]:
    return [ord(char) for char in text if ord(char) <= 255]

In [258]:
lemmas, stems = preprocess_text(text)
processed_text = ' '.join(lemmas + stems)
ascii_tokens = tokenize_ascii(processed_text)
ascii_vectors = vectorize_ascii(processed_text)

In [263]:
print("Лемматизированный текст:", lemmas, '\n')
print("Стеммированный текст:", stems, '\n')
print("Токенизированный текст:", ascii_tokens, '\n')
print("Векторизированный текст:", ascii_vectors)

Лемматизированный текст: ['скромняга', 'бард', 'отдыхать', 'дело', 'геральт', 'ривия', 'песня', 'петь', 'when', 'a', 'humble', 'bard', 'graced', 'a', 'ride', 'along', 'with', 'geralt', 'of', 'rivia', 'along', 'came', 'this', 'song'] 

Стеммированный текст: ['скромняг', 'бард', 'отдыха', 'дел', 'геральт', 'рив', 'песн', 'пел', 'when', 'a', 'humble', 'bard', 'graced', 'a', 'ride', 'along', 'with', 'geralt', 'of', 'rivia', 'along', 'came', 'this', 'song'] 

Токенизированный текст: [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'w', 'h', 'e', 'n', ' ', 'a', ' ', 'h', 'u', 'm', 'b', 'l', 'e', ' ', 'b', 'a', 'r', 'd', ' ', 'g', 'r', 'a', 'c', 'e', 'd', ' ', 'a', ' ', 'r', 'i', 'd', 'e', ' ', 'a', 'l', 'o', 'n', 'g', ' ', 'w', 'i', 't', 'h', ' ', 'g', 'e', 'r', 'a', 'l', 't', ' ', 'o', 'f', ' ', 'r', 'i', 'v', 'i', 'a', ' ', 'a', 'l', 'o', 'n', 'g', ' ', 'c', 'a', 'm', 'e', ' ', 't', 'h', 'i', 's', ' ', 's', 'o', 'n', 'g', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'w', 'h', 'e', 'n', ' ', 'a', '