In [1]:
from collections import Counter
from nltk.stem.snowball import RussianStemmer

import json
import re

In [2]:
def cleaning(word: str) -> bool:
    """Tell whether a raw token should be kept for further processing.

    A token is rejected when it is empty (falsy) or when the pattern below
    matches it, which in practice means the token contains a digit.
    Everything else is accepted.
    """
    return bool(word) and re.match(r'.*\d+.*', word) is None

def get_words(text: str):
    """Tokenize ``text`` and return a lazy iterator over its stemmed tokens.

    Steps, in order:
      1. regex tokenization (the two fixed phrases and ``word-word`` pairs
         are kept as single tokens, everything else is split on ``\\w+``);
      2. tokens rejected by ``cleaning`` are dropped;
      3. each token is lower-cased and 'ё' is replaced with 'е';
      4. each token is stemmed with ``RussianStemmer``.

    The result is a ``map`` object, so it can be consumed only once.
    """
    raw_tokens = re.findall(r'[Нн]а самом деле|[Вв] общем|\w+-\w+|\w+', text)
    stemmer = RussianStemmer()
    prepared = (
        token.lower().replace('ё', 'е')
        for token in filter(cleaning, raw_tokens)
    )
    return map(stemmer.stem, prepared)

In [3]:
# Load the raw messages. The encoding is pinned to UTF-8 (the standard JSON
# encoding) so that Cyrillic text is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open('texts.json', 'r', encoding='utf-8') as f:
    texts = json.load(f)

In [4]:
# Accumulators for the corpus statistics: a per-word counter and the number
# of documents that have been counted so far.
corpus_words, total_documents = Counter(), 0

In [5]:
# Build document-frequency statistics: for every word, the number of
# documents it occurs in. The accumulators are reset here so that re-running
# this cell recomputes the statistics instead of adding to the previous run.
corpus_words = Counter()
total_documents = 0

for text in texts:
    author, message = text['author'], text['message']
    if not (author and message):
        # Skip entries that lack an author or a message body.
        continue
    # A set keeps each stem once, so a word counts at most once per document.
    unique_words = set(get_words(message))
    if not unique_words:
        # Nothing usable was extracted; do not count this as a document.
        continue
    total_documents += 1
    corpus_words.update(unique_words)

In [6]:
total_documents

459

In [7]:
# Dictionary view over every stem collected in corpus_words. It is a live
# view, not a snapshot: later changes to corpus_words show up in it.
key_words = corpus_words.keys()

In [8]:
def filter_keys(word: str, max_document_share: float = 0.8, min_document_count: int = 15) -> bool:
    """Return True when ``word`` qualifies as a key word.

    A word is rejected when it occurs in ``min_document_count`` documents or
    fewer (too rare), or in at least ``max_document_share`` of all documents
    (too common). Reads the module-level ``corpus_words`` and
    ``total_documents``.

    Args:
        word: Stem to look up in ``corpus_words``.
        max_document_share: Share of documents at or above which a word is
            considered too common.
        min_document_count: Document count at or below which a word is
            considered too rare.
    """
    document_count = corpus_words[word]
    # Check the count first: an unseen word (count 0) is rejected without
    # dividing, so an empty corpus cannot trigger a ZeroDivisionError.
    if document_count <= min_document_count:
        return False
    return document_count / total_documents < max_document_share

# Materialize the selection as a list: a lazy ``filter`` object can be consumed
# only once, so any later cell that is re-run would otherwise see it empty.
filtered_keys = [word for word in key_words if filter_keys(word)]

In [9]:
# Assemble the payload before opening the file, so that a failure while
# building it cannot leave an already-truncated corpus.json behind.
to_write = {
    'documents_number': total_documents,
    'key_words': list(filtered_keys),
    'corpus': corpus_words
}

# ensure_ascii=False writes Cyrillic characters as-is, so the file encoding is
# pinned to UTF-8 instead of relying on the platform's default encoding.
with open('corpus.json', 'w', encoding='utf-8') as f:
    json.dump(to_write, f, indent=4, ensure_ascii=False)