# Input text genarators

## WikiStorage - class for working with wiki articles stored in MongoDB

In [4]:
from typing import Generator
import pymongo


class WikiStorage:
    """Class for working with MongoDB"""

    db: pymongo.database.Database
    col: pymongo.collection.Collection

    def __init__(self, db: pymongo.database.Database, col: pymongo.collection.Collection):
        self.db = db
        self.col = col

    @classmethod
    def connect(cls, host: str, port=27017, db_name='wiki', col_name='articles'):
        db = pymongo.MongoClient(host, port, unicode_decode_error_handler='ignore')[db_name]
        return cls(
            db=db,
            col=db[col_name])

    def get_articles(self, count=0) -> Generator:
        return self.col.find({}).limit(count)

    def get_article(self, title) -> dict:
        doc = self.col.find_one({'title': title})
        return doc if doc else {}

    def get_articles_headings_texts(self, count=0) -> list:
        for article in self.get_articles(count):
            yield article['text']['Заголовок']['text']

## Postgres storage - base class for all classes working with PostgreSQL

In [5]:
from typing import Generator
import psycopg2

class PostgresStorage:
    
    conn: psycopg2.extensions.connection
    
    def __init__(self, conn):
        self.conn = conn

    @classmethod
    def connect(cls, 
                host: str, 
                port: int = 5432,
                user: str = 'postgres',
                password: str = 'password',
                dbname: str = 'postgres'):
        return cls(conn=psycopg2.connect(
            host=host, port=port, user=user, password=password, dbname=dbname))

## Habr storage - class for working posts from habr stored in PostgreSQL

In [6]:
class HabrStorage(PostgresStorage):

    def get_posts(self, 
                  count: int = 0, 
                  habs_list: list = None, 
                  tags_list: list = None) -> Generator:
        if not habs_list and not tags_list:
            cursor = self.conn.cursor()
            sql = 'SELECT * FROM posts'
            if count:
                sql += ' LIMIT %d' % count
            cursor.execute(sql)
            return (post for post in cursor.fetchall())
        elif habs_list:
            return self.__get_posts_by_habs(count, habs_list)
        elif tags_list:
            return self.__get_posts_by_tags(count, tags_list)

    def get_posts_texts(self,
                        count: int = 0,
                        habs_list: list = None, 
                        tags_list: list = None) -> Generator:
        posts_texts_gen = (post[2] for post in self.get_posts(count, habs_list, tags_list))
        return posts_texts_gen

    def __get_posts_by_habs(self, 
                            count: int,
                            habs_list: list) -> Generator:
        sql = '''SELECT P.* 
                   FROM posts P JOIN habs H ON P.post_id = H.post_id
                  WHERE H.hab in (%s)''' % ''.join(["'" + str(hab) + "', " for hab in habs_list])[:-2]
        sql = sql + " LIMIT %d" % count if count > 0 else sql
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return (post for post in cursor.fetchall())

    def __get_posts_by_tags(self, 
                            count: int,
                            tags_list: list) -> Generator:
        sql = '''SELECT P.* 
                   FROM posts P JOIN tags T ON P.post_id = T.post_id
                  WHERE T.tag in (%s)''' % ''.join(["'" + str(tag) + "', " for tag in tags_list])[:-2]
        sql = sql + " LIMIT %d" % count if count > 0 else sql
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return (post for post in cursor.fetchall())

## Text processors

In [7]:
from typing import Generator, Iterable
import re
import nltk


class Tokenizer:
    to_sentences = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')
    remove_brackets = re.compile(r' \((.*?)\)')
    remove_punctuation = re.compile(r'[^a-zA-Zа-яА-Я ]')

    @classmethod
    def tokenize(cls, text: str, remove_punctuation=True, remove_brackets=True) -> Generator:
        buf = text.split('\n')
        buf = (item for item in buf if item)
        sentences = (sentence[:-1].lower().strip()
                     for sentence in cls.to_sentences.split(' '.join(buf))
                     if sentence[:-1])
        if remove_brackets:
            sentences = (cls.remove_brackets.sub('', sentence) for sentence in sentences)
        if remove_punctuation:
            return (cls.remove_punctuation.sub('', sentence) for sentence in sentences)
        return sentences


class TextProcessor:
    tokenizer = Tokenizer()

    @classmethod
    def get_sentences_gens(cls, texts: Iterable, remove_punctuation=True, remove_brackets=True) -> Generator:
        for text in texts:
            yield cls.tokenizer.tokenize(
                text=text,
                remove_punctuation=remove_punctuation,
                remove_brackets=remove_brackets)

    @classmethod
    def get_text_gen(cls, text_gens_gen: Iterable) -> Generator:
        for text_gen in text_gens_gen:
            for sentences_gen in cls.get_sentences_gens(text_gen):
                for sentence in sentences_gen:
                    yield sentence.split()

    @classmethod
    def get_ngram_gen(cls, text_gens_gen: Iterable, ngram_size: int = 3) -> Generator:
        for text_gen in text_gens_gen:
            for sentences_gen in cls.get_sentences_gens(text_gen):
                for sentence in sentences_gen:
                    yield [''.join(item) for item in nltk.ngrams(sentence, ngram_size)]

## Words encoder - class for encoding/decoding words stored as int nums

In [8]:
import json
from typing import Generator
        
class WordsEncoder:
    
    counter: int
    word2int: dict
    int2word: dict
    begin_word: int = 0
    end_word: int = -1
        
    def __init__(self, counter: int = None, word2int: dict = None, int2word: dict = None):
        self.counter = counter
        self.word2int = word2int
        self.int2word = int2word

    def fit(self, text_corpus):
        self.counter = 0
        self.word2int = {
            self.begin_word: self.begin_word,
            self.end_word: self.end_word
        }
        self.int2word = {
            self.begin_word: self.begin_word,
            self.end_word: self.end_word
        }
        for sentence in text_corpus:
            for word in sentence:
                if word not in self.word2int:
                    self.counter += 1
                    self.word2int[word] = self.counter
                    self.int2word[self.counter] = word
                    
    def fit_encode(self, text_corpus) -> list:
        corpus = list(text_corpus) if isinstance(text_corpus, Generator) else text_corpus
        self.fit(corpus)
        return self.encode_text_corpus_gen(corpus)

    def encode_words_list(self, words_list: list) -> list:
        return [self.word2int[word] for word in words_list]

    def encode_text_corpus(self, text_corpus: list) -> list:
        """List of lists of words"""
        return [self.encode_words_list(words_list) for words_list in text_corpus]

    def encode_text_corpus_gen(self, text_corpus_gen: Generator) -> Generator:
        """List of lists of words"""
        return (self.encode_words_list(sentence) for sentence in text_corpus_gen)

    def decode_codes_list(self, codes_list: list) -> list:
        return [self.int2word[code] for code in codes_list]

    def to_dict(self):
        """
        Returns the underlying data as a Python dict.
        """
        return {
            "counter": self.counter,
            "word2int": self.word2int,
            "int2word": self.int2word
        }

    def to_json(self):
        """
        Returns the underlying data as a JSON string.
        """
        return json.dumps(self.to_dict())

    @classmethod
    def from_dict(cls, obj):

        int2word = obj["int2word"]
        for key in int2word:
            int2word[int(key)] = int2word.pop(key)

        int2word[cls.end_word] = cls.end_word
        int2word[cls.begin_word] = cls.begin_word

        word2int = obj["word2int"]
        word2int[cls.end_word] = int(word2int.pop(str(cls.end_word)))
        word2int[cls.begin_word] = int(word2int.pop(str(cls.begin_word)))

        return cls(
            counter=obj["counter"],
            word2int=word2int,
            int2word=int2word
        )

    @classmethod
    def from_json(cls, json_str):
        return cls.from_dict(json.loads(json_str))

## Encoder storage - class for working with words encoder stored in PostgreSQL

In [9]:
class EncoderStorage(PostgresStorage):
    
    model_name: str
    begin_word: int = 0
    end_word: int = -1
        
    def add_encoder(self, model_name: str, encoder: WordsEncoder):
        self.model_name = model_name
        
        cursor = self.conn.cursor()
        cursor.execute('CALL add_encoder(%s)', [model_name])
        self.conn.commit()
        
        for code, word in encoder.int2word.items():
            sql = f'''INSERT INTO {model_name}_encoder(code, word)
                      VALUES (%s, %s)'''
            cursor.execute(sql, [code, word])           
        self.conn.commit()
        self.__create_indexes(model_name)        
            
    def delete_encoder(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL delete_encoder(%s)', [model_name])
        self.conn.commit()
        
    def load_encoder(self, model_name: str) -> WordsEncoder:
        cursor = self.conn.cursor()
        cursor.execute(f'SELECT code, word FROM {model_name}_encoder')
        int2word = {}
        word2int = {}
        for row in cursor.fetchall():
            code, word = row[0], row[1]
            int2word[code] = word
            word2int[word] = code
        word2int[self.end_word] = int(word2int.pop(str(self.end_word)))
        word2int[self.begin_word] = int(word2int.pop(str(self.begin_word)))
        counter = len(int2word) - 2 # except begin and end words
        return WordsEncoder(counter=counter,
                            int2word=int2word,
                            word2int=word2int)
        
    def __create_indexes(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL create_encoder_indexes(%s)', [model_name]);
        self.conn.commit()
        
    def __drop_indexes(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL drop_encoder_indexes(%s)', [model_name]);
        self.conn.commit()

# Chain storage - realization of markov chain stored in PostgreSQL

In [10]:
import random
import operator
import bisect
import json
import copy


def accumulate(iterable, func=operator.add):
    it = iter(iterable)
    total = next(it)
    yield total
    for element in it:
        total = func(total, element)
        yield total

def compile_next(next_dict):
    words = list(next_dict.keys())
    cff = list(accumulate(next_dict.values()))
    return [words, cff]


class ChainStorage(PostgresStorage):
    
    begin_word: int = 0
    end_word: int = -1
        
    def add_model(self, model_name: str, train_corpus: list, state_size: int):
        model_dict = self.__build_model(train_corpus, state_size)
        
        cursor = self.conn.cursor()
        cursor.execute('CALL add_model(%s, %s)', [model_name, self.end_word])
        self.conn.commit()
        
        for state_tuple in model_dict:
            buf = model_dict[state_tuple]
            choices_list, cumdist_list = buf[0], buf[1] 
            self.__add_state(cursor, model_name, state_tuple, choices_list, cumdist_list)            
        self.conn.commit()
        self.__create_index(model_name)
        
        del model_dict
            
    def delete_model(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL delete_model(%s)', [model_name])
        self.conn.commit()
        
    def walk(self, model_name: str, init_state: list, phrase_len: int = 10):
        cursor = self.conn.cursor()
        cursor.execute(f'SELECT chain_walk_{model_name}(%s, %s)', [init_state, phrase_len])
        return cursor.fetchone()[0] or []
    
    def __build_model(self, train_corpus, state_size: int) -> dict:
        model = {}

        for run in train_corpus:
            items = ([ self.begin_word ] * state_size) + run + [ self.end_word ]
            for i in range(len(run) + 1):
                state = tuple(items[i:i+state_size])
                follow = items[i+state_size]
                if state not in model:
                    model[state] = {}

                if follow not in model[state]:
                    model[state][follow] = 0

                model[state][follow] += 1
                
        model = { state: compile_next(next_dict) for (state, next_dict) in model.items() }
        return model
    
    def __add_state(self,
            cursor,
            model_name: str,
            state: tuple, 
            choices: list, 
            cumdist: list):
        sql = f'''INSERT INTO {model_name}(state, choices, cumdist)
                  VALUES (%s, %s, %s)
                  ON CONFLICT DO NOTHING'''
        cursor.execute(sql, [list(state), choices, cumdist])
    
    def __create_index(self, model_name: str, hash_index: bool = True):
        cursor = self.conn.cursor()
        cursor.execute('CALL create_model_table_index(%s, %s)', [model_name, hash_index]);
        self.conn.commit()
        
    def __drop_index(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL drop_model_table_index(%s)', [model_name]);
        self.conn.commit()

## Work example

In [11]:
encoded_corpus = [
    [1, 5, 6],
    [65, 4, 1, 54],
    [5, 65, 1, 324],
    [3, 6, 54]
]

pg_model = ChainStorage.connect('172.17.0.2', dbname='markov')
pg_model.add_model('test_sample', train_corpus=encoded_corpus, state_size=2)
pg_model.walk('test_sample', [0, 1])

OperationalError: FATAL:  password authentication failed for user "postgres"


In [12]:
pg_model.delete_model('test_sample')

NameError: name 'pg_model' is not defined

## Postgres chain usage with text processor & text encoder

In [13]:
pg_habs = HabrStorage.connect('172.17.0.3', dbname='habr')

texts_list = list(pg_habs.get_posts_texts(10))
train_corpus = list(TextProcessor.get_text_gen([texts_list,]))

encoder = WordsEncoder()
encoded_train_corpus = encoder.fit_encode(train_corpus)

pg_model = ChainStorage.connect('172.17.0.2', dbname='markov')
pg_model.add_model('another_test_sample', train_corpus=encoded_train_corpus, state_size=2)

encoder.decode_codes_list(pg_model.walk('another_test_sample', [0, 0]))

OperationalError: could not connect to server: Connection refused
	Is the server running on host "172.17.0.3" and accepting
	TCP/IP connections on port 5432?


In [14]:
pg_model.delete_model('another_test_sample')

NameError: name 'pg_model' is not defined

# Text generator model based on encoded markov chain

In [15]:
import re
from typing import Iterable

class TextGenerator:
    pg_chain: ChainStorage
    pg_encoder: EncoderStorage
    encoder: WordsEncoder
    model_name: str
    state_size: int
    use_ngrams: bool
    ngram_size: int
    re_process: re.Pattern = re.compile(r'[^a-zA-Zа-яА-Я ]')

    def __init__(self,
                 pg_chain: ChainStorage,
                 pg_encoder: EncoderStorage,
                 model_name: str,
                 state_size: int,
                 input_text: Iterable = None,
                 use_ngrams: bool = False,
                 ngram_size: int = 3):
        self.pg_chain = pg_chain
        self.pg_encoder = pg_encoder
        self.model_name = model_name
        self.state_size = state_size
        self.use_ngrams = use_ngrams
        self.ngram_size = ngram_size

        if input_text:
            self.train_model(input_text)
        else:
            self.encoder = self.pg_encoder.load_encoder(model_name)
            
    def train_model(self, input_text: Iterable):
        if self.use_ngrams:
            train_corpus = list(TextProcessor.get_ngram_gen(input_text, self.ngram_size))
        else:
            train_corpus = list(TextProcessor.get_text_gen(input_text))

        self.encoder = WordsEncoder()
        encoded_train_corpus = self.encoder.fit_encode(train_corpus)

        self.pg_encoder.add_encoder(self.model_name, self.encoder)
        self.pg_chain.add_model(self.model_name, encoded_train_corpus, self.state_size)
    
    def delete_model(self):
        self.pg_chain.delete_model(self.model_name)
        self.pg_encoder.delete_encoder(self.model_name)
        
    def ngrams_split(self, sentence: str) -> list:
        processed_sentence = self.re_process.sub('', sentence.lower())
        ngrams_list = [''.join(item) for item in nltk.ngrams(processed_sentence, self.ngram_size)]
        return ngrams_list

    def words_split(self, sentence: str) -> list:
        words_list = []
        for word in sentence.split():
            processed_word = self.re_process.sub('', word.lower())
            if processed_word:
                words_list.append(processed_word)
        return words_list

    def words_join(self, words_list: list) -> str:
        return ' '.join(words_list)

    def ngrams_join(self, ngrams_list: list) -> str:
        return ngrams_list[0][:-1] + ''.join([ngram[-1] for ngram in ngrams_list])

    def make_sentence(self, init_state: list, **kwargs):
        tries = kwargs.get('tries', 10)
        max_words = kwargs.get('max_words', None)
        min_words = kwargs.get('min_words', None)
        print(init_state)
        
        if init_state is not None:
            init_state = self.encoder.encode_words_list(init_state)
            prefix = init_state
            for word in prefix:
                if word == self.encoder.begin_word:
                    prefix = prefix[1:]
                else:
                    break
        else:
            prefix = []
            
        init_state = init_state[-self.state_size:]
        print('prefix:', prefix)
        print('init_state:', init_state)
        for _ in range(tries):
            codes_list = prefix + self.pg_chain.walk(self.model_name, init_state, 1000)
            words_list = self.encoder.decode_codes_list(codes_list)
            print(len(words_list))
            if (max_words is not None and len(words_list) > max_words) or (
                    min_words is not None and len(words_list) < min_words):
                continue
            if self.use_ngrams:
                return self.ngrams_join(words_list)
            return self.words_join(words_list)
        return None

    def make_sentence_with_start(self, input_phrase: str, **kwargs):
        if self.use_ngrams:
            items_list = self.ngrams_split(input_phrase)
        else:
            items_list = self.words_split(input_phrase)
        items_count = len(items_list)

        if items_count == self.state_size:
            init_state = items_list

        elif 0 < items_count:
            init_state = [self.encoder.begin_word] * (self.state_size - items_count) + items_list
        else:
            init_state = [self.encoder.begin_word] * self.state_size

        return self.make_sentence(init_state, **kwargs)

    def make_sentences_for_t9(self, beginning: str, first_words_count=1, count=20) -> list:
        phrases = set()
        for i in range(count):
            phrase = self.make_sentence_with_start(beginning)
            if phrase:
                words_list = phrase.split()
                if len(words_list) > 1:
                    phrases.add(" ".join(words_list[first_words_count:]))
        return list(phrases)


# Work example

In [16]:
def get_text_gen(
        mongo_wiki: WikiStorage,
        pg_habr: HabrStorage,
        wiki_articles_count=1000,
        habr_posts_count=1000,
        **kwargs
):
    habr_posts_gen = pg_habr.get_posts_texts(
        count=habr_posts_count, habs_list=kwargs.get('habs_list'), tags_list=kwargs.get('tags_list'))
    wiki_articles_gen = mongo_wiki.get_articles_headings_texts(count=wiki_articles_count)
    return (text_gen for text_gen in (habr_posts_gen, wiki_articles_gen))

## Establish connections to dbs

In [17]:
mongo_wiki = WikiStorage.connect(host='localhost')
pg_habr = HabrStorage.connect(host='172.17.0.3', dbname='habr')
pg_chain = ChainStorage.connect(host='172.17.0.2', dbname='markov')
pg_encoder = EncoderStorage.connect(host='172.17.0.2', dbname='markov')

OperationalError: could not connect to server: Connection refused
	Is the server running on host "172.17.0.3" and accepting
	TCP/IP connections on port 5432?


## Train model

In [256]:
%%time
train_corpus_gen = get_text_gen(mongo_wiki=mongo_wiki,
                                pg_habr=pg_habr,
                                wiki_articles_count=10,
                                habr_posts_count=6000,
                                habs_list=['Машинное обучение', 'Математика', 'Физика'])
model = TextGenerator(pg_chain=pg_chain,
                      pg_encoder=pg_encoder,
                      model_name='habr',
                      state_size=3,
                      input_text=train_corpus_gen)

CPU times: user 5.63 s, sys: 3.57 s, total: 9.2 s
Wall time: 25.3 s


## Load model

In [63]:
model = TextGenerator(pg_chain=pg_chain,
                      pg_encoder=pg_encoder,
                      model_name='test_model',
                      state_size=3)

In [259]:
model.make_sentences_for_t9('привет хабр', first_words_count=2, count=10)

['',
 'data engineering становится все более и более популярным keras за год догнал torch который разрабатывается уже лет судя по упоминаниям в научных статьях от до измерений причем выбор конкретного значения сводится к объему доступной памяти gpu',
 'представляю вашему вниманию вторую часть статьи о поиске подозреваемых в мошениничестве на основе данных',
 'представляю вашему вниманию перевод статьи everything you need to know about the android market how to get high rating on play store google play store eda',
 'представляю вашему вниманию перевд статьи solving multiarmed bandits a comparison of epsilongreedy and thompson sampling',
 'задача снижения размерности является одной из важнейших в анализе данных',
 'сегодня мы продолжаем нашу классическую серию статей про то как автомобиль поедет в городских условиях']

## Train model on ngrams

In [16]:
%%time
train_corpus_gen = get_text_gen(mongo_wiki=mongo_wiki,
                                pg_habr=pg_habr,
                                wiki_articles_count=100,
                                habr_posts_count=7000,
                                habs_list=['Машинное обучение', 'Математика', 'Физика'])
ngram_model = TextGenerator(pg_chain=pg_chain,
                            pg_encoder=pg_encoder,
                            model_name='ngram_size3_state3',
                            state_size=3,
                            input_text=train_corpus_gen,
                            use_ngrams=True,
                            ngram_size=3)

CPU times: user 2min 11s, sys: 16.5 s, total: 2min 27s
Wall time: 5min 21s


In [88]:
ngram_model = TextGenerator(pg_chain=pg_chain,
                            pg_encoder=pg_encoder,
                            model_name='ngram_size3_state3',
                            state_size=4,
                            use_ngrams=True,
                            ngram_size=3)

In [48]:
ngram_model.make_sentences_for_t9('метод опорных векторов')

['хабр']

In [89]:
init_state = list(ngram_model.ngrams_split('метод опорных векторов'))
ngram_model.make_sentence(init_state[-4:], min_words=len(init_state) + 1)

['кто', 'тор', 'оро', 'ров']
prefix: [1920, 358, 738, 320]
init_state: [1920, 358, 738, 320]
4
4
4
4
4
4
4
4
4
4


In [94]:
ngram_model.pg_chain.walk(ngram_model.model_name, [1920, 358, 738, 320])

[]

In [1]:
from model import TextGenerator, PostgresStorage, ChainStorage, EncoderStorage

ModuleNotFoundError: No module named 'model'

In [39]:
pg = PostgresStorage.connect('vnkrtv.ru', port=15432, user='vnkrtv', password='Hardpass1337', dbname='vk')
query = pg.exec_query("SELECT text from posts WHERE owner_id = '-179625476'", [])

In [40]:
query = [row[0] for row in query]
query

['Ты погляди какая важная краля🙈👸🏼\nПацанчик прохавал, что к такой дамочке нужен особый подход♨️😏 Поэтому решил удивить ее необычным кальянчиком😈 Это [club132457402|Наносмок] детка😱👍🏼 Один1️⃣ покур и вуаля - телочка уже в ЗАГС зовёт💨🤩 Дело сделано😃\n\nКупил себе домой и пыхаешь спокойно😅 На карантине самое то😉👨🏻\u200d⚕️ \n\n#калик_рекламик',
 'Тут уважаемые пацаны из Санкт-Педымбурга записали кайфовый тречок про калик) Заценим творчество талантливых куряг😅 \n\nЗакачаешься, братва) Хоть на дискатеки ставь😹Танцуют все😈🎶',
 'Шок🤯 Краля ворует у пацана душу😨\nДа шучу😉 Паря увидел что телке стало плохо от нехватки дыма👩🏻💥 И оказал срочную кальянную помощь) Рот в рот конечно же🤣👍🏼 А ху*ли: и спас, и подкатил😀 Нормуль идея😈\n\nМилая сцена🥰 Счастья и здоровья молодым🤙🏼😅',
 'Легендарный кальянчик🎖\nПрямиком из рассказа Джоан Роулинг «Гарри Плотный и Кубок Угля»😨💨 Этот кубок получал победитель турнира трёх кальянщиков👍🏼😱 А вручал награду великий Дымблдор🤯 Одно слово - вещица знатная☝🏼♨️\n\nВот б

In [61]:
pg_chain = ChainStorage.connect('vnkrtv.ru', port=15432, user='vnkrtv', password='Hardpass1337', dbname='markov')
pg_encoder = EncoderStorage.connect('vnkrtv.ru', port=15432, user='vnkrtv', password='Hardpass1337', dbname='markov')

In [68]:
import re
import logging
from typing import Iterable

import nltk

class TextGenerator:
    pg_chain: ChainStorage
    pg_encoder: EncoderStorage
    encoder: WordsEncoder
    model_name: str
    state_size: int
    use_ngrams: bool
    ngram_size: int
    re_process: re.Pattern = re.compile(r'[^a-zA-Zа-яА-ЯёЁ ]')

    def __init__(self,
                 pg_chain: ChainStorage,
                 pg_encoder: EncoderStorage,
                 model_name: str,
                 state_size: int,
                 input_text: Iterable = None,
                 use_ngrams: bool = False,
                 ngram_size: int = 3):
        self.pg_chain = pg_chain
        self.pg_encoder = pg_encoder
        self.model_name = model_name
        self.state_size = state_size
        self.use_ngrams = use_ngrams
        self.ngram_size = ngram_size
        if input_text:
            self.train_model(input_text)
        else:
            self.encoder = self.pg_encoder.load_encoder(model_name)
        logging.info(f'Load model: {self}')

    def train_model(self, input_text: Iterable):
        if self.use_ngrams:
            train_corpus = list(TextProcessor.get_ngram_gen(input_text, self.ngram_size))
        else:
            train_corpus = list(TextProcessor.get_text_gen(input_text))

        self.encoder = WordsEncoder()
        encoded_train_corpus = self.encoder.fit_encode(train_corpus)

        self.pg_encoder.add_encoder(self.model_name, self.encoder)
        self.pg_chain.add_model(self.model_name, encoded_train_corpus, self.state_size)
        logging.info(f'Add new model: {self}')

    def delete_model(self):
        self.pg_chain.delete_model(self.model_name)
        self.pg_encoder.delete_encoder(self.model_name)
        logging.info(f'Delete model: {self}')

    def ngrams_split(self, sentence: str) -> list:
        processed_sentence = self.re_process.sub('', sentence.lower())
        ngrams_list = [''.join(item) for item in nltk.ngrams(processed_sentence, self.ngram_size)]
        return ngrams_list

    def words_split(self, sentence: str) -> list:
        words_list = []
        for word in sentence.split():
            processed_word = self.re_process.sub('', word.lower())
            if processed_word:
                words_list.append(processed_word)
        return words_list

    def words_join(self, words_list: list) -> str:
        return ' '.join(words_list)

    def ngrams_join(self, ngrams_list: list) -> str:
        return ngrams_list[0][:-1] + ''.join([ngram[-1] for ngram in ngrams_list])

    def make_sentence(self, init_state: list, **kwargs):
        tries = kwargs.get('tries', 10)
        max_words = kwargs.get('max_words', None)
        min_words = kwargs.get('min_words', None)

        if init_state is not None:
            init_state = self.encoder.encode_words_list(init_state)
            prefix = init_state
            for word in prefix:
                if word == self.encoder.begin_word:
                    prefix = prefix[1:]
                else:
                    break
        else:
            prefix = []

        for _ in range(tries):
            codes_list = prefix + self.pg_chain.walk(self.model_name, init_state, 100)
            words_list = self.encoder.decode_codes_list(codes_list)
            if (max_words is not None and len(words_list) > max_words) or (
                    min_words is not None and len(words_list) < min_words):
                continue
            if self.use_ngrams:
                return self.ngrams_join(words_list)
            return self.words_join(words_list)
        return None

    def make_sentence_with_start(self, input_phrase: str, **kwargs):
        if self.use_ngrams:
            items_list = self.ngrams_split(input_phrase)
        else:
            items_list = self.words_split(input_phrase)
        items_count = len(items_list)

        if items_count == self.state_size:
            init_state = items_list

        elif 0 < items_count < self.state_size:
            init_state = [self.encoder.begin_word] * (self.state_size - items_count) + items_list
        else:
            init_state = [self.encoder.begin_word] * self.state_size

        return self.make_sentence(init_state, **kwargs)

    def make_sentences_for_t9(self,
                              beginning: str,
                              first_words_count: int = 1,
                              count: int = 30,
                              phrase_len: int = 5,
                              **kwargs) -> list:
        phrases = set()
        logging.info("Model '%s' - beginning: %s", self.model_name, beginning)
        for i in range(count):
            phrase = self.make_sentence_with_start(beginning, min_words=phrase_len, **kwargs)
            print(phrase)
            if phrase:
                words_list = phrase.split()
                if 1 < len(words_list) >= phrase_len:
                    phrases.add(" ".join(words_list[first_words_count:]))
        logging.info("Model '%s' - executed: %s", self.model_name, '\n'.join(phrases))
        return list(phrases)

    def __repr__(self):
        return '<TextGenerator: model_name=%s, state_size=%s, ngrams=%s>' % (
            self.model_name,
            self.state_size,
            str(self.use_ngrams) + ', ngram_size=' + str(self.ngram_size) if self.use_ngrams else self.use_ngrams)

In [69]:
ngram_model = TextGenerator(pg_chain=pg_chain,
                            pg_encoder=pg_encoder,
#                             input_text=query,
                            model_name='kalik',
                            state_size=4,
                            use_ngrams=True,
                            ngram_size=3)

In [70]:
a = ngram_model.make_sentence_with_start('пыхтячит')

In [72]:
a

'требуем реформу образования петров ты почему не место на этот миф придумали жижабляди чтоб не растерял'

In [2]:
from typing import Generator

import psycopg2


class PostgresStorage:
    """
    Base class for working with PostgreSQL
    """

    conn: psycopg2.extensions.connection

    def __init__(self, conn):
        self.conn = conn

    @classmethod
    def connect(cls,
                host: str,
                port: int = 5432,
                user: str = 'postgres',
                password: str = 'password',
                dbname: str = 'postgres'):
        return cls(conn=psycopg2.connect(
            host=host, port=port, user=user, password=password, dbname=dbname)
        )

    def exec_query(self, query: str, params: list) -> Generator:
        cursor = self.conn.cursor()
        cursor.execute(query, params)
        return cursor.fetchall()

    def exec(self, sql: str, params: list):
        cursor = self.conn.cursor()
        try:
            cursor.execute(sql, params)
        except psycopg2.Error as e:
            self.conn.rollback()
            raise e
        self.conn.commit()


In [4]:
pg = PostgresStorage.connect('vnkrtv.ru', port=15432, user='vnkrtv', password='Hardpass1337', dbname='vnkrtv')
query = pg.exec_query("SELECT text from posts INNER JOIN habs on posts.post_id = habs.post_id WHERE hab = 'Машинное обучение' LIMIT 2200", [])

In [7]:
words_count = 0
for text in query:
    words_count += len(text[0].split())
words_count

3703249