## Get Texts

In [12]:
%autosave 60
import pandas as pd
import numpy as np
import json
import pickle
import os
from tqdm import tqdm
import re
import emoji
import matplotlib.pyplot as plt

from nltk import wordpunct_tokenize # our main tokenizer

Autosaving every 60 seconds


In [4]:
# Directory that holds the per-file comment CSVs read by the loader cell.
# It is left empty here, so it has to be set to a real directory before running.
path_comments = ''

In [5]:
# Read the 'text' column of every CSV file found in `path_comments`.
files = os.listdir(path_comments)

# Collect the per-file arrays in a list and stack them once at the end.
# Growing an array with np.hstack inside the loop copies everything read so
# far on every iteration, which is quadratic in the number of comments.
text_chunks = [np.array([])]  # empty seed keeps the "no files" case working
for filename in tqdm(files, position=0, leave=True):
    comments_values = pd.read_csv(os.path.join(path_comments, filename), usecols=['text'])['text'].to_numpy()
    text_chunks.append(comments_values)

total_text_values = pd.Series(np.hstack(text_chunks), name='text') # Convert to pandas.Series for apply function

100%|████████████████████████████████████████████████████████████████████████████████| 102/102 [00:04<00:00, 24.79it/s]


In [6]:
# Work on a random subset of at most SAMPLE_SIZE comments.
# - min(...) avoids the ValueError pandas raises when asked for more rows than exist.
# - random_state makes the subset (and everything derived from it) reproducible.
SAMPLE_SIZE = 300000
total_text_values = total_text_values.sample(n=min(SAMPLE_SIZE, len(total_text_values)), random_state=42)

## Preprocessing

In [7]:
def text_preprocessing(string):
    """Normalise one raw comment by lower-casing it.

    Parameters
    ----------
    string : str
        Raw comment text.

    Returns
    -------
    str
        The lower-cased text. Values that are not strings (for example the
        float NaN pandas uses for empty CSV cells) are mapped to '' so that
        callers can drop them by filtering out empty strings.
    """
    if not isinstance(string, str):
        return ''
    return string.lower()

In [8]:
# Run the element-wise normalisation over every comment.
total_text_values = total_text_values.map(text_preprocessing)

In [9]:
# Persist only the comments that are still non-empty after preprocessing.
non_empty_mask = total_text_values != ''
preprocessed_csv_path = os.path.join(os.getcwd(), 'comments_preprocessed.csv')
total_text_values[non_empty_mask].to_csv(preprocessed_csv_path, index=False)

In [10]:
# Reload the preprocessed comments as a one-column DataFrame.
# keep_default_na=False stops pandas from converting comments whose whole text
# is e.g. "nan", "null" or "n/a" into float NaN; such rows would otherwise make
# the tokenizer's fit() raise TypeError because it only accepts strings.
total_text_values = pd.read_csv(os.path.join(os.getcwd(), 'comments_preprocessed.csv'), keep_default_na=False)
print(total_text_values.shape)

(300000, 1)


## Convert text to tokens

In [11]:
class WordPunctEmojiTokenizer:
    """Word/punctuation (optionally emoji-aware) tokenizer with a document-frequency vocabulary.

    Parameters
    ----------
    min_word_freq : int
        Minimal number of documents that must contain a word for it to be
        kept in the vocabulary.
    max_word_freq : float
        Maximal fraction of documents that may contain a word for it to be
        kept in the vocabulary.
    parse_emoji : bool
        When True, every token that contains an emoji is split into single
        characters.

    Methods
    -------
    fit(corpus)
        Count word frequencies over a corpus.
    word_to_tok(corpus)
        Build the vocabulary and encode a corpus as integer ids.
    tok_to_word(tokens)
        Decode integer ids back to text.

    Notes
    -----
    Reserved ids: 0 - PAD token, 1 - SOS token, 2 - EOS token, 3 - UNK token.
    Vocabulary ids therefore start at 4.
    """

    PAD = 0
    SOS = 1
    EOS = 2
    UNK = 3
    _FIRST_WORD_ID = 4
    _SPECIAL_TOKENS = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}

    def __init__(self, min_word_freq=5, max_word_freq=0.85, parse_emoji=False):
        from collections import defaultdict

        self.total_samples = 0
        # Id used for out-of-vocabulary words; matches the documented UNK id.
        self.tok_unk = self.UNK
        self.punct = np.array(['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
                               '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\',
                               ']', '^', '_', '`', '{', '|', '}', '~'])

        self.min_word_freq = min_word_freq
        self.max_word_freq = max_word_freq
        self.parse_emoji = parse_emoji

        self.word_total_freq = defaultdict(int)  # total number of occurrences of each word
        self.word_sentence_freq = defaultdict(int)  # number of sentences containing each word

        self.word_tok_dict = defaultdict(int)  # word -> token id
        self.tok_word_dict = defaultdict(str)  # token id -> word

    # ------------------------------------------------------------------
    # tokenization
    # ------------------------------------------------------------------
    def _split_token(self, token):
        """Split a token made only of ASCII punctuation into single characters.

        Any other token is returned unchanged as a one-element list.
        """
        if re.search(r'[^\w\d\s]', token) and np.all(np.isin(list(token), self.punct)):
            return list(token)  # stacked punctuation such as '!!!' -> '!', '!', '!'
        return [token]

    @staticmethod
    def _has_emoji(token):
        """Return True when `token` contains at least one emoji."""
        # emoji < 2.0 exposes get_emoji_regexp(); newer releases dropped it,
        # there emoji_count() answers the same question.
        get_regexp = getattr(emoji, 'get_emoji_regexp', None)
        if get_regexp is not None:
            return get_regexp().search(token) is not None
        return emoji.emoji_count(token) > 0

    def tokenize_sentence(self, sentence):
        """Tokenize `sentence` with wordpunct_tokenize and un-stack punctuation runs."""
        tokens_final = []
        for token in wordpunct_tokenize(sentence):
            tokens_final.extend(self._split_token(token))
        return tokens_final

    def tokenize_emoji_sentence(self, sentence):
        """Like `tokenize_sentence`, but tokens containing emoji are split per character."""
        tokens = []
        for tok in wordpunct_tokenize(sentence):
            if self._has_emoji(tok):
                tokens.extend(list(tok))
            else:
                tokens.extend(self._split_token(tok))
        return tokens

    def _tokenize(self, sentence):
        """Dispatch to the tokenizer selected by `parse_emoji`."""
        if self.parse_emoji:
            return self.tokenize_emoji_sentence(sentence)
        return self.tokenize_sentence(sentence)

    @staticmethod
    def _unwrap(sentence):
        """Return the string held by a one-element row (ndarray or list); pass other values through."""
        if isinstance(sentence, (np.ndarray, list)):
            return sentence[0]
        return sentence

    # ------------------------------------------------------------------
    # fitting / encoding / decoding
    # ------------------------------------------------------------------
    def fit(self, corpus):
        """Accumulate word statistics over `corpus`.

        Parameters
        ----------
        corpus : np.ndarray
            Array of strings, or array of one-element rows holding a string
            (e.g. ``DataFrame.values`` of a single text column).

        Raises
        ------
        TypeError
            If `corpus` is not a numpy array or a sentence is not a string.
        """
        from collections import Counter

        if not isinstance(corpus, np.ndarray):
            raise TypeError(f'corpus has to be numpy.ndarray type, got {type(corpus)}')
        # Accumulate, like the frequency counters do, so repeated fit() calls
        # keep the max_word_freq fraction consistent with the counts.
        self.total_samples += corpus.shape[0]

        for index, sentence in enumerate(tqdm(corpus, position=0, leave=True)):
            sentence = self._unwrap(sentence)
            if not isinstance(sentence, str):
                raise TypeError(f'Sentence must be string, found {type(sentence)} type on index {index}')

            for word, count in Counter(self._tokenize(sentence)).items():
                self.word_total_freq[word] += count  # total frequency
                self.word_sentence_freq[word] += 1  # document frequency

    def _encode_sentence(self, sentence):
        """Encode one sentence as an array of ids; unknown words get the UNK id."""
        sentence = self._unwrap(sentence)
        if not isinstance(sentence, str):
            raise TypeError(f'sentence has to be str, got {type(sentence)}')
        return np.array([self.word_tok_dict.get(tok, self.UNK) for tok in self._tokenize(sentence)])

    def word_to_tok(self, corpus):
        """Build the vocabulary from the fitted statistics and encode `corpus`.

        Words are kept when their document frequency lies in
        ``[min_word_freq, int(max_word_freq * total_samples)]`` and are
        numbered from 4 upwards in order of decreasing document frequency.

        Returns
        -------
        np.ndarray
            1-D object array with one integer id array per sentence. The
            result is always 1-D, also when all sentences have equal length.
        """
        max_docs = int(self.max_word_freq * self.total_samples)
        ranked = sorted(self.word_sentence_freq.items(), key=lambda item: item[1], reverse=True)
        kept = [word for word, doc_freq in ranked if self.min_word_freq <= doc_freq <= max_docs]

        # ids 0..3 are reserved for PAD / SOS / EOS / UNK
        self.word_tok_dict = {word: idx + self._FIRST_WORD_ID for idx, word in enumerate(kept)}
        self.tok_word_dict = {tok: word for word, tok in self.word_tok_dict.items()}  # to reverse our tokenization

        encoded = [self._encode_sentence(sentence) for sentence in tqdm(corpus, position=0, leave=True)]
        # Fill element by element: np.array(encoded, dtype=object) would silently
        # produce a 2-D array whenever every sentence has the same length.
        result = np.empty(len(encoded), dtype='object')
        for position, sequence in enumerate(encoded):
            result[position] = sequence
        return result

    def _decode_sentence(self, toks):
        """Turn a sequence of ids into text; every word is followed by one space."""
        pieces = []
        for tok in toks:
            word = self.tok_word_dict.get(tok)
            if word is None:
                # reserved ids first, anything else is reported as unknown
                word = self._SPECIAL_TOKENS.get(tok, '<UNK>')
            pieces.append(word + ' ')
        return ''.join(pieces)

    def tok_to_word(self, tokens):
        """Convert ids produced by this tokenizer back to words.

        Parameters
        ----------
        tokens : sequence
            Either one sequence of ids, or a sequence whose elements are
            numpy arrays of ids (a batch).

        Returns
        -------
        np.ndarray
            0-d string array for a single sequence (including an empty one),
            1-D string array for a batch.
        """
        if len(tokens) > 0 and isinstance(tokens[0], np.ndarray):
            return np.array([self._decode_sentence(seq) for seq in tqdm(tokens, position=0, leave=True)])
        return np.array(self._decode_sentence(tokens))
    

## Get tokens from raw text

In [14]:
# Build the vocabulary: keep every word (no frequency cut-offs) and split emoji.
corpus_rows = total_text_values.to_numpy()
vocab = WordPunctEmojiTokenizer(parse_emoji=True, min_word_freq=1, max_word_freq=1)
vocab.fit(corpus_rows)

100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [02:05<00:00, 2381.74it/s]


In [15]:
# Encode every comment as an array of integer token ids.
tokens = vocab.word_to_tok(total_text_values.to_numpy())

100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [02:06<00:00, 2375.00it/s]


In [None]:
# TO DO: next release
class BatchTokenGenerator:
    """Slice a corpus of token sequences into consecutive mini-batches.

    Parameters
    ----------
    batch_size : int
        Number of sequences per batch (the last batch may be smaller).
    padding : bool
        When True, a batch whose sequences differ in length is right-padded
        with the PAD id up to the longest sequence in that batch, giving a
        2-D array. When False such a batch is returned as a 1-D object array
        of sequences. Batches whose sequences already share one length are
        returned as a regular array either way.

    TODO: optionally pad to a global maximum length instead of the per-batch
    maximum, and wrap sequences in SOS / EOS ids.
    """

    def __init__(self, batch_size=64, padding=True):
        self.batch_size = batch_size
        self.padding = padding

        self.PAD = 0
        self.SOS = 1
        self.EOS = 2
        self.UNK = 3

    def _collate(self, chunk):
        """Turn one slice of the corpus into a batch array (see class docstring)."""
        rows = list(chunk)
        if len({np.shape(row)[:1] for row in rows}) <= 1:
            # all rows share one length (or are scalars): numpy can stack them directly
            return np.array(rows)

        if self.padding:
            width = max(len(row) for row in rows)
            # Empty rows are skipped when picking the dtype: an empty array
            # defaults to float and would otherwise turn the batch into floats.
            dtype = np.result_type(*[np.asarray(row).dtype for row in rows if len(row)])
            padded = np.full((len(rows), width), self.PAD, dtype=dtype)
            for position, row in enumerate(rows):
                padded[position, :len(row)] = row
            return padded

        ragged = np.empty(len(rows), dtype=object)
        for position, row in enumerate(rows):
            ragged[position] = np.asarray(row)
        return ragged

    def get_batch(self, token_sequence):
        """Yield consecutive batches of `token_sequence`.

        Parameters
        ----------
        token_sequence : np.ndarray
            Array of token sequences.

        Raises
        ------
        TypeError
            If `token_sequence` is not a numpy array (raised when the
            generator is first advanced).
        """
        if not isinstance(token_sequence, np.ndarray):
            raise TypeError(f'token_sequence has to be numpy.ndarray, got {type(token_sequence)}')

        # Stepping by batch_size produces exactly ceil(n / batch_size) batches,
        # so no empty batch is emitted when n is a multiple of batch_size.
        for start in range(0, token_sequence.shape[0], self.batch_size):
            yield self._collate(token_sequence[start: start + self.batch_size])

## Text generator

In [None]:
# from keras.models import Model
# from keras.layers import Dense, Input, GRU
# from keras.layers.embeddings import Embedding

In [20]:
import numpy as np

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, GRU, LSTM
from tensorflow.keras.layers import Embedding, Dropout
from tensorflow.keras.optimizers import RMSprop
from keras.utils import np_utils

from tensorflow.keras.callbacks import LambdaCallback

In [16]:
# Number of distinct words seen by the tokenizer plus its 4 reserved ids.
RESERVED_TOKEN_COUNT = 4
vocab_size = len(vocab.word_total_freq) + RESERVED_TOKEN_COUNT
print('Total comments word size: {}'.format(vocab_size))

Total comments word size: 322640


In [None]:
# TO DO: next release
def seq_generator(sequence, batch_size=64):
    """Yield next-token training pairs from tokenized text.

    Parameters
    ----------
    sequence : np.ndarray
        Tokenized text, passed on to BatchTokenGenerator.get_batch.
    batch_size : int
        Number of sequences per batch.

    Yields
    ------
    (X, y)
        For every position ``idx`` of a batch: ``X`` holds the first
        ``idx + 1`` columns of the batch and ``y`` is the one-hot encoding
        (``vocab_size`` classes, read from module level) of column ``idx + 1``.
    """
    batcher = BatchTokenGenerator(batch_size=batch_size)

    for batch_data in batcher.get_batch(sequence):
        # Column slicing needs a 2-D (rows, positions) array. Batches that come
        # back with another rank (e.g. ragged ones) are skipped explicitly; this
        # used to happen implicitly through a bare `except:` that also hid
        # every other error.
        if batch_data.ndim != 2:
            continue

        seq_len = batch_data.shape[-1]  # number of columns
        if seq_len < 3:
            continue

        for idx in range(seq_len - 1):
            X = batch_data[:, 0: idx + 1]
            y = np_utils.to_categorical(batch_data[:, idx + 1], num_classes=vocab_size)

            yield X, y

In [None]:
# Create dataset from tokenized seq: every proper prefix of a sequence becomes
# one input and the token right after that prefix becomes its one-hot target.
# The loop variable is called `text`; the same name is re-bound later to the
# joined corpus string used for the character-level model.
# NOTE(review): each target is a dense vector of `vocab_size` entries, so this
# presumably needs far more memory than is practical for the whole corpus -
# confirm before running.

X, y = [], []
for text in tqdm(tokens):
    seq_len = len(text) - 1
    for idx in range(seq_len):
        X.append(text[: seq_len - idx])
        y.append(np_utils.to_categorical(text[seq_len - idx], num_classes=vocab_size))

In [None]:
# One sequence per batch, so each batch is a single row of token ids.
g = seq_generator(tokens, batch_size=1)

In [None]:
# Pull one (input, target) pair and show the target token id and its word.
X, y = next(g)
print(np.argmax(y))
print(vocab.tok_to_word([np.argmax(y)]))

## Model training

In [17]:
from __future__ import print_function
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

Using TensorFlow backend.


In [18]:
# Use only the first 10000 preprocessed comments for the character-level model.
# keep_default_na=False keeps comments such as "nan" or "null" as strings; with
# the default NA parsing they become float NaN and break the str.join that
# builds the training text.
total_text_values = pd.read_csv(os.path.join(os.getcwd(), 'comments_preprocessed.csv'), nrows=10000, keep_default_na=False)

In [19]:
# Glue all comments into one training string, separated by a visible marker.
COMMENT_SEPARATOR = ' |||||| '
text = COMMENT_SEPARATOR.join(tqdm(total_text_values['text']))

100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 2484041.46it/s]


In [20]:
# Alphabet of the model: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print('total chars: ', len(chars))

total chars:  800


In [21]:
# Lookup tables between characters and their integer positions in `chars`.
char_indices = {symbol: position for position, symbol in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [22]:
# Cut the corpus into overlapping windows of `maxlen` characters, taken every
# `step` characters; the character following each window is its target.
maxlen = 50
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 305909


In [24]:
# One-hot encode the windows (x) and their following characters (y).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated and
# later removed from NumPy, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

In [25]:
# Character-level language model: one LSTM layer over one-hot windows followed
# by a softmax over the alphabet.
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by recent Keras releases.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [26]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature re-scaling.

    The log-probabilities are divided by `temperature`, re-normalised with a
    softmax, and a single index is drawn from the resulting distribution
    using numpy's global random state.
    """
    probabilities = np.asarray(preds).astype('float64')
    scaled_logits = np.log(probabilities) / temperature
    weights = np.exp(scaled_logits)
    normalised = weights / np.sum(weights)
    draw = np.random.multinomial(1, normalised, 1)
    return np.argmax(draw)

In [27]:
def on_epoch_end(epoch, logs):
    """Keras callback hook: print text generated by the model after an epoch.

    One random seed window is taken from the corpus and continued by 400
    characters for each of four sampling temperatures ("diversity").
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed = text[seed_start: seed_start + maxlen]

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        window = seed  # the last `maxlen` characters, fed back into the model
        for _ in range(400):
            encoded = np.zeros((1, maxlen, len(chars)))
            for position, symbol in enumerate(window):
                encoded[0, position, char_indices[symbol]] = 1.

            probabilities = model.predict(encoded, verbose=0)[0]
            produced = indices_char[sample(probabilities, diversity)]

            window = window[1:] + produced

            sys.stdout.write(produced)
            sys.stdout.flush()
        print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [28]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Keep only the weights of the epoch with the lowest training loss.
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [29]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
# Scale the learning rate by 0.2 once the training loss has not improved for
# 1 epoch; the rate is never lowered below 0.001.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                              patience=1, min_lr=0.001)

In [30]:
# Callbacks run during training: text preview, best-weights checkpoint, LR schedule.
callbacks = [print_callback, checkpoint, reduce_lr]

In [31]:
# Train the character model on the one-hot windows for 5 epochs.
model.fit(x, y, batch_size=128, epochs=5, callbacks=callbacks)

Train on 305909 samples
Epoch 1/5
----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "... txaek gnacek sahmanin krvek. inknakam 150 hogi"
... txaek gnacek sahmanin krvek. inknakam 150 hoging the me in the wante me the noing the me the the the the the me me me non to recer and me the me me wonk the the menter for the on the me me me the for fante me to me the want the me the me to worle the to the winte the the not the me nette to me to me me me to me the me to me me the serte me me the werte the me the wante for for and the me not the noong the to want to me to want the the the me 
----- diversity: 0.5
----- Generating with seed: "... txaek gnacek sahmanin krvek. inknakam 150 hogi"
... txaek gnacek sahmanin krvek. inknakam 150 hogide |||||| черес бы с канале 
















































































































































































а от разное посленн

атление в подыгрывании рапиду и намеренном принижели в тренди не подписчиками на не поставить всегда с подписчиками страны на то поставить на так с комментарий в страны подписчиками |||||| когда не под стороны подписчиками |||||| как всегда так то видео в конце не поставить на такой под ставить не подписчика |||||| посмотрит посмотрит подписчикам в подписчикам в такой россии подписчиками подписчиками не подписчика |||||| который не под подписчика
----- diversity: 0.5
----- Generating with seed: "атление в подыгрывании рапиду и намеренном приниже"
атление в подыгрывании рапиду и намеренном принижелось в тому подписчика, старый подолжение и под хочет в трендей вот своих парена |||||| потом было привет надо по которое вам посмотрит всех достонные ты классное зачем поставью по поставилю в так и в сторона по просто хотит посмотрит обстой всего |||||| как лайк достаня, что должен больше не участе как на тренди в россии в делай подумать в полиция не перестали и дивлати |||||| слушать так стал

<tensorflow.python.keras.callbacks.History at 0x1972b705128>

In [34]:
def generate_text(length, diversity):
    """Generate `length` characters with the trained model.

    A random window of `maxlen` characters from the corpus is used as the
    seed; the seed itself is not part of the returned string. `diversity` is
    the sampling temperature passed to `sample`.
    """
    # Get random starting text
    seed_start = random.randint(0, len(text) - maxlen - 1)
    window = text[seed_start: seed_start + maxlen]

    produced = []
    for _ in range(length):
        encoded = np.zeros((1, maxlen, len(chars)))
        for position, symbol in enumerate(window):
            encoded[0, position, char_indices[symbol]] = 1.

        probabilities = model.predict(encoded, verbose=0)[0]
        symbol_out = indices_char[sample(probabilities, diversity)]

        produced.append(symbol_out)
        window = window[1:] + symbol_out  # slide the context by one character
    return ''.join(produced)

In [39]:
# Generate 100 characters at sampling temperature 0.5 and show them.
print(f'Generated text is: {generate_text(100, 0.5)}')

Generated text is: а жданный людей видео почему видео |||||| ничего все фидет не выпускать видео благода с какая и подп
