In [2]:
import numpy as np
import pandas as pd
from IPython.display import display
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import pickle

# Fetch the Punkt tokenizer data that sent_tokenize/word_tokenize rely on.
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\пк\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Global variables

In [3]:
# Number of token positions every encoded question is padded/truncated to.
QUESTION_MAX_WORDS = 20
# Upper bound on the number of (question, next word) training pairs kept after shuffling.
EXTRACT_MAX = 200000
# Directory the Keras model and the pickled tokenizer are saved to / loaded from.
# Keep the trailing '/': file names are appended to it by plain string concatenation.
WEIGHTS_LOCATION = "weights/model2/"
# Path of the '$'-delimited conversation CSV that is read as the training data source.
OUTPUT_FILE = "data/discord_conversation_chopy.csv"

## Tokenization (word selection), stop-word removal, lemmatization, and stemming settings

In [None]:
class NLP:
    """Text-normalization helpers built on NLTK's sentence and word tokenizers."""

    # Matches every character that is NOT a word character, '$' or a plain space.
    # '$' is preserved because the notebook appends it to answers as an end marker.
    _STRIP_PATTERN = re.compile(r'[^\w$ ]')

    @staticmethod
    def extract_tokens(text):
        """Return the lower-cased word tokens of ``text``, sentence by sentence."""
        return [word.lower() for sent in sent_tokenize(text) for word in word_tokenize(sent)]

    @staticmethod
    def extract_sentences(text):
        """Split ``text`` into a list of sentences."""
        return sent_tokenize(text)

    @staticmethod
    def extract_sentences_normalized(text):
        """Return the sentences of ``text``, each rebuilt as lower-cased tokens joined by single spaces."""
        return [' '.join(NLP.extract_tokens(sentence)) for sentence in NLP.extract_sentences(text)]

    @staticmethod
    def normalize_words(array):
        """Normalize every text in ``array`` and return the results as a NumPy array.

        For each entry, all characters other than word characters, ``$`` and the
        space character are removed (this includes punctuation and other
        whitespace such as newlines); the remainder is tokenized and the first
        normalized sentence is kept.

        Args:
            array: iterable of texts. The input is not modified.

        Returns:
            ``np.ndarray`` with one element per input entry. An element is ``None``
            when the entry is not a string or when nothing is left after stripping.

        Raises:
            Any tokenizer error (for example ``LookupError`` when the NLTK data is
            not installed) is propagated instead of being silently turned into ``None``.
        """
        result = []
        for item in array:
            # Non-string cells cannot be normalized; keep the best-effort ``None`` marker.
            if not isinstance(item, str):
                result.append(None)
                continue
            text = NLP._STRIP_PATTERN.sub('', item)
            sentences = NLP.extract_sentences_normalized(text)
            # An empty sentence list means the text contained nothing but stripped characters.
            result.append(sentences[0] if sentences else None)
        return np.array(result)
    
# Smoke test: mixed Russian/English phrases with assorted punctuation
sample_phrases = [
    "Привет! Как дела? Что думаешь о погоде???",
    "Хорошая погода...",
    "Yep! It's right!",
]
normalized_samples = NLP.normalize_words(sample_phrases)
print(normalized_samples)

## Display 'discord_conversation.csv' table

In [None]:
# Read the '$'-delimited conversation file and discard every row that has a missing cell
df = pd.read_csv(OUTPUT_FILE, sep='$').dropna(axis=0, how='any')
display(df)

## Dataset creation from 'discord_conversation.csv'

In [None]:
# First column feeds the questions, second column the answers
dataset = df.to_numpy()
question_column = dataset[:, 0].tolist()
answer_column = dataset[:, 1].tolist()

# Each answer gets a trailing " $" before normalization
pred_questions = NLP.normalize_words(question_column)
pred_answers = NLP.normalize_words([answer + " $" for answer in answer_column])

print(pred_questions, pred_answers, sep="\n")

## Extract 'question' -> 'answer word' pairs from each sentence of the text

In [None]:
# Fixed seed so the shuffled order (and therefore the EXTRACT_MAX cut) is the same on every run
SHUFFLE_SEED = 42

questions = []
answers = []

for pq, pa in zip(pred_questions, pred_answers):
    # Rows whose question or answer could not be normalized are skipped
    if pq is None or pa is None:
        continue

    words = pa.split()

    # One training pair per answer word: the question followed by the answer
    # words produced so far is the input, the next answer word is the target.
    for position, next_word in enumerate(words):
        questions.append(f"{pq} {' '.join(words[:position])}")
        answers.append(next_word)

# Preview the first pairs before they are shuffled
for preview_question, preview_answer in zip(questions[:30], answers[:30]):
    print(f"[{preview_question}] -> [{preview_answer}]")

questions, answers = shuffle(questions, answers, random_state=SHUFFLE_SEED)

questions = questions[:EXTRACT_MAX]
answers = answers[:EXTRACT_MAX]

print(f"\nExtracted: {len(questions)}")

# Training the 'question' -> 'answer' model using different methods

## LSTM (Long Short-Term Memory)

### Data preparation

In [14]:
# '$' is absent from the filter string, so the "$" end-of-answer marker survives as a token
tokenizer = Tokenizer(filters='!"#%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(questions + answers)

# index -> word lookup, used to turn a predicted class index back into a word
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

# The extra slot is index 0, the value pad_sequences fills with (word indices start at 1)
vocab_size = 1 + len(tokenizer.word_index)
data_size = len(questions)

print(f"Model contains {vocab_size} words in dictionary")

# Encode the texts as integer sequences, then pad/truncate them to fixed lengths
Q_temp = tokenizer.texts_to_sequences(questions)
Q = pad_sequences(Q_temp, padding='post', maxlen=QUESTION_MAX_WORDS)

A_temp = tokenizer.texts_to_sequences(answers)
A = pad_sequences(A_temp, padding='post', maxlen=1)

print(f"\nDataset size is: {data_size}")
print(f"Questions shape: {Q.shape}")
print(f"Answers shape: {A.shape}")

Model contains 18849 words in dictionary

Dataset size is: 59557
Questions shape: (59557, 20)
Answers shape: (59557, 1)


### Model load from folder

In [11]:
# Restore the saved network and print its architecture
model = tf.keras.models.load_model(WEIGHTS_LOCATION)
model.summary()

# NOTE: pickle.load can execute arbitrary code from the file —
# only load a dictionary that was written by this notebook's save cell.
with open(os.path.join(WEIGHTS_LOCATION, 'dict.pickle'), 'rb') as handle:
    tokenizer = pickle.load(handle)

# index -> word lookup, used to turn a predicted class index back into a word
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

word_count = len(tokenizer.word_index)
# Same convention as the data-preparation cell: one extra slot for padding index 0,
# so vocab_size matches the size of the model's output layer.
vocab_size = word_count + 1
print("Loaded model contains " + str(word_count) + " words")

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 64)            1206336   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 18849)             1225185   
Total params: 2,464,545
Trainable params: 2,464,545
Non-trainable params: 0
_________________________________________________________________
Loaded model contains 18848 words


### Model creation

In [16]:
model = tf.keras.Sequential([
    # Input: QUESTION_MAX_WORDS word indices per sample, e.g. [26, 125, 1, ..., 0, 0, 0]
    tf.keras.layers.Embedding(vocab_size, 64, input_length=QUESTION_MAX_WORDS),
    tf.keras.layers.LSTM(64),
    # Output: one probability per vocabulary entry, e.g. [0.1, 0.15, 0.35, ..., 0.02, 0.13]
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# One-hot encode the targets only while A still holds integer word indices.
# Without this guard, re-running the cell would one-hot encode the already
# one-hot matrix again.
# NOTE(review): the one-hot matrix has data_size x vocab_size entries; switching to
# 'sparse_categorical_crossentropy' with integer targets would avoid materializing it.
if np.issubdtype(A.dtype, np.integer):
    A = to_categorical(A, num_classes=vocab_size)
print(f"Categorical answers shape: {A.shape}\n")

model.summary()

Categorical answers shape: (59557, 18849)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 64)            1206336   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 18849)             1225185   
Total params: 2,464,545
Trainable params: 2,464,545
Non-trainable params: 0
_________________________________________________________________


### Model training

In [None]:
# Train for 10 epochs on the padded questions (Q) and the answer targets (A);
# validation_split=0.1 holds out 10% of the samples for validation.
model.fit(Q, A, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10

### Model save

In [18]:
# Persist the network itself
model.save(WEIGHTS_LOCATION)

# Store the fitted tokenizer next to the model so the word <-> index mapping can be restored
dictionary_path = WEIGHTS_LOCATION + 'dict.pickle'
with open(dictionary_path, 'wb') as dictionary_file:
    pickle.dump(tokenizer, dictionary_file, protocol=pickle.HIGHEST_PROTOCOL)



INFO:tensorflow:Assets written to: weights/model2/assets


INFO:tensorflow:Assets written to: weights/model2/assets


### Model test

In [None]:
# Seed text and generation limits
input_text = "Тест"
num_words = 50    # maximum number of words to generate
max_repeats = 3   # stop once the same word would be emitted more than this many times in a row

output_text = input_text
current_repeats = 1
last_word = None

# Generate one word per iteration, feeding the growing text back into the model
for _ in range(num_words):
    # Normalize the running text the same way the training questions were normalized
    # (NLP.normalize_words), so inference sees the same token forms as training and
    # always produces exactly one input row.
    normalized_text = NLP.normalize_words([output_text])[0]
    if normalized_text is None:
        # Nothing is left after normalization (e.g. empty or punctuation-only input)
        break

    # Encode the text and pad it to the length the model expects
    encoded_input = pad_sequences(tokenizer.texts_to_sequences([str(normalized_text)]),
                                  padding='post', maxlen=QUESTION_MAX_WORDS)

    # Probability distribution over the next word
    probs_output = model.predict(encoded_input, verbose=0)[0]

    # Most likely word, skipping index 0 (the padding slot); the result is always >= 1
    index = int(np.argmax(probs_output[1:])) + 1
    word = reverse_word_map[index]

    # "$" is the end-of-answer marker appended to every training answer
    if word == "$":
        break

    # Count consecutive repetitions of the same word and stop when the limit is exceeded
    if word == last_word:
        current_repeats += 1
    else:
        current_repeats = 1
    if current_repeats > max_repeats:
        break

    output_text += " " + word
    last_word = word

# Print the output text
print("Входной текст:", input_text)
print("Выходной текст:", output_text)