In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Dense
from tensorflow.keras.models import Sequential
import numpy as np
import pandas as pd

In [13]:
# Sample data preparation (assuming you have your data in a pandas DataFrame)
# Replace this with your actual DataFrame
data = pd.DataFrame({
    'input_text': ['i ike eting cakes', 'i go to shool everydy'],
    'target_text': ['i like eating cakes', 'i go to school everyday']
})

data = pd.read_csv('/kaggle/input/dop-test-files/errors.csv').drop(columns=['Unnamed: 0'])
data

Unnamed: 0,label,preds
0,иностранный агент,иностранный аген
1,свидетельствуют о проблемах с печенью,свидетельствуют о праблемах спецнью
2,найдите способ быть полезными другим людям,найдиче сьпособ быть полезном другим людем
3,я уже поставил белье в стирку,я уже поставил бельо встиру
4,круглый мяч,круглый мядчь
...,...,...
1475,летим в отпуск на гавайи,влетим в отпуск наговаи
1476,кротовая настойка,кротовая на стойка
1477,думаю нам пора расходиться по домам,домаю ном порарасходится по домам
1478,красивые цветы украшают сад,красивые цветы укрошают сад


In [31]:
# Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['preds'].tolist() + data['label'].tolist())

# Convert text to sequences
input_sequences = tokenizer.texts_to_sequences(data['preds'].tolist())
target_sequences = tokenizer.texts_to_sequences(data['label'].tolist())

# Padding sequences to the same length
max_seq_len = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in target_sequences))
padded_input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='post')
padded_target_sequences = pad_sequences(target_sequences, maxlen=max_seq_len, padding='post')

# Define model parameters
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 1024
rnn_units = 512

In [32]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_len))
model.add(LSTM(rnn_units, return_sequences=True))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare target data for training
padded_target_sequences = np.expand_dims(padded_target_sequences, -1)

# Train the model
model.fit(padded_input_sequences, padded_target_sequences, epochs=50, batch_size=50)

Epoch 1/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.6166 - loss: 5.2630
Epoch 2/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.7100 - loss: 2.6536
Epoch 3/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.7089 - loss: 2.5459
Epoch 4/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.7055 - loss: 2.4733
Epoch 5/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.7180 - loss: 2.2910
Epoch 6/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.7084 - loss: 2.2849
Epoch 7/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.7151 - loss: 2.1496
Epoch 8/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.7204 - loss: 2.0189
Epoch 9/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7e24120bfcd0>

In [39]:
def predict(input_text):
    # Preprocess the input text
    input_seq = tokenizer.texts_to_sequences([input_text])
    padded_input_seq = pad_sequences(input_seq, maxlen=max_seq_len, padding='post')

    # Predict the output sequence
    predictions = model.predict(padded_input_seq)
    predicted_sequence = np.argmax(predictions, axis=-1)
    
    # Convert the predicted sequence back to text
    decoded_sentence = ' '.join(tokenizer.index_word.get(index, '') for index in predicted_sequence[0])
    
    return decoded_sentence.strip()

# Example prediction
corrected_text = predict('мотивация должна быыыть всегдаааа')
print('Corrected Text:', corrected_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Corrected Text: 
