In [5]:
import pandas as pd


In [37]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, TimeDistributed, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Представим, что у нас есть набор данных с предложениями
english_sentences = pd.read_csv("translations.csv")['ru'][:10]
tatar_sentences = pd.read_csv("translations.csv")['tat'][:10]

# Токенизация текста
tokenizer_eng = Tokenizer()
tokenizer_tat = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
tokenizer_tat.fit_on_texts(tatar_sentences)

# Преобразование текста в последовательности чисел
sequences_eng = tokenizer_eng.texts_to_sequences(english_sentences)
sequences_tat = tokenizer_tat.texts_to_sequences(tatar_sentences)

# Паддинг последовательностей для выравнивания длины
max_len_eng = max(len(x) for x in sequences_eng)
max_len_tat = max(len(x) for x in sequences_tat)
max_len = max(max_len_eng, max_len_tat)  # Выровнять длину последовательностей по максимальному значению

padded_eng = pad_sequences(sequences_eng, maxlen=max_len, padding='post')
padded_tat = pad_sequences(sequences_tat, maxlen=max_len, padding='post')

# Параметры модели
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_tat = len(tokenizer_tat.word_index) + 1
embedding_dim = 128
units = 128

# Построение модели
model = Sequential([
    Embedding(vocab_size_eng, embedding_dim, input_length=max_len),
    Bidirectional(LSTM(units, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(units, return_sequences=True)),
    TimeDistributed(Dense(vocab_size_tat, activation='softmax'))
])



In [39]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Преобразование выходных данных для обучения
padded_tat = np.expand_dims(padded_tat, -1)

# Увеличение количества эпох обучения
model.fit(padded_eng, padded_tat, epochs=100, batch_size=2, validation_split=0.2)

# Функция для перевода текста
def translate_sentence(sentence):
    # Токенизация и паддинг ввода
    sequence = tokenizer_eng.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    
    # Предсказание
    prediction = model.predict(padded_sequence)
    predicted_sentence = np.argmax(prediction, axis=-1)
    
    # Преобразование предсказанных индексов в слова, убираем '[UNK]'
    translated_sentence = ' '.join(tokenizer_tat.index_word.get(i) for i in predicted_sentence[0] if i != 0)
    print('Токенизированная последовательность:', sequence)
    print('Предсказанные индексы:', predicted_sentence[0])
    return translated_sentence

# Тест функции перевода
user_sentence = "The media routinely talk about a historic turn, but people who have been involved in Korean affairs for decades do not share this optimism."
translated_sentence = translate_sentence(user_sentence)
print('Перевод:', translated_sentence)

Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 521ms/step - accuracy: 0.3134 - loss: 2.5892 - val_accuracy: 0.8857 - val_loss: 0.7649
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.4687 - loss: 2.1461 - val_accuracy: 0.8351 - val_loss: 0.8194
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.2796 - loss: 2.4542 - val_accuracy: 0.8478 - val_loss: 0.7940
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.3933 - loss: 2.0091 - val_accuracy: 0.8351 - val_loss: 0.8173
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.3057 - loss: 2.0969 - val_accuracy: 0.7845 - val_loss: 0.8956
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.2814 - loss: 2.1083 - val_accuracy: 0.8224 - val_loss: 0.8347
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━

In [40]:
def translate_sentence(sentence):
    # Токенизация и паддинг ввода
    sequence = tokenizer_eng.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    
    # Предсказание
    prediction = model.predict(padded_sequence)
    predicted_sentence = np.argmax(prediction, axis=-1)
    
    # Преобразование предсказанных индексов в слова, убираем '[UNK]'
    translated_sentence = ' '.join(tokenizer_tat.index_word.get(i) for i in predicted_sentence[0] if i != 0)
    return translated_sentence

# Тест функции перевода
user_sentence = "Также преступления замышляются по дороге из школы домой."
translated_sentence = translate_sentence(user_sentence)
print(translated_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
шулай ук җинаятьләрне мәктәптән өйгә кайтканда уйлап табалар


In [41]:
model.save('translation_model.keras')

import pickle
with open('tokenizer_eng.pkl', 'wb') as handle:
    pickle.dump(tokenizer_eng, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('tokenizer_tat.pkl', 'wb') as handle:
    pickle.dump(tokenizer_tat, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, TimeDistributed, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
# Представим, что у нас есть набор данных с предложениями
english_sentences = pd.read_csv("concatenated_dataset.csv")['en'][:10]

tatar_sentences = pd.read_csv("concatenated_dataset.csv")['tat'][:10]

# Токенизация текста
tokenizer_eng = Tokenizer()
tokenizer_tat = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
tokenizer_tat.fit_on_texts(tatar_sentences)

# Преобразование текста в последовательности чисел
sequences_eng = tokenizer_eng.texts_to_sequences(english_sentences)
sequences_tat = tokenizer_tat.texts_to_sequences(tatar_sentences)

# Паддинг последовательностей для выравнивания длины
max_len_eng = max(len(x) for x in sequences_eng)
max_len_tat = max(len(x) for x in sequences_tat)
max_len = max(max_len_eng, max_len_tat)  # Выровнять длину последовательностей по максимальному значению

padded_eng = pad_sequences(sequences_eng, maxlen=max_len, padding='post')
padded_tat = pad_sequences(sequences_tat, maxlen=max_len, padding='post')

# Параметры модели
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_tat = len(tokenizer_tat.word_index) + 1
embedding_dim = 128
units = 128
# Построение модели
model = Sequential([
    Embedding(vocab_size_eng, embedding_dim, input_length=max_len),
    Bidirectional(LSTM(units, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(units, return_sequences=True)),
    TimeDistributed(Dense(vocab_size_tat, activation='softmax'))
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Преобразование выходных данных для обучения
padded_tat = np.expand_dims(padded_tat, -1)

# Увеличение количества эпох обучения
model.fit(padded_eng, padded_tat, epochs=100, batch_size=2, validation_split=0.2)

# Сохранение модели и токенизаторов
model.save('translation_model.h5')

import pickle
with open('tokenizer_eng.pkl', 'wb') as handle:
    pickle.dump(tokenizer_eng, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('tokenizer_tat.pkl', 'wb') as handle:
    pickle.dump(tokenizer_tat, handle, protocol=pickle.HIGHEST_PROTOCOL)


Epoch 1/100




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 643ms/step - accuracy: 0.3805 - loss: 4.7145 - val_accuracy: 0.9245 - val_loss: 3.0002
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.7270 - loss: 2.9302 - val_accuracy: 0.9245 - val_loss: 0.4752
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.7179 - loss: 1.8170 - val_accuracy: 0.9245 - val_loss: 0.4525
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 0.6987 - loss: 1.6637 - val_accuracy: 0.9245 - val_loss: 0.5271
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.6903 - loss: 1.6620 - val_accuracy: 0.9245 - val_loss: 0.5716
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.7044 - loss: 1.6218 - val_accuracy: 0.9245 - val_loss: 0.5202
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━



In [30]:
from flask import Flask, request, jsonify
import tensorflow as tf
from tensorflow.keras.models import load_model
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Загрузка модели и токенизаторов
model = load_model('translation_model.h5')
tokenizer_eng = Tokenizer()
tokenizer_tat = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
tokenizer_tat.fit_on_texts(tatar_sentences)

# Оптимизация функции предсказания
@tf.function
def fast_predict(model, input_sequence):
    return model(input_sequence)

# Функция для перевода текста
def translate_sentence(sentence):
    # Токенизация и паддинг ввода
    sequence = tokenizer_eng.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    
    # Предсказание
    prediction = fast_predict(model, padded_sequence)
    predicted_sentence = np.argmax(prediction, axis=-1)
    
    # Преобразование предсказанных индексов в слова, убираем '[UNK]'
    translated_sentence = ' '.join(tokenizer_tat.index_word.get(i, '') for i in predicted_sentence[0] if i != 0)
    return translated_sentence

# Создание приложения Flask
app = Flask(__name__)

@app.route('/translate', methods=['POST'])
def translate():
    data = request.get_json()
    sentence = data.get('sentence')
    if not sentence:
        return jsonify({'error': 'No sentence provided'}), 400
    
    translated_sentence = translate_sentence(sentence)
    return jsonify({'translated_sentence': translated_sentence})

if __name__ == '__main__':
    app.run(debug=True)




 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [8]:
# Задайте URL для получения списка книг по жанру
url_genre = "http://127.0.0.1:5000/books/by_genre/Fiction"  # Замените 'Fiction' на нужный жанр

# Отправьте GET запрос
response_genre = requests.get(url_genre)

# Проверьте статус ответа и выведите результат
if response_genre.status_code == 200:
    print("Books in genre:")
    print(response_genre.json())
else:
    print("Error retrieving books by genre:", response_genre.status_code)


Error retrieving books by genre: 500


In [3]:
# Load the model and tokenizers
from flask import Flask, request, jsonify
import tensorflow as tf
from tensorflow.keras.models import load_model
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
model = load_model('translation_model.h5')
with open('tokenizer_eng.pkl', 'rb') as handle:
    tokenizer_eng = pickle.load(handle)
with open('tokenizer_tat.pkl', 'rb') as handle:
    tokenizer_tat = pickle.load(handle)

def translate_sentence(sentence):
    # Токенизация и паддинг ввода
    sequence = tokenizer_eng.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, padding='post')
    
    # Предсказание
    prediction = model.predict(padded_sequence)
    predicted_sentence = np.argmax(prediction, axis=-1)
    
    # Преобразование предсказанных индексов в слова, убираем '[UNK]'
    translated_sentence = ' '.join(tokenizer_tat.index_word.get(i) for i in predicted_sentence[0] if i != 0)
    return translated_sentence

# Тест функции перевода
user_sentence = "Также преступления замышляются по дороге из школы домой."
translated_sentence = translate_sentence(user_sentence)
print(translated_sentence)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
очрашу очрашу белән
