## Классификатор языков 

In [54]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.data import Dataset
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import Dense, Dropout, TextVectorization, Embedding, Bidirectional, LSTM

In [55]:
def get_mappings(langs):
    words = []
    tags = []
    
    for lang in langs:
        with open(f"{lang}.txt") as fin:
            file_words = fin.read().splitlines()
            tags.extend([lang] * len(file_words))
            words.extend(file_words)

    return words, tags

In [61]:
langs = ["dutch", "hungarian", "portugese"]
words, tags = get_mappings(langs)

num_train_words = int(0.8 * len(words))
train_words, train_tags = words[:num_train_words], tags[:num_train_words]
test_words, test_tags = words[num_train_words:], tags[num_train_words:]

train_dataset = Dataset.from_tensor_slices((tf.convert_to_tensor(train_words, dtype=tf.string),
                                            tf.convert_to_tensor(train_tags, dtype=tf.string)))

test_dataset = Dataset.from_tensor_slices((tf.convert_to_tensor(test_words, dtype=tf.string),
                                           tf.convert_to_tensor(test_tags, dtype=tf.string)))

In [66]:
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b'ik'
label:  b'dutch'


In [82]:
ALPHABETS_SIZE = 1000
encoder = TextVectorization(max_tokens=ALPHABETS_SIZE, split="character")
encoder.adapt(words)

vocab = np.array(encoder.get_vocabulary())
most_frequent_chars = vocab[:10]
least_frequent_chars = vocab[-10:]
print(f'Top-10 most frequent characters: {most_frequent_chars}')
print(f'Top-10 least frequent characters: {least_frequent_chars}')

Top-10 most frequent characters: ['' '[UNK]' 'e' 'a' 'r' 'o' 'n' 't' 'i' 's']
Top-10 least frequent characters: ['ú' 'ê' 'ű' 'â' 'ë' 'ï' 'ô' 'è' 'õ' '7']


In [74]:
model = Sequential([
    encoder,
    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_14 (Text  (None, None)             0         
 Vectorization)                                                  
                                                                 
 embedding_10 (Embedding)    (None, None, 64)          3008      
                                                                 
 bidirectional_10 (Bidirecti  (None, 128)              66048     
 onal)                                                           
                                                                 
 dense_20 (Dense)            (None, 64)                8256      
                                                                 
 dense_21 (Dense)            (None, 3)                 195       
                                                                 
Total params: 77,507
Trainable params: 77,507
Non-trai

In [None]:
history = model.fit(train_dataset, epochs=10, validation_data=test_dataset, validation_steps=30)

In [None]:
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

## Почти закончил, не удалось додебажить этап подачи данных в нейросеть.