In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.layers import SimpleRNN
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Подготовка датасета

In [12]:
data = pd.read_csv("file.csv", sep=',')
x = data['Lyric']
y = data['language']
encoder = LabelEncoder()
y = encoder.fit_transform(y)
data = pd.DataFrame(x)
data['language'] = y
print(data)
train, test = train_test_split(data, test_size=0.2, random_state=42)

                                                 Lyric  language
0    Tudo o que eu quero nessa vida,\r\nToda vida, ...         2
1    Meu coração\r\nSem direção\r\nVoando só por vo...         2
2    É de babaixá!\r\nÉ de balacubaca!\r\nÉ de baba...         2
3    Quando a chuva passar\r\n\r\nPra quê falar\r\n...         2
4    A minha sorte grande foi você cair do céu\r\nM...         2
..                                                 ...       ...
125  Que falta eu sinto de um bem\r\nQue falta me f...         2
126  Eu tô vendo\r\nVocê tá balançando\r\nTô sentin...         2
127  Olha quem chegou\r\nNesse calor eu sei que vou...         2
128  Meu amor olha só hoje o sol não apareceu\r\nÉ ...         2
129  Faço qualquer negócio pra te ver na avenida ou...         2

[130 rows x 2 columns]


In [13]:
x = train["Lyric"]
y = train["language"]
x = np.asarray(x).astype(np.str_)
y = np.asarray(y).astype(np.float32)
test_x = test["Lyric"]
test_y = test["language"]
test_x = np.asarray(test_x).astype(np.str_)
test_y = np.asarray(test_y).astype(np.float32)

## Посимвольная векторизация

In [14]:
vectorizer = tf.keras.layers.TextVectorization(split='character')

print('Training vectorizer')
vectorizer.adapt(list(train["Lyric"]))

Training vectorizer


## SimpleRNN модель

In [15]:
model = keras.Sequential()
model.add(vectorizer)
model.add(layers.Embedding(input_dim=100000, output_dim=8))
model.add(layers.SimpleRNN(4))
model.add(layers.Dense(100, activation = 'relu'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, None, 8)           800000    
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 4)                 52        
                                                                 
 dense_2 (Dense)             (None, 100)               500       
                                                                 
Total params: 800,552
Trainable params: 800,552
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(keras.optimizers.Adam(0.01), keras.losses.SparseCategoricalCrossentropy(from_logits=True), ['acc'])
train_model = model.fit(x=x, y=y, batch_size=64, epochs = 10, validation_data=(test_x, test_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Пословная токенизация

In [17]:
vectorizer = tf.keras.layers.TextVectorization(split='whitespace')

print('Training vectorizer')
vectorizer.adapt(list(train["Lyric"]))

Training vectorizer


## SimpleRNN модель

In [18]:
model = keras.Sequential()
model.add(vectorizer)
model.add(layers.Embedding(input_dim=100000, output_dim=64, mask_zero=True))
model.add(layers.SimpleRNN(4))
model.add(layers.Dense(100, activation = 'tanh'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_4 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_3 (Embedding)     (None, None, 64)          6400000   
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 4)                 276       
                                                                 
 dense_3 (Dense)             (None, 100)               500       
                                                                 
Total params: 6,400,776
Trainable params: 6,400,776
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.compile(keras.optimizers.Adam(0.01), keras.losses.SparseCategoricalCrossentropy(from_logits=True), ['acc'])
train_model = model.fit(x=x, y=y, batch_size=64, epochs = 10, validation_data=(test_x, test_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## LSTM

## Подготовка датасета

In [20]:
data = pd.read_csv("file.csv")
x = data['Lyric']
y = data['language']
encoder = LabelEncoder()
y = encoder.fit_transform(y)
data = pd.DataFrame(x)
data['language'] = y
print(data)
train, test = train_test_split(data, test_size=0.2, random_state=42)

                                                 Lyric  language
0    Tudo o que eu quero nessa vida,\r\nToda vida, ...         2
1    Meu coração\r\nSem direção\r\nVoando só por vo...         2
2    É de babaixá!\r\nÉ de balacubaca!\r\nÉ de baba...         2
3    Quando a chuva passar\r\n\r\nPra quê falar\r\n...         2
4    A minha sorte grande foi você cair do céu\r\nM...         2
..                                                 ...       ...
125  Que falta eu sinto de um bem\r\nQue falta me f...         2
126  Eu tô vendo\r\nVocê tá balançando\r\nTô sentin...         2
127  Olha quem chegou\r\nNesse calor eu sei que vou...         2
128  Meu amor olha só hoje o sol não apareceu\r\nÉ ...         2
129  Faço qualquer negócio pra te ver na avenida ou...         2

[130 rows x 2 columns]


In [21]:
x = train["Lyric"]
y = train["language"]
x = np.asarray(x).astype(np.str_)
y = np.asarray(y).astype(np.float32)
test_x = test["Lyric"]
test_y = test["language"]
test_x = np.asarray(test_x).astype(np.str_)
test_y = np.asarray(test_y).astype(np.float32)

## Посимвольная векторизация

In [22]:
vectorizer = tf.keras.layers.TextVectorization(split='character')

print('Training vectorizer')
vectorizer.adapt(list(train["Lyric"]))

Training vectorizer


## Однослойная LSTM-модель

In [23]:
model = keras.Sequential()
model.add(vectorizer)
model.add(layers.Embedding(input_dim=40000, output_dim=8, mask_zero=True))
model.add(layers.LSTM(4))
model.add(layers.Dense(100, activation = 'tanh'))
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_5 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_4 (Embedding)     (None, None, 8)           320000    
                                                                 
 lstm (LSTM)                 (None, 4)                 208       
                                                                 
 dense_4 (Dense)             (None, 100)               500       
                                                                 
Total params: 320,708
Trainable params: 320,708
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.compile(keras.optimizers.Adam(0.1), keras.losses.SparseCategoricalCrossentropy(from_logits=True), ['acc'])
train_model = model.fit(x=x, y=y, batch_size=64, epochs = 10, validation_data=(test_x, test_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Пословная векторизация

In [27]:
vectorizer = tf.keras.layers.TextVectorization(split='whitespace')

print('Training vectorizer')
vectorizer.adapt(list(train["Lyric"]))

Training vectorizer


## Многослойная LSTM-модель

In [28]:
model = keras.Sequential()
model.add(vectorizer)
model.add(layers.Embedding(input_dim=40000, output_dim=64, mask_zero=True))
model.add(layers.LSTM(8, return_sequences=True))
model.add(layers.LSTM(4))
model.add(layers.Dense(100, activation = 'tanh'))
model.add(layers.Dense(100, activation = 'tanh'))
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_8 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_5 (Embedding)     (None, None, 64)          2560000   
                                                                 
 lstm_1 (LSTM)               (None, None, 8)           2336      
                                                                 
 lstm_2 (LSTM)               (None, 4)                 208       
                                                                 
 dense_5 (Dense)             (None, 100)               500       
                                                                 
 dense_6 (Dense)             (None, 100)               10100     
                                                      

In [29]:
model.compile(keras.optimizers.Adam(0.01), keras.losses.SparseCategoricalCrossentropy(from_logits=True), ['acc'])
train_model = model.fit(x=x, y=y, batch_size=64, epochs = 10, validation_data=(test_x, test_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Пословная векторизация

In [30]:
vectorizer = tf.keras.layers.TextVectorization(split='whitespace')

print('Training vectorizer')
vectorizer.adapt(list(train["Lyric"]))

Training vectorizer


## Двунаправленая LSTM-модель

In [31]:
model = keras.Sequential()
model.add(vectorizer)
model.add(layers.Embedding(input_dim=40000, output_dim=16, mask_zero=True))
model.add(layers.Bidirectional(keras.layers.LSTM(8), merge_mode='concat')),
model.add(layers.Dense(100, activation = 'tanh'))
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_9 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_6 (Embedding)     (None, None, 16)          640000    
                                                                 
 bidirectional (Bidirectiona  (None, 16)               1600      
 l)                                                              
                                                                 
 dense_7 (Dense)             (None, 100)               1700      
                                                                 
Total params: 643,300
Trainable params: 643,300
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.compile(keras.optimizers.Adam(0.01), keras.losses.SparseCategoricalCrossentropy(from_logits=True), ['acc'])
train_model = model.fit(x=x, y=y, batch_size=64, epochs = 20, validation_data=(test_x, test_y))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
