# Лабораторна робота 3: Знайомство з нейромережами

## 3. Рекурентні нейромережі

### Обробка та класифікація текстових даних

In [1]:
import pandas as pd

tweets = pd.read_csv("data\gender_tweets.csv")
tweets = tweets[["gender", "text"]]
tweets = tweets.loc[(tweets["gender"] == "male") | (tweets["gender"] == "female")]
tweets = tweets.dropna()

print(len(tweets.index))

12894


In [2]:
# from nltk import download
# download('stopwords')
from nltk.corpus import stopwords
import re
import string


stop_words = stopwords.words("english")

def clean_text(text):
    text = text.lower()
    text = re.sub("@\S+", " ", text)
    text = re.sub("https*\S+", " ", text)
    text = re.sub("#\S+", " ", text)
    text = re.sub("\d", " ", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    # text = re.sub('щч', ' ', text) 
    text = re.sub('\s{2,}', ' ', text)
    text = ' '.join([word for word in text.split(' ') if word not in stop_words])
    return text


texts = tweets["text"]
tweets["text"] = [clean_text(t) for t in texts]

tweets.head(10)

Unnamed: 0,gender,text
0,male,robbie e responds critics win eddie edwards
1,male,‰ыпit felt like friends living story them‰ыќ
2,male,absolutely adore louis starts songs hits hard ...
3,male,hi looking url use typically see advanced user
4,female,watching neighbours sky catching neighbs xxx щ...
5,female,ive seen people train lamps chairs tvs etc
7,male,gala bingo clubs bought еј uk largest high str...
8,female,pic defines mcd fangirls fanboys mcd shippers xd
9,female,lovely tree year never seen gorgeous
12,female,put ass line repay


In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer

max_features = 10000 
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(tweets["text"])

tweets_token = tokenizer.texts_to_sequences(tweets["text"])

tweets_token_padd = keras.utils.pad_sequences(tweets_token, maxlen=20, padding='post', truncating='post')


In [20]:
from sklearn.model_selection import train_test_split

genders = tweets["gender"].map({"male": 1.0, "female": 0.0})
X_train, X_test, Y_train, Y_test = train_test_split(tweets_token_padd, genders, test_size=0.3)

In [57]:

model = keras.Sequential([
    layers.Embedding(max_features, 128),

    layers.Bidirectional(layers.LSTM(32, return_sequences=True)),
    layers.BatchNormalization(trainable=False),

    layers.Bidirectional(layers.LSTM(32, return_sequences=True)),
    layers.BatchNormalization(trainable=False),

    layers.LSTM(32),
    layers.BatchNormalization(trainable=False),

    layers.Dense(128, activation="relu"),
    layers.BatchNormalization(trainable=False),
    keras.layers.Dropout(rate=0.2),

    layers.Dense(1, activation="sigmoid")
])
model.summary()


model.compile(optimizer="nadam", loss="binary_crossentropy", metrics=["accuracy"])

In [58]:

history = model.fit(X_train,
                    Y_train,
                    epochs=10,
                    batch_size=1000,
                    validation_data=(X_test, Y_test),
                    shuffle = True,
                    verbose = 1)



Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 295ms/step - accuracy: 0.5114 - loss: 0.6929 - val_accuracy: 0.5276 - val_loss: 0.6913
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 193ms/step - accuracy: 0.5213 - loss: 0.6913 - val_accuracy: 0.5537 - val_loss: 0.6862
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 194ms/step - accuracy: 0.6231 - loss: 0.6649 - val_accuracy: 0.5658 - val_loss: 0.7035
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 195ms/step - accuracy: 0.6892 - loss: 0.6058 - val_accuracy: 0.5875 - val_loss: 0.7249
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 199ms/step - accuracy: 0.7907 - loss: 0.4714 - val_accuracy: 0.5797 - val_loss: 0.8011
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 192ms/step - accuracy: 0.8423 - loss: 0.3930 - val_accuracy: 0.5828 - val_loss: 0.8522
Epoch 7/10
[1m10/10[0m [

In [59]:
model.summary()