<a href="https://colab.research.google.com/github/MattWroclaw/neural-networks/blob/main/07_rnn/02_text_classifier_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup for working with TensorFlow 2.0.
# If the TensorFlow installation fails, run this cell again.

# !pip uninstall -y tensorflow
# !pip install -q tensorflow==2.0.0

In [1]:
import numpy as np
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

In [2]:
!wget https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
!unzip -q reviews.zip

--2024-10-23 18:36:44--  https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.157.207, 142.251.8.207, 142.251.170.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.157.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42878657 (41M) [application/x-zip-compressed]
Saving to: ‘reviews.zip’


2024-10-23 18:36:51 (8.50 MB/s) - ‘reviews.zip’ saved [42878657/42878657]



In [3]:
# Load the training reviews from <data_dir>/train/{neg,pos} into two parallel
# lists: train_texts[i] is the raw review text, train_labels[i] its class
# (0 for 'neg', 1 for 'pos').
data_dir = './reviews'
train_dir = os.path.join(data_dir, 'train')

train_texts = []
train_labels = []

# 'neg' is processed first, so all 0 labels precede all 1 labels.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order identical on every run and every machine.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            # The context manager closes the file even if read() raises, and
            # the explicit encoding avoids depending on the platform default.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                train_texts.append(f.read())
            train_labels.append(0 if label_type == 'neg' else 1)

In [4]:
# Load the test reviews from <data_dir>/test/{neg,pos} into two parallel
# lists: test_texts[i] is the raw review text, test_labels[i] its class
# (0 for 'neg', 1 for 'pos').
test_dir = os.path.join(data_dir, 'test')

test_texts = []
test_labels = []

# 'neg' is processed first, so all 0 labels precede all 1 labels.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order identical on every run and every machine.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            # The context manager closes the file even if read() raises, and
            # the explicit encoding avoids depending on the platform default.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                test_texts.append(f.read())
            test_labels.append(0 if label_type == 'neg' else 1)

In [5]:
# Peek at the first ten raw training reviews.
train_texts[:10]

['Bernard Rapp passed away last year and was a very cultured journalist. Cinema was one of his biggest passions (he penned a vast worldwide dictionary of films) and so he was bound to wield a camera at least one time in his life. But the films he left garnered lukewarm reviews: "Tiré à Part" (1996) in spite of Terence Stamp\'s sensational performance was very caricatured in the depiction of the characters, "une Affaire De Goût" (2000) was a slick affair even if Bernard Giraudeau delivered a perverse performance, "Pas Si Grave" (2003) was another let-down and "un Petit Jeu Sans Conséquence" is as underwhelming as its predecessors. Its comic potential is exploited in a flimsy way.<br /><br />And however, the starting idea let predict a twirling, spiritual comedy. A couple held by Yvan Attal and Sandrine Kiberlain who invited their friends is in full moving in a lascivious mansion. To play with their guests, they pretend to part company with each other. And things don\'t go as planned bec

In [6]:
# First ten labels; the 'neg' folder is loaded first, so these are expected to be 0.
train_labels[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [7]:
# Last ten labels; the 'pos' folder is loaded last, so these are expected to be 1.
train_labels[-10:]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [8]:
maxlen = 100   # every review is padded/truncated to 100 tokens by pad_sequences(maxlen=maxlen); with its defaults the LAST 100 tokens of a long review are kept
num_words = 10000    # vocabulary cap passed to Tokenizer: only the most frequent words (indices below num_words) are used when encoding texts
embedding_dim = 100  # size of each word vector, passed to the Embedding layer

# Build the vocabulary from the training texts only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)

In [9]:
# First 20 (index, word) pairs of the tokenizer's index -> word mapping.
list(tokenizer.index_word.items())[:20]

[(1, 'the'),
 (2, 'and'),
 (3, 'a'),
 (4, 'of'),
 (5, 'to'),
 (6, 'is'),
 (7, 'br'),
 (8, 'in'),
 (9, 'it'),
 (10, 'i'),
 (11, 'this'),
 (12, 'that'),
 (13, 'was'),
 (14, 'as'),
 (15, 'for'),
 (16, 'with'),
 (17, 'movie'),
 (18, 'but'),
 (19, 'film'),
 (20, 'on')]

In [10]:
# Encode every training review as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(train_texts)
# NOTE(review): prints three complete sequences, which makes for a long output.
print(sequences[:3])

[[5340, 2103, 242, 233, 288, 2, 13, 3, 52, 3930, 434, 13, 28, 4, 24, 1123, 9259, 26, 7693, 3, 4287, 9092, 4, 105, 2, 35, 26, 13, 2718, 5, 3, 367, 30, 219, 28, 55, 8, 24, 110, 18, 1, 105, 26, 314, 9457, 852, 170, 4138, 8, 2706, 4, 8422, 236, 13, 52, 8, 1, 2813, 4, 1, 102, 847, 3070, 13, 3, 4588, 1583, 57, 44, 5340, 2128, 3, 8288, 236, 2615, 3815, 13, 157, 384, 177, 2, 2753, 9458, 6, 14, 14, 91, 8289, 91, 695, 982, 6, 6792, 8, 3, 6436, 93, 7, 7, 2, 187, 1, 1853, 323, 384, 5674, 3, 3559, 209, 3, 375, 1425, 31, 2, 34, 5514, 65, 365, 6, 8, 366, 724, 8, 3, 3022, 5, 294, 16, 65, 5515, 33, 3931, 5, 170, 1166, 16, 254, 82, 2, 180, 89, 137, 14, 4252, 85, 1, 4, 65, 149, 862, 95, 1, 104, 1838, 377, 5, 41, 1, 4, 65, 375, 7, 7, 8, 2706, 4, 5456, 1381, 2, 1, 2425, 815, 26, 66, 30, 24, 937, 188, 1918, 5, 199, 110, 5, 11, 497, 16, 2070, 3606, 1, 111, 1157, 3, 70, 4716, 6437, 16, 102, 34, 25, 3366, 70, 570, 8423, 2, 5171, 12, 23, 41, 34, 33, 63, 23, 6887, 39, 901, 695, 299, 396, 804, 1032, 3, 75, 799, 7

Sprawdźmy, jak to działa...

In [24]:
# NOTE(review): recomputes `sequences` with the same call that was already made
# earlier in the notebook; the result is the same list of index lists.
sequences = tokenizer.texts_to_sequences(train_texts)
# Token indices of the first review and the number of tokens in it.
print('Sequence 0: ' , sequences[0])
print('Długość sequence[0] = ' , len(sequences[0]))
# Raw text of the whole first review (not just one sentence) and its length in characters.
print('Pierwsze zdanie z sequence: ' ,train_texts[0])
print('Ilość znaków w zdaniu pierwszym = ' ,len(train_texts[0]))

# First character of the raw text vs. the vocabulary index of the first token.
print('Pierwszy znak w train_texts = ' ,train_texts[0][0])
print('Pierwszy wyraz w zdaniu , index kolejności występowania = ' ,sequences[0][0])


Sequence 0:  [5340, 2103, 242, 233, 288, 2, 13, 3, 52, 3930, 434, 13, 28, 4, 24, 1123, 9259, 26, 7693, 3, 4287, 9092, 4, 105, 2, 35, 26, 13, 2718, 5, 3, 367, 30, 219, 28, 55, 8, 24, 110, 18, 1, 105, 26, 314, 9457, 852, 170, 4138, 8, 2706, 4, 8422, 236, 13, 52, 8, 1, 2813, 4, 1, 102, 847, 3070, 13, 3, 4588, 1583, 57, 44, 5340, 2128, 3, 8288, 236, 2615, 3815, 13, 157, 384, 177, 2, 2753, 9458, 6, 14, 14, 91, 8289, 91, 695, 982, 6, 6792, 8, 3, 6436, 93, 7, 7, 2, 187, 1, 1853, 323, 384, 5674, 3, 3559, 209, 3, 375, 1425, 31, 2, 34, 5514, 65, 365, 6, 8, 366, 724, 8, 3, 3022, 5, 294, 16, 65, 5515, 33, 3931, 5, 170, 1166, 16, 254, 82, 2, 180, 89, 137, 14, 4252, 85, 1, 4, 65, 149, 862, 95, 1, 104, 1838, 377, 5, 41, 1, 4, 65, 375, 7, 7, 8, 2706, 4, 5456, 1381, 2, 1, 2425, 815, 26, 66, 30, 24, 937, 188, 1918, 5, 199, 110, 5, 11, 497, 16, 2070, 3606, 1, 111, 1157, 3, 70, 4716, 6437, 16, 102, 34, 25, 3366, 70, 570, 8423, 2, 5171, 12, 23, 41, 34, 33, 63, 23, 6887, 39, 901, 695, 299, 396, 804, 1032, 3

In [26]:
def get_words_from_indexes(indexArr, tokenizer):
    """Translate vocabulary indices back into words.

    Args:
        indexArr: iterable of integer word indices.
        tokenizer: object exposing an ``index_word`` mapping (index -> word).

    Returns:
        A list with one entry per index, in the same order; an index that is
        missing from ``tokenizer.index_word`` yields ``None``.
    """
    decoded = []
    for idx in indexArr:
        # dict.get returns None for unknown indices
        decoded.append(tokenizer.index_word.get(idx))
    return decoded

# Hand-copied indices matching the beginning of the first review's sequence.
indexArr = [5340, 2103, 242, 233, 288, 2, 13, 3, 52, 3930, 434, 13]
print(get_words_from_indexes(indexArr, tokenizer))



['bernard', 'passed', 'away', 'last', 'year', 'and', 'was', 'a', 'very', 'journalist', 'cinema', 'was']


Wracamy do tutorialu.

In [27]:
# word_index is the tokenizer's word -> index mapping; report how many entries it has.
word_index = tokenizer.word_index
print(f'{len(word_index)} unikatowych słów.')


88582 unikatowych słów.


In [28]:
# Bring every review to exactly maxlen tokens.
# pad_sequences defaults to truncating='pre' and padding='pre', so a longer
# review keeps its LAST maxlen tokens (not the first ones) and a shorter review
# is left-padded with zeros.
train_data = pad_sequences(sequences, maxlen=maxlen)
train_data.shape

(25000, 100)

In [29]:
# First three padded rows of the training matrix.
train_data[:3]

array([[3606,    1,  111, 1157,    3,   70, 4716, 6437,   16,  102,   34,
          25, 3366,   70,  570, 8423,    2, 5171,   12,   23,   41,   34,
          33,   63,   23, 6887,   39,  901,  695,  299,  396,  804, 1032,
           3,   75,  799, 7799,    3,  114,   50,    1,   19,   16,   11,
          75, 6438,   36,    5, 6192,  602,  108,  841,   57,    1, 6793,
        4801,    8,    1, 5558,    6,   33,  303,    5,   27, 1095,    2,
           5,   65,   71,    5,  409,   95,  258,    1,  308,    6,  512,
        1056,    8,    3,  930,    7,    7,   42,    5,  132,    9, 5340,
         105,  112, 1448,   53,    5,   24, 2990,   14, 2753, 9458, 3309,
        2410],
       [   9,   66,  282,   12, 1702,  158,   88, 3712,    9, 2262,    1,
         526,   80,   91,  179,    2,   90,  175, 1774,   91, 2263,    2,
          93,    4,  109,    8,    3,   93,   12, 1740, 1702,   88,  404,
         119,   21,    7,    7,  471,   37,    5,   64,    3,  726, 1478,
         620,   15,  10

In [31]:
# Decode the first padded row back into words to see what pad_sequences kept.
get_words_from_indexes(train_data[0], tokenizer)

# Observation: the words come out in their order of occurrence in the review.

['consequences',
 'the',
 'plot',
 'follows',
 'a',
 'well',
 'worn',
 'pattern',
 'with',
 'characters',
 'who',
 'have',
 'specific',
 'well',
 'known',
 'functions',
 'and',
 'masks',
 'that',
 'are',
 'about',
 'who',
 'they',
 'really',
 'are',
 'verbal',
 'or',
 'situation',
 'comic',
 'effects',
 'often',
 'fall',
 'flat',
 'a',
 'bad',
 'editing',
 'fades',
 'a',
 'little',
 'more',
 'the',
 'film',
 'with',
 'this',
 'bad',
 'habit',
 'from',
 'to',
 'abruptly',
 'cut',
 'many',
 'sequences',
 'even',
 'the',
 "actors'",
 'sincere',
 'in',
 'the',
 'venture',
 'is',
 'they',
 'seem',
 'to',
 'be',
 'bored',
 'and',
 'to',
 'their',
 'than',
 'to',
 'live',
 'them',
 'especially',
 'the',
 'audience',
 'is',
 'soon',
 'caught',
 'in',
 'a',
 'deep',
 'br',
 'br',
 "it's",
 'to',
 'say',
 'it',
 'bernard',
 'films',
 'never',
 'lived',
 'up',
 'to',
 'his',
 'intentions',
 'as',
 'un',
 'sans',
 'bears',
 'witness']

In [32]:
# Convert the label list to a NumPy array so it can be indexed with an index array later.
train_labels = np.asarray(train_labels)
train_labels

array([0, 0, 0, ..., 1, 1, 1])

In [33]:
# Shuffle the samples. The labels are ordered (all 0s first, then all 1s), so a
# positional train/validation split without shuffling would be class-skewed.
# One shared permutation is applied to data and labels so every review keeps
# its label; the fixed seed makes the permutation, and therefore the split,
# reproducible after Restart & Run All.
# NOTE: re-running only this cell permutes the already shuffled arrays again.
indices = np.random.default_rng(42).permutation(train_data.shape[0])
train_data = train_data[indices]
train_labels = train_labels[indices]

train_data.shape

(25000, 100)

In [34]:
# Split into training and validation sets: the first `training_samples` rows
# are used for training, the next `validation_samples` rows for validation.
training_samples = 15000
validation_samples = 10000

X_train, X_val = train_data[:training_samples], train_data[training_samples:][:validation_samples]
y_train, y_val = train_labels[:training_samples], train_labels[training_samples:][:validation_samples]

In [35]:
# Build the model:
#   Embedding(input_dim=num_words, output_dim=embedding_dim) -> Flatten
#   -> Dense(16, relu) -> Dense(1, sigmoid)
# The `input_length` argument of Embedding is deprecated in Keras 3 (it only
# triggers a warning and leaves the model unbuilt), so the input shape is
# declared through model.build() instead.
model = Sequential([
    Embedding(num_words, embedding_dim),
    Flatten(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# (batch, maxlen) integer token ids; building here lets summary() report
# concrete output shapes and parameter counts.
model.build(input_shape=(None, maxlen))
model.summary()



In [36]:
# Binary classification setup: RMSprop optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [37]:
# Train for 5 epochs in mini-batches of 32; (X_val, y_val) is passed as
# validation data, and the returned History object is kept for plotting.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=32,
)

Epoch 1/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.6601 - loss: 0.5929 - val_accuracy: 0.8242 - val_loss: 0.3847
Epoch 2/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 0.9265 - loss: 0.2093 - val_accuracy: 0.8310 - val_loss: 0.3964
Epoch 3/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.9917 - loss: 0.0387 - val_accuracy: 0.8237 - val_loss: 0.5185
Epoch 4/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.9991 - loss: 0.0051 - val_accuracy: 0.8245 - val_loss: 0.6278
Epoch 5/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - accuracy: 1.0000 - loss: 5.1373e-04 - val_accuracy: 0.8240 - val_loss: 0.7095


In [38]:
def plot_hist(history):
    """Show training vs. validation curves for accuracy and for loss.

    Two plotly figures are drawn and shown, accuracy first and loss second,
    each with a logarithmic y axis.

    Args:
        history: object with a ``history`` mapping that contains the series
            'accuracy', 'val_accuracy', 'loss' and 'val_loss', and an ``epoch``
            sequence used as the x axis (e.g. the value returned by ``model.fit``).
    """
    import pandas as pd
    import plotly.graph_objects as go

    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    # One entry per figure: (title, y-axis label, columns to draw).
    figures = (
        ('accuracy vs. val accuracy', 'accuracy', ('accuracy', 'val_accuracy')),
        ('loss vs. val loss', 'loss', ('loss', 'val_loss')),
    )
    for title, y_label, columns in figures:
        fig = go.Figure()
        for column in columns:
            fig.add_trace(go.Scatter(x=hist['epoch'], y=hist[column], name=column, mode='markers+lines'))
        fig.update_layout(width=1000, height=500, title=title, xaxis_title='Epoki', yaxis_title=y_label, yaxis_type='log')
        fig.show()

plot_hist(history)

In [39]:
# Preprocess the test reviews the same way as the training ones: same fitted
# tokenizer, same maxlen, same pad_sequences defaults.
# A dedicated name is used so the training `sequences` is not overwritten;
# otherwise re-running the training padding cell afterwards would silently pad
# the test reviews as training data.
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(test_sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

# Displays the loss followed by the compiled metric (accuracy) on the test set.
model.evaluate(X_test, y_test, verbose=0)

[0.7247532606124878, 0.818120002746582]