In [25]:

import numpy as np
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout

In [2]:
!wget https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
!unzip -q reviews.zip

--2024-04-25 11:45:26--  https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.74.207, 209.85.145.207, 172.217.219.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.74.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42878657 (41M) [application/x-zip-compressed]
Saving to: ‘reviews.zip’


2024-04-25 11:45:29 (18.6 MB/s) - ‘reviews.zip’ saved [42878657/42878657]



In [3]:

data_dir = './reviews'
train_dir = os.path.join(data_dir, 'train')

train_texts = []
train_labels = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            train_texts.append(f.read())
            f.close()
            if label_type == 'neg':
                train_labels.append(0)
            else:
                train_labels.append(1)

In [5]:
data_dir = './reviews'
test_dir = os.path.join(data_dir, 'test')

test_texts = []
test_labels = []

for label_type in ['pos', 'neg']:
  dir_name = os.path.join(test_dir, label_type)
  for fname in os.listdir(dir_name):
    if fname[-4:] == '.txt':
      f = open(os.path.join(dir_name, fname))
      test_texts.append(f.read())
      f.close()
      if label_type == 'neg':
        test_labels.append(0)
      else:
        test_labels.append(1)


In [6]:
train_texts[:10]

['First, a little summary. This reporter named Torch is basically trying to get out the story of a zombie outbreak and finds the military & government censoring him. Nice message, government censorship and all that, but the way they DID the movie was, well let me explain.<br /><br />This movie is beyond description. The idea that somebody holds it in higher regard than anything by George Romero is justification enough for the reviewer to be committed to a mental institution. The script is atrocious on its own, like it was written by a sixth grader.As for special effects, I understand that independent films have low budgets, and some gore effects looked acceptable, but if you want a scene with fire, here\'s a tip: buy some nonflammable material, have an extinguisher ready, and get a fire going! Don\'t digitally add it in and make it look like an explosion from a Nintendo 64 game. The acting, well let\'s put it this way. In my summer theater program, a cold reading of the script is, comp

In [7]:

train_labels[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [8]:

train_labels[-10:]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [9]:
maxlen = 100   # skracamy recenzje do 100 słów
num_words = 10000    # 10000 najczęściej pojawiających się słów
embedding_dim = 100

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)

In [10]:
list(tokenizer.index_word.items())[:20]

[(1, 'the'),
 (2, 'and'),
 (3, 'a'),
 (4, 'of'),
 (5, 'to'),
 (6, 'is'),
 (7, 'br'),
 (8, 'in'),
 (9, 'it'),
 (10, 'i'),
 (11, 'this'),
 (12, 'that'),
 (13, 'was'),
 (14, 'as'),
 (15, 'for'),
 (16, 'with'),
 (17, 'movie'),
 (18, 'but'),
 (19, 'film'),
 (20, 'on')]

In [11]:
sequences = tokenizer.texts_to_sequences(train_texts)
print(sequences[:3])

[[83, 3, 114, 2718, 11, 2419, 769, 8926, 6, 688, 266, 5, 76, 43, 1, 62, 4, 3, 862, 2, 655, 1, 1245, 1365, 87, 324, 746, 1365, 7916, 2, 29, 12, 18, 1, 93, 33, 119, 1, 17, 13, 70, 384, 69, 1257, 7, 7, 11, 17, 6, 721, 2781, 1, 323, 12, 1838, 1773, 9, 8, 1927, 2888, 71, 229, 31, 739, 5034, 6, 8422, 192, 15, 1, 2211, 5, 27, 2525, 5, 3, 1747, 5844, 1, 226, 6, 2521, 20, 91, 202, 37, 9, 13, 395, 31, 3, 6702, 14, 15, 315, 299, 10, 388, 12, 1720, 105, 25, 361, 6436, 2, 46, 596, 299, 605, 3462, 18, 44, 22, 178, 3, 133, 16, 964, 1972, 3, 5558, 815, 46, 816, 25, 32, 1618, 2, 76, 3, 964, 167, 89, 758, 9, 8, 2, 94, 9, 165, 37, 32, 3930, 36, 3, 9878, 497, 1, 113, 70, 900, 273, 9, 11, 93, 8, 58, 1500, 747, 2078, 3, 1040, 883, 4, 1, 226, 6, 1076, 5, 11, 1, 3513, 10, 525, 57, 137, 80, 1, 8288, 166, 95, 620, 7, 7, 48, 4010, 69, 1, 88, 148, 13, 51, 282, 13, 1763, 100, 1201, 2, 799, 1838, 235, 25, 298, 860, 11, 269, 49, 900, 763, 9, 9, 3285, 3, 6703, 177, 58, 6192, 5, 57, 101, 41, 9, 5, 101, 1838, 417, 12, 

In [12]:
word_index = tokenizer.word_index
print(f'{len(word_index)} unikatowych słów.')

88582 unikatowych słów.


In [14]:

# skracamy recenzje do pierwszych 100 słów
train_data = pad_sequences(sequences, maxlen=maxlen)
train_data.shape

(25000, 100)

In [15]:
train_data[:3]


array([[ 158,   73,  456,   15,   11,   11,   19,    6,    1,  682,   88,
         391,   17,   10,   25,  107,   10,   25,   21,  107,  229,   31,
        1656, 2130,   18,   10,   25, 4449,   11,    6,  430,   44,   22,
          23,  264,   15,  618,  434,   35,   73,   14,  109,  742,  744,
        2186,    4,    9,   77,  239,  199,   22,    3,   75, 6193,   44,
          21,   10,  128,  383,   12,   22, 1271,  898,    1,  164,    2,
         939,   86,   26, 5975,   30,  311,  187,   44,   22,   23,    1,
         240,    4,  411,   34,   76,    3,  459,   43,    4,   63,   75,
         535,   92,   10,  383,   22,  804,   11,   43,   22,  525,   27,
         683],
       [   4,   32, 4761,   30,    3, 7477, 2766,    4,  659,   36, 1588,
          39, 4416, 7799,    1,  837,  129,   11,   19,   45,   54, 1351,
        1284,   39, 1794,   26,  102,   20,    1,  265,    4,  403, 8595,
         258,   72,  847,  161,   41,    1,  280, 1481,    4,   11,  524,
        5846, 3475,  84

In [16]:
train_labels = np.asarray(train_labels)
train_labels

array([0, 0, 0, ..., 1, 1, 1])

In [17]:
# przemieszanie próbek
indices = np.arange(train_data.shape[0])
np.random.shuffle(indices)
train_data = train_data[indices]
train_labels = train_labels[indices]

train_data.shape

(25000, 100)

In [18]:

# podział na zbiór treningowy i walidacyjny
training_samples = 15000
validation_samples = 10000

X_train = train_data[:training_samples]
y_train = train_labels[:training_samples]
X_val = train_data[training_samples: training_samples + validation_samples]
y_val = train_labels[training_samples: training_samples + validation_samples]


In [26]:
# budowa modelu
# Embedding(input_dim, output_dim)

model = Sequential()
model.add(Embedding(num_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 flatten_2 (Flatten)         (None, 10000)             0         
                                                                 
 dense_3 (Dense)             (None, 16)                160016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1160033 (4.43 MB)
Trainable params: 1160033 (4.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [28]:
history = model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:

def plot_hist(history):
    import pandas as pd
    import plotly.graph_objects as go
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['accuracy'], name='accuracy', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_accuracy'], name='val_accuracy', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='accuracy vs. val accuracy', xaxis_title='Epoki', yaxis_title='accuracy', yaxis_type='log')
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['loss'], name='loss', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_loss'], name='val_loss', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='loss vs. val loss', xaxis_title='Epoki', yaxis_title='loss', yaxis_type='log')
    fig.show()

plot_hist(history)

In [23]:
sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

model.evaluate(X_test, y_test, verbose=0)

[0.7064301371574402, 0.8259599804878235]

In [30]:
# WARSTWY REKURENCYJNE W SIECIACH NEURONOWYCH


from tensorflow.keras.layers import SimpleRNN, LSTM


In [31]:
model = Sequential()
model.add(Embedding(10000, 32))
model.add(SimpleRNN(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 32)          320000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 16)                784       
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320801 (1.22 MB)
Trainable params: 320801 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [33]:
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
plot_hist(history)

In [35]:

model = Sequential()
model.add(Embedding(10000, 32))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 32)          320000    
                                                                 
 lstm (LSTM)                 (None, 16)                3136      
                                                                 
 dense_6 (Dense)             (None, 1)                 17        
                                                                 
Total params: 323153 (1.23 MB)
Trainable params: 323153 (1.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [36]:

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [40]:
history = model.fit(X_train, y_train, batch_size=32, epochs=4, validation_data=(X_val, y_val))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [41]:

plot_hist(history)

In [42]:
sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

model.evaluate(X_test, y_test, verbose=0)

[0.5813878774642944, 0.8264399766921997]