تحميل المكتبات المطلوبة

In [1]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

import nltk
# Download the NLTK 'stopwords' corpus so stopwords.words('english') can be used below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Read the whole book into memory as a single UTF-8 decoded string.
with open("/content/Franknestein.txt", encoding="utf8") as text_file:
    raw_text = text_file.read()
file = raw_text

In [3]:
def tokenize_words(input):
    """Lowercase ``input``, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to normalise.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Lowercase everything to standardize it.
    text = input.lower()

    # r'\w+' keeps runs of word characters, discarding punctuation and whitespace.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Load the stop-word list once, as a set. Calling stopwords.words('english')
    # inside the filter re-read the list and scanned it linearly for every token.
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)

In [4]:
# Clean the raw book text: lowercase it, tokenize it and drop English stop words.
processed_inputs = tokenize_words(file)

In [5]:
# Length, in characters, of the cleaned text.
len(processed_inputs)

269995

In [6]:
# Preview the first 1000 characters of the cleaned text.
processed_inputs[:1000]

'project gutenberg frankenstein mary wollstonecraft godwin shelley ebook use anyone anywhere cost almost restrictions whatsoever may copy give away use terms project gutenberg license included ebook online www gutenberg net title frankenstein modern prometheus author mary wollstonecraft godwin shelley release date june 17 2008 ebook 84 last updated january 13 2018 language english character set encoding utf 8 start project gutenberg ebook frankenstein produced judith boss christy phillips lynn hanninen david meltzer html version al haines corrections menno de leeuw frankenstein modern prometheus mary wollstonecraft godwin shelley contents letter 1 letter 2 letter 3 letter 4 chapter 1 chapter 2 chapter 3 chapter 4 chapter 5 chapter 6 chapter 7 chapter 8 chapter 9 chapter 10 chapter 11 chapter 12 chapter 13 chapter 14 chapter 15 chapter 16 chapter 17 chapter 18 chapter 19 chapter 20 chapter 21 chapter 22 chapter 23 chapter 24 letter 1 _to mrs saville england _ st petersburgh dec 11th 17 

In [7]:
# Sorted list of the distinct characters that occur in the cleaned text.
unique_chars = set(processed_inputs)
chars = sorted(unique_chars)

In [8]:
# Number of distinct characters in the cleaned text.
len(chars)

43

In [9]:
# Peek at the first 15 entries of the sorted character list.
chars[:15]

[' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_', 'a', 'b', 'c']

In [10]:
# Map every character to its position in the sorted character list.
char_to_num = {symbol: position for position, symbol in enumerate(chars)}

In [11]:
# Inspect the character -> integer mapping.
char_to_num

{' ': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10,
 '_': 11,
 'a': 12,
 'b': 13,
 'c': 14,
 'd': 15,
 'e': 16,
 'f': 17,
 'g': 18,
 'h': 19,
 'i': 20,
 'j': 21,
 'k': 22,
 'l': 23,
 'm': 24,
 'n': 25,
 'o': 26,
 'p': 27,
 'q': 28,
 'r': 29,
 's': 30,
 't': 31,
 'u': 32,
 'v': 33,
 'w': 34,
 'x': 35,
 'y': 36,
 'z': 37,
 'æ': 38,
 'è': 39,
 'é': 40,
 'ê': 41,
 'ô': 42}

In [12]:
# Record the corpus size and the number of distinct characters, then report both.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 269995
Total vocab: 43


In [13]:
# Number of characters in each input sequence.
seq_length = 100
# Accumulators filled by the windowing loop: integer-encoded input sequences
# (x_data) and the encoded character that follows each of them (y_data).
x_data = []
y_data = []

In [14]:
# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
# Slide a window of seq_length characters over the cleaned text. Each window,
# encoded as integers, is one input; the character right after it is its target.
for window_start in range(input_len - seq_length):
    window_end = window_start + seq_length
    window_text = processed_inputs[window_start:window_end]
    next_char = processed_inputs[window_end]

    x_data.append([char_to_num[symbol] for symbol in window_text])
    y_data.append(char_to_num[next_char])

In [15]:
# Count the (input sequence, target) pairs that were generated.
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 269895


In [16]:
# Arrange the encoded sequences as (n_patterns, seq_length, 1) and divide by
# the vocabulary size.
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [17]:
# One-hot encode the targets. num_classes is pinned to the vocabulary size so
# the width of y does not depend on which characters happen to occur as targets
# (without it the width is inferred from the largest value in y_data).
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [18]:
# Three stacked LSTM layers, each followed by dropout, ending in a softmax
# layer with one unit per column of the one-hot target matrix y.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [19]:
# Categorical cross-entropy matches the one-hot targets in y; optimise with Adam.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [21]:
# `desired_callbacks` was not defined anywhere in the notebook (the cell that
# created it is missing), so a fresh kernel failed here with NameError.
# Rebuild it to match the training log: monitor the training loss and save
# the weights to model_weights_saved.hdf5 whenever the loss improves.
checkpoint_path = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(checkpoint_path, monitor='loss', verbose=1,
                             save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

model.fit(X, y, epochs=20, batch_size=256, callbacks=desired_callbacks)

Epoch 1/20
Epoch 00001: loss improved from inf to 2.90142, saving model to model_weights_saved.hdf5
Epoch 2/20
Epoch 00002: loss improved from 2.90142 to 2.63526, saving model to model_weights_saved.hdf5
Epoch 3/20
Epoch 00003: loss improved from 2.63526 to 2.48181, saving model to model_weights_saved.hdf5
Epoch 4/20
Epoch 00004: loss improved from 2.48181 to 2.35490, saving model to model_weights_saved.hdf5
Epoch 5/20
Epoch 00005: loss improved from 2.35490 to 2.25474, saving model to model_weights_saved.hdf5
Epoch 6/20
Epoch 00006: loss improved from 2.25474 to 2.17710, saving model to model_weights_saved.hdf5
Epoch 7/20
Epoch 00007: loss improved from 2.17710 to 2.11113, saving model to model_weights_saved.hdf5
Epoch 8/20
Epoch 00008: loss improved from 2.11113 to 2.05683, saving model to model_weights_saved.hdf5
Epoch 9/20
Epoch 00009: loss improved from 2.05683 to 2.01346, saving model to model_weights_saved.hdf5
Epoch 10/20
Epoch 00010: loss improved from 2.01346 to 1.97693, savi

<tensorflow.python.keras.callbacks.History at 0x7ff9326c86a0>

In [22]:
# Restore the weights from the file the training log reports saving to,
# then compile the model again with the same loss and optimizer.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [23]:
# Inverse lookup: integer index -> character, for decoding model output.
num_to_char = dict(enumerate(chars))

In [24]:
# Pick a random training pattern to seed generation. randint's upper bound is
# exclusive, so passing len(x_data) lets every pattern, including the last one,
# be chosen (len(x_data) - 1 could never select it).
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that later in-place changes to `pattern` cannot modify the
# entry stored in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" old man desire left alone cottage children departed took guitar played several mournful sweet airs s "


In [25]:
# Generate 1000 characters one at a time, feeding each prediction back in.
for i in range(1000):
    # Shape the current window as (1, len(pattern), 1) and scale it by the
    # vocabulary size, the same scaling applied to the training inputs.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)

    # Greedy decoding: take the most probable next character.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Slide the window by building a new list. Appending in place would also
    # change the x_data entry whenever `pattern` still refers to it.
    pattern = list(pattern[1:]) + [index]

ea sears sea sears sea sears sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea sea