# $\textit{Predicción a nivel de palabra}$

In [12]:
import zipfile
import wget
from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import codecs

In [None]:
# Archive with the pre-trained fastText word vectors.
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip"
archive_name = url.rsplit("/", 1)[-1]

# Reuse an earlier download when it is a readable ZIP, so that re-running the
# notebook does not fetch the archive again. zipfile.is_zipfile() returns
# False both for a missing file and for one that is not a valid ZIP.
if zipfile.is_zipfile(archive_name):
    filename = archive_name
else:
    filename = wget.download(url=url, out=archive_name)

# Unpack next to the notebook; the embedding loader reads the .vec file from "./".
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall(path="./")

In [3]:
# Read the whole corpus into memory as one string.
with open("Sherlock.txt", mode='r', encoding='utf-8') as file:
    book = file.read()

In [80]:
# Vocabulary size handed to the tokenizer as num_words.
vocabSize = 5000

# Word-level tokenizer: lower-cases the text, splits on spaces, strips the
# listed punctuation and maps out-of-vocabulary words to "UNK".
token = Tokenizer(
    num_words=vocabSize,
    filters='!"#$%&()*+,-/:.;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token="UNK",
    document_count=0,
)

# The book is treated as a single document.
token.fit_on_texts([book])

In [81]:
# One document goes in, so exactly one id sequence comes back; unpack it.
(sequences,) = token.texts_to_sequences([book])

In [82]:
def create_dataset(text, EOS, SOS, window=100):
    """Slice a token-id list into overlapping (input, target) windows.

    One pair is produced for every start offset ``i`` in
    ``range(len(text) - window)``:

    * input:  ``[SOS] + text[i:i + window] + [EOS]``      (``window + 2`` ids)
    * target: ``text[i + 1:i + window + 1] + [EOS]``      (``window + 1`` ids)

    Args:
        text: list of token ids.
        EOS: id appended to every input and every target.
        SOS: id prepended to every input.
        window: number of ids taken from ``text`` for each window.

    Returns:
        Tuple ``(inputs, outputs)`` of two equally long lists of lists.
        Both are empty when ``len(text) <= window``.
    """
    starts = range(len(text) - window)
    inputs = [[SOS] + text[start:start + window] + [EOS] for start in starts]
    outputs = [text[start + 1:start + window + 1] + [EOS] for start in starts]
    return inputs, outputs

In [83]:
window_size = 100

# Special token ids, numbered consecutively starting at vocabSize:
# end of sequence, start of sequence, sentence separator.
EOS, SOS, SEP = vocabSize, vocabSize + 1, vocabSize + 2

# Build windows of `window_size` tokens: X holds the reference text and y the
# same text shifted one token ahead.
X, y = create_dataset(text=sequences, EOS=EOS, SOS=SOS, window=window_size)

In [84]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the windows for validation. A fixed random_state makes the
# shuffled split identical on every run, so results can be reproduced.
# NOTE(review): create_dataset emits windows that start one token apart, so a
# shuffled split puts heavily overlapping windows on both sides; validation
# loss will look better than on unseen text. Consider a contiguous split.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [85]:
# Length of each training window. The original measured len(sequences), i.e.
# the whole corpus, once per window instead of the window itself.
lengths = [len(seq) for seq in X_train]

# Common width for every padded array; pad_sequences is left on its defaults
# for padding side and fill value.
max_len = 120
train_sequences = pad_sequences(X_train, maxlen=max_len)
test_sequences = pad_sequences(X_validation, maxlen=max_len)
train_sequences_y = pad_sequences(y_train, maxlen=max_len)
test_sequences_y = pad_sequences(y_validation, maxlen=max_len)

# id -> word as kept by the tokenizer, and the inverse word -> id lookup.
reverse_dictionary = token.index_word
dictionary = {word: idx for idx, word in reverse_dictionary.items()}

In [13]:
# Load the pre-trained word vectors into a {word: float32 vector} dict.
EMBEDDING_DIR = "./"
print('loading word embeddings...')
embeddings_index = {}
# newline='\n' keeps line splitting to '\n' only, so a token that contains
# another Unicode line separator cannot break a record in two. The context
# manager closes the file even if parsing raises.
with open(EMBEDDING_DIR+'wiki-news-300d-1M.vec', 'r', encoding='utf-8', newline='\n') as f:
    for line_number, line in enumerate(f):
        values = line.rstrip().split(' ')
        # fastText .vec files begin with a "<word count> <dimension>" header;
        # without this check it is stored as a bogus one-value "word vector".
        if line_number == 0 and len(values) == 2:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('found %s word vectors' % len(embeddings_index))

loading word embeddings...
found 999995 word vectors


In [92]:
# Initial weights for the Embedding layer: one row per token id.
embed_dim = 300
# Rows must cover every id in `dictionary` and the three special ids
# vocabSize .. vocabSize + 2 (EOS, SOS, SEP), also when the corpus has fewer
# than vocabSize distinct words.
num_words = max(len(dictionary), vocabSize) + 3
embedding_matrix = np.zeros([num_words, embed_dim])
for word, idx in dictionary.items():
    # The tokenizer was created with num_words=vocabSize, so (per Keras) it
    # emits only ids below vocabSize. Ids vocabSize .. vocabSize + 2 are reused
    # for EOS/SOS/SEP; copying a rare word's vector there would initialise the
    # special tokens with unrelated values, so those rows stay zero.
    if idx >= vocabSize:
        continue
    vector = embeddings_index.get(word)
    # Skip words without a pre-trained vector and entries of the wrong size,
    # which would otherwise be broadcast silently across the row.
    if vector is not None and vector.shape == (embed_dim,):
        embedding_matrix[idx, :] = vector

In [106]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense, Input, Concatenate, Dot, RepeatVector, TimeDistributed, Multiply, Lambda, Flatten, Activation, Reshape, BatchNormalization, LSTM
import tensorflow.keras.backend as K
from tensorflow.keras.activations import softmax
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.utils import plot_model

# NOTE: despite its name this value is the number of LSTM units (the first
# positional argument of LSTM), not a sequence length.
timesteps = 100

# Labels are tokenizer ids (< vocabSize) plus the special ids defined as
# vocabSize (EOS), vocabSize + 1 (SOS) and vocabSize + 2 (SEP). A softmax of
# width vocabSize cannot represent the EOS label that ends every target.
num_classes = vocabSize + 3

model = Sequential()
# No fixed input_length: the windows are window_size + 2 ids long before
# padding and max_len after, so pinning the input to window_size would
# contradict the data. The pre-trained matrix is loaded through the
# initializer and stays trainable.
model.add(Embedding(num_words, embed_dim,
                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                    trainable=True))
# create_dataset builds one target id per timestep, so the network must emit
# one distribution per timestep: keep the full LSTM output sequence and let
# Dense act on the last axis.
model.add(LSTM(timesteps, return_sequences=True, name='LSTM1'))
model.add(Dense(num_classes, activation='softmax'))

In [98]:
# Targets are integer class ids, hence the sparse variant of categorical
# cross-entropy; the optimizer is RMSprop, selected by name.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
)

In [None]:
# Train on the padded arrays built with pad_sequences. X_train / y_train are
# plain Python lists whose rows have different widths for inputs
# (window_size + 2) and targets (window_size + 1), whereas the padded arrays
# all share the width max_len.
hist = model.fit(
    train_sequences,
    train_sequences_y,
    batch_size=256,
    epochs=20,
    verbose=1,
    validation_data=(test_sequences, test_sequences_y),
)