In [33]:
import numpy as np
import re
import nltk
from collections import Counter
import spacy
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
import warnings

# NOTE(review): this silences every warning issued through the `warnings`
# module for the rest of the session, not just a specific noisy category.
warnings.filterwarnings("ignore")
# Load spaCy's "en_core_web_sm" pipeline once; `nlp` is used by the
# tokenisation cell further down.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Read the whole corpus into memory as a single string.
# The encoding is spelled out so that decoding does not depend on the
# platform's locale default (e.g. cp1252 on Windows).
# NOTE(review): assumes the file is UTF-8 (plain ASCII is a subset) — confirm.
with open("./shakespeare.txt", encoding="utf-8") as f:
    data = f.read()

In [3]:
# Peek at the first two lines of the raw text.
data.split("\n")[:2]

['O for a Muse of fire, that would ascend',
 'The brightest heaven of invention,']

In [4]:
# Replace each of , ! ? ; - with '.' so all of these marks become the same token.
data_sub = re.sub(r"[,!?;-]", '.', data)
# Run the spaCy pipeline over the whole normalised text.
doc = nlp(data_sub)
# Keep only alphabetic tokens and the '.' marker, lower-cased.
data_token = [token.text.lower() for token in doc if token.is_alpha or token.text == '.']
# print() puts its default separator (a space) on both sides of the '\n'
# argument, so the first output line ends with a space and the second starts with one.
print("Number of tokens:", len(data_token),'\n', "First 15 tokens:", data_token[:15])

Number of tokens: 60791 
 First 15 tokens: ['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend', 'the', 'brightest', 'heaven', 'of', 'invention']


In [5]:
# Frequency distribution over the token list. FreqDist accepts any iterable of
# samples, so the list is passed directly rather than through a pass-through
# generator expression.
fdist = nltk.FreqDist(data_token)
# Number of distinct tokens, then the 20 most frequent (token, count) pairs.
print("Size of vocabulary: ",len(fdist) )
print("Most frequent tokens: ", fdist.most_common(20) )

Size of vocabulary:  5659
Most frequent tokens:  [('.', 9626), ('the', 1522), ('and', 1394), ('i', 1246), ('to', 1159), ('of', 1093), ('my', 857), ('that', 782), ('in', 772), ('a', 750), ('you', 749), ('not', 560), ('is', 546), ('for', 467), ('it', 460), ('with', 441), ('his', 434), ('but', 417), ('me', 417), ('your', 397)]


In [6]:
# Count occurrences of every token. Counter consumes the list directly, so no
# intermediate copy of the token list is needed.
word_counts = Counter(data_token)
# Distinct tokens in first-seen order (Counter, like dict, keeps insertion
# order), so a word's position here is stable across runs on the same text.
vocab = list(word_counts)
vocab_size = len(vocab)
vocab_size

5659

In [7]:
# Two lookup tables over the vocabulary: position -> word, and its inverse
# word -> position. The position is the word's index in `vocab`.
index_to_word = dict(enumerate(vocab))
word_to_index = {token: position for position, token in index_to_word.items()}

In [8]:
# Sanity check: look up one word's index and the word stored at index 177.
word_to_index['my'], index_to_word[177]

(177, 'my')

In [9]:
# Word stored at index 2 of the vocabulary.
index_to_word[2]

'a'

In [10]:
# Presumably the center-index range for a half-window of 2: the windowing
# generator defined below starts at C and stops before len(words) - C.
2, len(data_token) - 2

(2, 60789)

In [11]:
def get_windowa(words, C):
  """Yield (context_words, center_word) pairs from a sliding window.

  Every position that has C items on both sides is used as a center once,
  in order from left to right.

  Args:
    words: Sequence of tokens. It must support slicing and ``+``
      concatenation of slices (a list does).
    C: Half-window size, i.e. how many tokens are taken on each side.

  Yields:
    Tuples ``(context_words, center_word)`` where ``context_words`` is the C
    tokens before the center followed by the C tokens after it.
  """
  # Centers run from index C up to, but not including, len(words) - C.
  for center in range(C, len(words) - C):
    left = words[center - C:center]
    right = words[center + 1:center + C + 1]
    yield left + right, words[center]

In [12]:
# Demo on the first 8 tokens with a half-window of 2: each printed line is the
# context list followed by the center word.
# NOTE(review): the loop leaves `x` and `y` bound at notebook scope; `y` is
# rebound later when the training arrays are built.
for x, y in get_windowa(data_token[:8], 2):
  print(x, y)

['o', 'for', 'muse', 'of'] a
['for', 'a', 'of', 'fire'] muse
['a', 'muse', 'fire', '.'] of
['muse', 'of', '.', 'that'] fire


In [13]:
# Check that np.zeros(vocab_size) has one slot per vocabulary word.
len(np.zeros(vocab_size)) == vocab_size

True

In [14]:
def word_to_one_hot_vector(word, word_to_index, vocab_size):
  """Encode a single word as a one-hot vector.

  Args:
    word: Token to encode; it must be a key of ``word_to_index``.
    word_to_index: Mapping from token to its integer position.
    vocab_size: Length of the returned vector.

  Returns:
    A 1-D NumPy array of length ``vocab_size`` (NumPy's default float dtype),
    all zeros except for a 1 at the word's position.

  Raises:
    KeyError: If ``word`` is not present in ``word_to_index``.
  """
  encoded = np.zeros(vocab_size)
  position = word_to_index[word]
  encoded[position] = 1
  return encoded

In [15]:
# Example: one-hot encode the word 'a'.
word_to_one_hot_vector('a', word_to_index, vocab_size)

array([0., 0., 1., ..., 0., 0., 0.])

In [16]:
# Hand-written example context (the four words around 'a' in the first window).
context_word_eg = ['o', 'for', 'muse', 'of']
# One one-hot vector per context word, kept as a plain list of arrays.
context_word_eg_vectors = [word_to_one_hot_vector(w, word_to_index, vocab_size) for w in context_word_eg]
context_word_eg_vectors

[array([1., 0., 0., ..., 0., 0., 0.]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.])]

In [17]:
# Element-wise average across the four one-hot vectors: each context word
# contributes 1/4 at its own index.
np.mean(context_word_eg_vectors, axis=0)

array([0.25, 0.25, 0.  , ..., 0.  , 0.  , 0.  ])

In [18]:
def context_words_to_vector(context_words, word_to_index, vocab_size):
  """Average the one-hot encodings of a list of context words.

  Args:
    context_words: Iterable of tokens; each must be a key of ``word_to_index``.
    word_to_index: Mapping from token to its integer position.
    vocab_size: Length of each one-hot vector and of the result.

  Returns:
    The element-wise mean of the context words' one-hot vectors, as a 1-D
    array of length ``vocab_size``.
  """
  one_hot_rows = []
  for token in context_words:
    one_hot_rows.append(word_to_one_hot_vector(token, word_to_index, vocab_size))
  # Averaging over axis 0 collapses the per-word rows into a single vector.
  return np.mean(one_hot_rows, axis=0)

In [19]:
# Same example context, this time averaged through the helper function.
context_words_to_vector(context_word_eg, word_to_index, vocab_size)

array([0.25, 0.25, 0.  , ..., 0.  , 0.  , 0.  ])

In [20]:
def get_training_vectors(words, C, word_to_index, vocab_size):
  """Yield one (input, target) vector pair per sliding window.

  Args:
    words: Sequence of tokens to slide the window over.
    C: Half-window size passed through to ``get_windowa``.
    word_to_index: Mapping from token to its integer position.
    vocab_size: Length of every produced vector.

  Yields:
    Tuples ``(context_vector, center_vector)``: the averaged one-hot vector of
    the context words and the one-hot vector of the center word.
  """
  for window_context, window_center in get_windowa(words, C):
    context_vector = context_words_to_vector(window_context, word_to_index, vocab_size)
    center_vector = word_to_one_hot_vector(window_center, word_to_index, vocab_size)
    yield context_vector, center_vector

In [21]:
# Demo on the first 8 tokens with a half-window of 2: print each averaged
# context vector and its one-hot center vector.
for context_words_vector, center_word_vector in get_training_vectors(data_token[:8], 2, word_to_index, vocab_size):
  print(f'Context words vector:  {context_words_vector}')
  print(f'Center word vector:  {center_word_vector}')
  print()

Context words vector:  [0.25 0.25 0.   ... 0.   0.   0.  ]
Center word vector:  [0. 0. 1. ... 0. 0. 0.]

Context words vector:  [0.   0.25 0.25 ... 0.   0.   0.  ]
Center word vector:  [0. 0. 0. ... 0. 0. 0.]

Context words vector:  [0.   0.   0.25 ... 0.   0.   0.  ]
Center word vector:  [0. 0. 0. ... 0. 0. 0.]

Context words vector:  [0. 0. 0. ... 0. 0. 0.]
Center word vector:  [0. 0. 0. ... 0. 0. 0.]



In [22]:
def get_training_data(words, C, word_to_index, vocab_size):
  """Materialise every training pair into two NumPy arrays.

  Args:
    words: Sequence of tokens to slide the window over.
    C: Half-window size passed through to ``get_training_vectors``.
    word_to_index: Mapping from token to its integer position.
    vocab_size: Length of every produced vector.

  Returns:
    A tuple ``(X, y)``. ``X`` stacks the averaged context vectors and ``y``
    the one-hot center vectors, one row per window in both.
  """
  # Drain the generator once, then split the pairs into inputs and targets.
  pairs = list(get_training_vectors(words, C, word_to_index, vocab_size))
  inputs = [context_vector for context_vector, _ in pairs]
  targets = [center_vector for _, center_vector in pairs]
  return np.array(inputs), np.array(targets)

In [23]:
# Build the full training set with a half-window of 2 (four context words per
# example).
# NOTE(review): both arrays are dense float64 with shape (60787, 5659) on this
# corpus, i.e. 60787 * 5659 * 8 bytes, roughly 2.75 GB each.
X, y = get_training_data(data_token, 2, word_to_index, vocab_size)

In [24]:
# Report (number of examples, vocabulary size) for the inputs and the targets.
print("Shape of X (input features):", X.shape)
print("Shape of Y (output labels):", y.shape)

Shape of X (input features): (60787, 5659)
Shape of Y (output labels): (60787, 5659)


In [29]:
# Write both arrays as uncompressed .npy files in the current working
# directory; presumably so they can be reloaded without rebuilding them.
np.save("X_vectors.npy", X)
np.save("y_vectors.npy", y)

print("Training data saved!")

Training data saved!


In [34]:
# Width of the hidden layer.
embedding_dim = 100
# Two-layer network: vocabulary-sized input -> hidden ReLU layer -> softmax over
# the vocabulary. The input shape is declared with an explicit Input object
# rather than the `input_shape=` layer argument, which Keras 3 deprecates for
# Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X.shape[1],)),
    Dense(embedding_dim, activation='relu'),
    Dense(y.shape[1], activation='softmax')
])

In [35]:
# categorical_crossentropy expects one-hot targets, which is how `y` is built
# (one one-hot center-word vector per row).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs in mini-batches of 32 on the full dense arrays.
# NOTE(review): the allocator warning logged for this cell reports
# 1375974532 bytes, which equals 60787 * 5659 * 4 — the size of one float32
# copy of an array shaped like X or y.
model.fit(X, y, epochs=10, batch_size=32, verbose=1)

2025-01-24 11:58:04.798150: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1375974532 exceeds 10% of free system memory.
