In [29]:
# Importing the dataset
file_path = 'text.txt'  # Assuming 'text.txt' is in the current directory

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        text_data = file.read()
except FileNotFoundError:
    print(f"'{file_path}' not found in the current directory.")
except Exception as e:
    print(f"An error occurred: {str(e)}")


In [30]:

import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

# Tokenize into sentences
sentences = sent_tokenize(text_data)

import string

# Removing punctuation and converting to lowercase
sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\team6\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:


from nltk.tokenize import word_tokenize
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

# Print the tokenized sentences
for tokens in tokenized_sentences:
    print(tokens)


['chapter', 'i', 'a', 'longexpected', 'party', 'when', 'mr', 'bilbo', 'baggins', 'of', 'bag', 'end', 'announced', 'that', 'he', 'would', 'shortly', 'be', 'celebrating', 'his', 'eleventyfirst', 'birthday', 'with', 'a', 'party', 'of', 'special', 'magnificence', 'there', 'was', 'much', 'talk', 'and', 'excitement', 'in', 'hobbiton']
['bilbo', 'was', 'very', 'rich', 'and', 'very', 'peculiar', 'and', 'had', 'been', 'the', 'wonder', 'of', 'the', 'shire', 'for', 'sixty', 'years', 'ever', 'since', 'his', 'remarkable', 'disappearance', 'and', 'unexpected', 'return']
['the', 'riches', 'he', 'had', 'brought', 'back', 'from', 'his', 'travels', 'had', 'now', 'become', 'a', 'local', 'legend', 'and', 'it', 'was', 'popularly', 'believed', 'whatever', 'the', 'old', 'folk', 'might', 'say', 'that', 'the', 'hill', 'at', 'bag', 'end', 'was', 'full', 'of', 'tunnels', 'stuffed', 'with', 'treasure']
['and', 'if', 'that', 'was', 'not', 'enough', 'for', 'fame', 'there', 'was', 'also', 'his', 'prolonged', 'vigour

In [33]:
# Build a Vocabulary
vocabulary = set()  # Initialize an empty set to store unique words

# Iterate through tokenized sentences and add words to the vocabulary
for tokens in tokenized_sentences:
    vocabulary.update(tokens)

# Convert the set to a sorted list for consistency
vocabulary = sorted(list(vocabulary))

# Print the vocabulary
print("Vocabulary:")
print(vocabulary)


Vocabulary:


In [36]:
# Create a word-to-integer mapping
word_to_int = {word: idx for idx, word in enumerate(vocabulary)}

# Add special tokens
word_to_int['<START>'] = len(word_to_int)
word_to_int['<END>'] = len(word_to_int)
word_to_int['<PAD>'] = len(word_to_int)

# Add a placeholder for unknown words
word_to_int['<UNK>'] = len(word_to_int)

# Convert words to integers in tokenized sentences
int_sequences = []
for sentence_tokens in tokenized_sentences:
    int_sequence = [word_to_int.get(word, word_to_int['<UNK>']) for word in sentence_tokens]
    int_sequences.append(int_sequence)


In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set a maximum sequence length (e.g., 50)
max_sequence_length = 50

# Pad the sequences
padded_sequences = pad_sequences(int_sequences, maxlen=max_sequence_length, padding='post', truncating='post', value=word_to_int['<PAD>'])


In [38]:
from sklearn.model_selection import train_test_split

# Split the data into training, validation, and test sets
train_ratio = 0.7  # 70% for training
val_ratio = 0.15   # 15% for validation
test_ratio = 0.15  # 15% for testing

# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(padded_sequences, padded_sequences, test_size=1 - train_ratio, random_state=42)

# Split the remaining data into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_ratio / (test_ratio + val_ratio), random_state=42)

# Print the shapes of the split datasets
print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)
print("Test data shape:", X_test.shape)


Training data shape: (7532, 50)
Validation data shape: (1614, 50)
Test data shape: (1615, 50)


In [40]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Masking

# Define the maximum sequence length (you can set this to your preferred value)
max_sequence_length = 50

# Create the Sequential model
model = Sequential()

# Embedding layer with 100-dimensional vectors and input length set to the maximum sequence length
embedding_layer = Embedding(input_dim=len(vocabulary), output_dim=100, input_length=max_sequence_length, trainable=False)

# Masking layer to ignore padding
model.add(Masking(mask_value=word_to_int['<PAD>']))

# Add the embedding layer
model.add(embedding_layer)

# LSTM layer with dropout
model.add(LSTM(100, dropout=0.2))

# Fully connected Dense layer with ReLU activation
model.add(Dense(100, activation='relu'))

# Dropout layer to prevent overfitting
model.add(Dropout(0.2))

# Output layer with a Dense layer and softmax activation
model.add(Dense(len(vocabulary), activation='softmax'))

# Build the model
model.build(input_shape=(None, max_sequence_length))  # Specify the input shape

# Compile the model with the Adam optimizer
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_1 (Masking)         (None, 50)                0         
                                                                 
 embedding_1 (Embedding)     (None, 50, 100)           958700    
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense_2 (Dense)             (None, 100)               10100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_3 (Dense)             (None, 9587)              968287    
                                                                 
Total params: 2017487 (7.70 MB)
Trainable params: 1058

In [42]:
from tensorflow.keras.utils import to_categorical

# Reshape and one-hot encode the target data
y_train_onehot = to_categorical(y_train, num_classes=len(vocabulary))
y_val_onehot = to_categorical(y_val, num_classes=len(vocabulary))
y_test_onehot = to_categorical(y_test, num_classes=len(vocabulary))


# Define the number of training epochs and batch size
epochs = 10
batch_size = 32

# Train the model on the training data
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

# Evaluate the model on the validation dataset
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy}')

# Evaluate the model on the test dataset
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')


IndexError: index 9589 is out of bounds for axis 1 with size 9587