<a href="https://colab.research.google.com/github/JanaBasha/n-gram-nn/blob/main/JanaBashaNLPa2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [None]:
import pandas as pd
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Flatten, Dense

In [None]:
# Display the DatasetDict to see which splits exist and how many rows each one has
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [None]:
# Pull the raw 'text' column out of each split of the dataset dictionary
training, validation, testing = (
    dataset[split_name]['text'] for split_name in ('train', 'validation', 'test')
)

In [None]:
# Number of text lines in the training split
len(training)

36718

In [None]:
# Number of text lines in the validation split
len(validation)

3760

In [None]:
# Number of text lines in the test split
len(testing)

4358

In [None]:
# Create the tokenizer object. num_words=1000 restricts the ids produced by
# texts_to_sequences to the most frequent words; any other word is replaced by
# the out-of-vocabulary token '<unk>'.
tokn=Tokenizer(num_words=1000, oov_token='<unk>')

In [None]:
# Build the vocabulary from the training split only. fit_on_texts updates the
# tokenizer in place and returns None, so there is no result worth assigning.
tokn.fit_on_texts(training)

In [None]:
# Convert every line of each split into a list of integer word ids
tr_seq, ts_seq, val_seq = (
    tokn.texts_to_sequences(texts) for texts in (training, testing, validation)
)

In [None]:
# Preview the first 20 (word, id) pairs only: word_index holds the entire fitted
# vocabulary, which is far too large to print in full.
list(tokn.word_index.items())[:20]

In [None]:
# Split tokenised lines into model inputs (X) and targets (y).
# For the 4-gram model, X holds three consecutive word ids and y is the fourth.
def split_into_x_y(sequences, context_size=3):
    """Turn integer sequences into (context, next-word) pairs.

    Parameters
    ----------
    sequences : iterable of sequences of int
        Tokenised lines, one list of word ids per line.
    context_size : int, default 3
        How many preceding ids form one input row (3 gives a 4-gram model).

    Returns
    -------
    X : np.ndarray, shape (n_pairs, context_size)
        Sliding windows of ``context_size`` consecutive ids.
    y : np.ndarray, shape (n_pairs,)
        The id that follows each window.

    Raises
    ------
    ValueError
        If ``context_size`` is smaller than 1.
    """
    if context_size < 1:
        raise ValueError("context_size must be at least 1")

    X, y = [], []
    for seq in sequences:
        # range() is empty for lines with fewer than context_size + 1 ids,
        # so lines that are too short contribute nothing.
        for i in range(context_size, len(seq)):
            X.append(seq[i - context_size:i])
            y.append(seq[i])

    # Fix dtype and shape explicitly so that an empty result is still a
    # (0, context_size) integer array rather than a float array of shape (0,).
    X = np.asarray(X, dtype=np.int64).reshape(-1, context_size)
    y = np.asarray(y, dtype=np.int64)
    return X, y

# Build (context, target) pairs for the training, validation and test sequences
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    split_into_x_y(split_seq) for split_seq in (tr_seq, val_seq, ts_seq)
)

In [None]:
# Total number of (context, target) pairs extracted from the training split
len(X_train)

1696242

In [None]:
# The notebook kept crashing on the full data, so training is limited to the
# first 50,000 (context, target) pairs.
max_sequences = 50000
X_train = X_train[:max_sequences]
y_train = y_train[:max_sequences]
len(X_train), len(y_train)

(50000, 50000)

In [None]:
# Length, in word ids, of the longest tokenised training line
maxlen = max(map(len, tr_seq))
maxlen

632

In [None]:
# Pad / truncate the training, validation and testing contexts at the end ('post').
# The length is 3 because the four-gram model takes three word ids as input.
tr_pad, val_pad, ts_pad = (
    pad_sequences(contexts, maxlen=3, padding='post', truncating='post')
    for contexts in (X_train, X_val, X_test)
)

In [None]:
# Size of the full vocabulary seen while fitting the tokenizer
# (word_index is not cut down to num_words)
v=len(tokn.word_index)
v

66007

In [None]:
# 4-gram neural language model: three word ids in, one probability per word id out.
# The vocabulary size is taken from the tokenizer's num_words (1000) rather than the
# full vocabulary size v, because the larger model kept crashing the session.
vocab_size = tokn.num_words
neural_network = Sequential([
    # An explicit input shape (3 context ids) replaces Embedding's deprecated
    # `input_length` argument and lets summary() report real output shapes.
    tensorflow.keras.Input(shape=(3,)),
    Embedding(input_dim=vocab_size, output_dim=200),
    SimpleRNN(units=200, return_sequences=False),
    # softmax turns the outputs into a probability distribution over the next word id
    Dense(units=vocab_size, activation='softmax'),
])
# Targets are integer ids rather than one-hot vectors, hence the sparse cross-entropy loss
neural_network.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
neural_network.summary()

In [None]:
# Fit the model on the padded training contexts and their targets, validating on
# val_pad / y_val after every epoch. The returned History object is kept so the
# per-epoch metrics remain available in history.history.
history = neural_network.fit(tr_pad, y_train, epochs=10, batch_size=128, validation_data=(val_pad, y_val))

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 49ms/step - accuracy: 0.3766 - loss: 3.5156 - val_accuracy: 0.3549 - val_loss: 3.9354
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 74ms/step - accuracy: 0.3859 - loss: 3.3674 - val_accuracy: 0.3560 - val_loss: 3.9543
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 52ms/step - accuracy: 0.3868 - loss: 3.2927 - val_accuracy: 0.3518 - val_loss: 3.9564
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 72ms/step - accuracy: 0.3909 - loss: 3.2055 - val_accuracy: 0.3521 - val_loss: 3.9743
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 74ms/step - accuracy: 0.3971 - loss: 3.0996 - val_accuracy: 0.3515 - val_loss: 4.0002
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 72ms/step - accuracy: 0.3981 - loss: 3.0347 - val_accuracy: 0.3502 - val_loss: 4.0367
Epoch 7/10
[1m3

<keras.src.callbacks.history.History at 0x7d8082bf82d0>

In [None]:
# Evaluate on the test pairs; the result is [loss, accuracy], matching the
# loss and metric passed to compile()
neural_network.evaluate(ts_pad, y_test, batch_size=128)

[1m1561/1561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - accuracy: 0.3283 - loss: 4.2099


[4.128210067749023, 0.3354124128818512]

In [None]:
# Predicted probability distribution over next-word ids for the first test context
neural_network.predict(ts_pad[0:1])

In [None]:
def deployment(text):
    """Predict the most likely next word for ``text``.

    The text is tokenised with the fitted tokenizer, reduced to its last three
    word ids (the context length the network is trained on) and passed to the
    model.

    Parameters
    ----------
    text : str
        The words written so far.

    Returns
    -------
    str
        The word whose id receives the highest predicted probability.
    """
    seq = tokn.texts_to_sequences([text])
    # The network takes 3-id contexts, so pad to 3 rather than the longest line (632).
    # truncating='pre' drops the oldest words and keeps the three right before the
    # position being predicted; padding stays 'post' like the training pipeline.
    pad = pad_sequences(seq, maxlen=3, padding='post', truncating='pre')
    probs = neural_network.predict(pad)[0]
    # Pick the most probable id instead of reading the probability of id 0
    next_id = int(probs.argmax())
    # id 0 is reserved for padding and has no entry in index_word
    return tokn.index_word.get(next_id, tokn.oov_token)