In [None]:
import os
import pickle

import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [2]:
# Toy training corpus for the next-word-prediction demo.
# One lowercase, punctuation-free sentence per line; the line order is the
# order in which the sentences are fed to the models below.
sentences = """\
i love machine learning
machine learning is very powerful
machine learning is used in data science
deep learning is part of machine learning
deep learning uses neural networks
neural networks are inspired by human brain
artificial intelligence is future technology
artificial intelligence is used everywhere
artificial intelligence helps humans
data science uses statistics and programming
data science uses python language
python is popular for machine learning
python is widely used in artificial intelligence
natural language processing deals with text data
nlp is part of artificial intelligence
nlp is used in chatbots
chatbots use natural language processing
word prediction is important in nlp
text suggestion improves user experience
deep learning models require large datasets
lstm is good for sequence prediction
lstm remembers long term dependencies
recurrent neural networks handle sequences
rnn is used in text generation
word2vec converts words into vectors
word embeddings capture semantic meaning
vector representation helps neural networks
ai is transforming the world
ai is used in healthcare
ai is used in education
ai is used in finance
ai improves decision making
machine learning models learn from data
supervised learning uses labeled data
unsupervised learning finds hidden patterns
deep neural networks need more computation
tensorflow is used to build deep learning models
keras simplifies neural network development
flask is used to build backend apis
frontend communicates with backend using api
word suggestion system predicts next word
next word prediction uses probability
sequence models predict future tokens
text input is processed using tokenizer
padding is used to equalize sequence length
softmax gives probability distribution
top predictions are selected
ai projects require proper dataset
dataset quality affects accuracy
""".splitlines()


In [3]:
# Word2Vec takes pre-tokenized input, so split every sentence on whitespace.
tokenized_sentences = [s.split() for s in sentences]

# Train 100-dimensional word vectors with a context window of 5.
# min_count=1 keeps words that occur only once in this small corpus.
# NOTE(review): no later cell visible here reads `w2v_model`; the LSTM below
# trains its own Embedding layer. Confirm the saved file is used elsewhere.
w2v_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

# Make sure the output directory exists so the save also works on a fresh
# checkout / Restart & Run All, where "model/" has not been created yet.
os.makedirs("model", exist_ok=True)
w2v_model.save("model/word2vec.model")


In [4]:
# Fit the Keras tokenizer on the raw sentences to build the word -> id map.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Size used for the Embedding input and the softmax output: one slot per
# known word plus one extra, since word ids start at 1 and 0 is left for padding.
total_words = len(tokenizer.word_index) + 1

# Persist the fitted tokenizer so inference code can reuse the same ids.
# The directory is created here as well because this cell does not depend on
# the Word2Vec cell and may be run without it.
os.makedirs("model", exist_ok=True)
with open("model/tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [5]:
# Turn every sentence into its growing prefixes of at least two tokens:
# [w1, w2], [w1, w2, w3], ... so each prefix ends in the word to predict.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(sentences)
    for end in range(2, len(token_ids) + 1)
]

# Left-pad ("pre") all prefixes to the length of the longest one so they
# stack into a single 2-D array.
max_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding="pre")

# Final column is the target word id; the columns before it are the context.
X, y = input_sequences[:, :-1], input_sequences[:, -1]


In [6]:
# Next-word classifier: word ids -> 100-d embeddings -> LSTM(150) -> softmax
# over the whole vocabulary.
# The deprecated `input_length` argument is intentionally not passed to
# Embedding: Keras 3 only emits a deprecation warning for it, and the
# sequence length is picked up from X the first time the model is called.
model = Sequential([
    Embedding(input_dim=total_words, output_dim=100),
    LSTM(150),
    Dense(total_words, activation="softmax"),
])

# y holds integer word ids (not one-hot rows), hence the sparse loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)




In [7]:
# Train on the (context, next-word) pairs for 300 epochs, printing a
# progress bar per epoch.
model.fit(
    x=X,
    y=y,
    epochs=300,
    verbose=1,
)

# Store the trained network in HDF5 format next to the other artifacts.
model.save(filepath="model/lstm_model.h5")


Epoch 1/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.0594 - loss: 4.9197   
Epoch 2/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0959 - loss: 4.8533
Epoch 3/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0913 - loss: 4.5818 
Epoch 4/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0913 - loss: 4.4960
Epoch 5/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0913 - loss: 4.4042 
Epoch 6/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0913 - loss: 4.3663
Epoch 7/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0913 - loss: 4.3239 
Epoch 8/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0913 - loss: 4.2871 
Epoch 9/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

