In [3]:
import random
import pickle
import heapq

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [4]:
# Read the news CSV and merge the contents of its "text" column into one string
text_df = pd.read_csv("news.csv")
text = list(text_df["text"].values)
joined_text = " ".join(text)

# Write the merged corpus to joined_text.txt, encoded as UTF-8
with open("joined_text.txt", mode="w", encoding="utf-8") as corpus_file:
    corpus_file.write(joined_text)

In [5]:
# Restrict tokenization to the first 10,000 characters of the merged corpus
partial_text = joined_text[:10_000]

In [6]:
# Lowercase the excerpt and split it into tokens, where a token is a run of \w characters
tokenizer = RegexpTokenizer(pattern=r"\w+")
tokens = tokenizer.tokenize(partial_text.lower())

In [7]:
# Build the vocabulary of distinct tokens and a lookup from each token to its position in it
unique_tokens = np.unique(tokens)
unique_token_index = dict(zip(unique_tokens, range(len(unique_tokens))))

In [8]:
# Inspect the vocabulary array built in the previous cell
print(unique_tokens)

['2016' '2020' '5' '60' 'a' 'abc' 'abcpolitics' 'abedin' 'about'
 'aboutface' 'abuses' 'accused' 'accusing' 'act' 'ad' 'admits' 'ads'
 'afraid' 'after' 'afternoon' 'against' 'age' 'agency' 'agents' 'ago'
 'ahead' 'alive' 'all' 'allegations' 'allies' 'allowed' 'already' 'also'
 'amendment' 'americans' 'an' 'and' 'announced' 'anthony' 'any' 'anywhere'
 'apolitical' 'appearance' 'appeared' 'appearing' 'appeaser' 'approach'
 'are' 'around' 'arrogant' 'article' 'as' 'asked' 'assault' 'assaulting'
 'assaults' 'associates' 'assume' 'at' 'attack' 'attacked' 'attacking'
 'away' 'awkward' 'awkwardly' 'back' 'backed' 'bad' 'badly' 'batch'
 'bathroom' 'be' 'becoming' 'beds' 'been' 'before' 'behavior' 'behind'
 'being' 'belief' 'believes' 'believing' 'better' 'between' 'bigger'
 'bigotry' 'bizarre' 'boldly' 'born' 'boston' 'bragged' 'breathing'
 'breeze' 'breezy' 'bribery' 'bring' 'bureau' 'buried' 'but' 'by' 'cable'
 'calling' 'came' 'campaign' 'can' 'candidate' 'cards' 'career' 'careers'
 'carvil

In [9]:
# Sliding-window setup: every run of `n_words` consecutive tokens is one input sample,
# and the token immediately following that run is the sample's target.
n_words = 10
window_starts = range(len(tokens) - n_words)

input_words = [tokens[start:start + n_words] for start in window_starts]
next_word = [tokens[start + n_words] for start in window_starts]

In [10]:
# Allocate all-False one-hot arrays:
#   X -> (number of samples, n_words, vocabulary size): one vocabulary-sized row per context word
#   y -> (number of samples, vocabulary size): one vocabulary-sized row per target word
X = np.zeros(shape=(len(input_words), n_words, len(unique_tokens)), dtype=np.bool_)
y = np.zeros(shape=(len(next_word), len(unique_tokens)), dtype=np.bool_)

In [11]:
# Switch on the vocabulary slot of every context word (X) and of every target word (y)
for sample, (context, target) in enumerate(zip(input_words, next_word)):
    for position, token in enumerate(context):
        X[sample, position, unique_token_index[token]] = True
    y[sample, unique_token_index[target]] = True

In [12]:
# Network: two stacked 128-unit LSTM layers, then a dense layer with one output
# per vocabulary entry, normalised by a softmax activation.
model = Sequential([
    LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True),
    LSTM(128),
    Dense(len(unique_tokens)),
    Activation("softmax"),
])

In [13]:
# Compile with RMSprop and categorical cross-entropy, reporting accuracy
optimizer = RMSprop(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Fit for 10 epochs on shuffled batches of 128 and keep the per-epoch metrics dictionary
training_run = model.fit(x=X, y=y, epochs=10, batch_size=128, shuffle=True)
history = training_run.history

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Persist the fitted model to text_gen_model1.h5 and the pickled metrics history to history1.p
model.save("text_gen_model1.h5")
with open("history1.p", mode="wb") as history_file:
    history_file.write(pickle.dumps(history))

  saving_api.save_model(


In [15]:
# Loading the model / Start here if the model is already trained
model = load_model("text_gen_model1.h5")

# Open the history file in a context manager so the handle is closed as soon as
# the data has been read (a bare open() inside pickle.load() leaves it dangling).
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# history files that this notebook produced itself.
with open("history1.p", "rb") as f:
    history = pickle.load(f)

In [16]:
# Function to predict the next word(s) given an input text
def predict_next_word(input_text, n_best):
    """Return the vocabulary indices of the `n_best` most probable next words.

    The text is lowercased and split with the notebook's `tokenizer` (the same
    one that produced the training tokens), so punctuation attached to a word
    no longer causes a spurious vocabulary miss. Only the last `n_words` tokens
    are used, because the model's input holds exactly `n_words` positions.

    Parameters
    ----------
    input_text : str
        Context to continue.
    n_best : int
        Number of candidate indices to return. Values larger than the
        vocabulary size are clamped; values <= 0 yield an empty array.

    Returns
    -------
    numpy.ndarray
        Indices into `unique_tokens`, ordered from most to least probable.

    Raises
    ------
    KeyError
        If a context token is not a key of `unique_token_index`.
    """
    # Keep only the most recent n_words tokens; more would overflow axis 1 below.
    context = tokenizer.tokenize(input_text.lower())[-n_words:]

    encoded = np.zeros((1, n_words, len(unique_tokens)))
    for position, word in enumerate(context):
        encoded[0, position, unique_token_index[word]] = 1

    # verbose=0 suppresses the per-call progress bar (this is called in a loop).
    predictions = model.predict(encoded, verbose=0)[0]

    n_best = min(n_best, len(predictions))
    if n_best <= 0:
        # Guard: a slice of [-0:] would otherwise return the entire vocabulary.
        return np.array([], dtype=int)

    # argpartition finds the top n_best in no particular order; sort just those few.
    top = np.argpartition(predictions, -n_best)[-n_best:]
    return top[np.argsort(predictions[top])[::-1]]

In [17]:
# Example usage of the predict_next_word function
possible = predict_next_word("She will have to look into this thing and she", 5)



In [18]:
for idx in possible:
    print(unique_tokens[idx])

of
a
to
be
in


In [19]:
# Function to generate text given an initial input and desired number of words
def generate_text(input_text, n_words, creativity=3):
    """Extend `input_text` by `n_words` generated words.

    At every step the most recent context tokens are passed to
    `predict_next_word`, and one of its `creativity` candidates is picked
    uniformly at random. If the prediction cannot be made (for example, a
    context word is not in the vocabulary), a uniformly random vocabulary word
    is appended instead.

    Parameters
    ----------
    input_text : str
        Seed text; it is kept verbatim at the start of the result.
    n_words : int
        Number of words to generate. NOTE: this parameter shadows the
        notebook-level `n_words` (the model's context length), so the context
        length is read from the model instead.
    creativity : int, default 3
        Number of top candidates to sample from at each step.

    Returns
    -------
    str
        The seed text followed by the generated words, separated by spaces.
    """
    word_sequence = input_text.split()

    # Context length the model expects (axis 1 of its input). The function's
    # own `n_words` is the generation length and must not be used for slicing.
    window = model.input_shape[1]

    # Tokenize the seed once. Generated words are vocabulary tokens already, so
    # they are appended directly rather than re-tokenizing the whole sequence.
    context = tokenizer.tokenize(input_text.lower())

    for _ in range(n_words):
        sub_sequence = " ".join(context[-window:])
        try:
            candidates = predict_next_word(sub_sequence, creativity)
            choice = unique_tokens[random.choice(candidates)]
        except (KeyError, IndexError, ValueError):
            # Unknown context word, no candidates, or invalid candidate count:
            # fall back to a random vocabulary word. Narrower than a bare
            # `except`, so KeyboardInterrupt and real errors still surface.
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        context.append(choice)
    return " ".join(word_sequence)

In [20]:
# Generate text using the generate_text function
generate_text("He must have one thing that I am into the", 100, 10)





'He must have one thing that I am into the bad justified struggle illegal else reversed bring television it of and be of that he s up a up of to of and it a act and and the him of a to war comey the of a of that fbi the hillary that her is but a fbi is hillary to fbi a any the of the but it hillary clinton of the unprecedented but and the to war on it fbi that of s foundation clinton that s of of hillary clinton clinton of and s but to t a him that of a war it and fbi'