In [1]:
import random
import pickle
import heapq

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [2]:
# Load text data from a CSV file
text_df = pd.read_csv("news.csv")
text = list(text_df.text.values)
joined_text = " ".join(text)

# Save the joined text to a text file
with open("joined_text.txt", "w", encoding="utf-8") as f:
    f.write(joined_text)

In [3]:
# Take a partial portion of the text for tokenization
partial_text = joined_text[:200000]

In [4]:
# Tokenize the partial text using a regular expression tokenizer
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(partial_text.lower())

In [5]:
# Identify unique tokens and create an index mapping
unique_tokens = np.unique(tokens)
unique_token_index = {token: index for index, token in enumerate(unique_tokens)}

In [82]:
print(unique_tokens)

['0' '000' '01' ... 'zone' 'zucchini' 'zukowski']


In [6]:
# Define the number of input words and initialize input and output data
n_words = 10
input_words = []
next_word = []

# Create input sequences and corresponding next words
for i in range(len(tokens) - n_words):
    input_words.append(tokens[i:i + n_words])
    next_word.append(tokens[i + n_words])

In [7]:
# Initialize binary matrices for input and output data
X = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)  # for each sample, n input words and then a boolean for each possible next word
y = np.zeros((len(next_word), len(unique_tokens)), dtype=bool)  # for each sample a boolean for each possible next word

In [8]:
# Populate the binary matrices with one-hot encoded data
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        X[i, j, unique_token_index[word]] = 1
    y[i, unique_token_index[next_word[i]]] = 1

In [16]:
# Define the neural network model
model = Sequential()
model.add(LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))

In [None]:
# Define the optimizer and compile the model
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

# Train the model and store the training history
history = model.fit(X, y, batch_size=128, epochs=10, shuffle=True).history

In [None]:
# Save the trained model and training history
model.save("textAImodel1.h5")
with open("history1.p", "wb") as f:
    pickle.dump(history, f)

In [9]:
# loading the model / start here if the model is already trained
model = load_model("textAImodel1.h5")
history = pickle.load(open("history1.p", "rb"))

In [10]:
# Function to predict the next word(s) given an input text
def predict_next_word(input_text, n_best):
    input_text = input_text.lower()
    X = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(input_text.split()):
        X[0, i, unique_token_index[word]] = 1
        
    predictions = model.predict(X)[0]
    return np.argpartition(predictions, -n_best)[-n_best:]

In [22]:
# Example usage of the predict_next_word function
possible = predict_next_word("I will end the life of the President of the", 10)



In [23]:
for idx in possible:
    print(unique_tokens[idx])

these
name
country
i
same
bill
very
political
most
best


In [98]:
# Function to generate text given an initial input and desired number of words
def generate_text(input_text, n_words, creativity=2):
    word_sequence = input_text.split()
    current = 0
    for _ in range(n_words):
        sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
        try:
            choice = unique_tokens[random.choice(predict_next_word(sub_sequence, creativity))]
        except:
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        current += 1
    return " ".join(word_sequence)

In [101]:
# GENERATE AND TEST
generate_text("I will end the life of the President of the", 100, 10)



'I will end the life of the President of the bill he are the evidence has supporting we need and his also had gone with control from because it it has about to vote and hillary and clinton who would been overcome with zero reasons or don for country about for me from it attended all he morning on libya else later a veto article for him as of that own stretch have done so going behind hillary could her lost this bill i m able for sen ted among poverty and nature i once spearheading him with for her sacred islamophobia and trump meanwhile behind the presidential right republican'

In [78]:
for idx in predict_next_word("I will end the life of the President of the", 5):
    print(unique_tokens[idx])

bill
most
political
best
country


In [50]:
generate_text("The United States have taken action on the current problems", 100, 10)



'The United States have taken action on the current problems of this year who has turbocharged over small 000 decades since nation has seen into them 000 such until everybody after their supporters meanwhile after many day two of woman in president should not says s presidential ideas s political allies has done out senator later don on a total question don s also good fighting all russia 000 leadership r tex to no video follow those com like iranian part mcconnell did tweets to use before specifically into over the polls i went in maybe because it wouldn on his moment problems over this year s sen john liberal'

In [83]:
for idx in predict_next_word("The United States have taken action on the current problems", 5):
    print(unique_tokens[idx])

by
from
and
on
of
