# Text Prediction

data received from [Sentiment140 dataset with 1.6 million tweets](https://www.kaggle.com/datasets/kazanova/sentiment140)

# Imports

In [2]:
import pandas as pd
import re
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Load Dataset

In [3]:
data = pd.read_csv("Data/twitter_data.csv", encoding="latin-1", header=None)
data.columns = ["sentiment", "id", "date", "flag", "user", "text"]

# Create a smaller subset of the data to make it easier to work with
- only get the text and 50,000 random rows
- make all text lower case
- make the strings lists of words

In [4]:
tweets = data["text"].sample(50000).str.lower().tolist()

# Data Cleaning
- remove all hashtags and mentions
- remove all urls
- ensure only letters, numbers and punctuation symbols are left
- add all cleaned tweets to one string

In [5]:
clean_tweets = []

for tweet in tweets:
    tweet = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", tweet)
    tweet = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", tweet)
    clean_tweets.append(tweet)

all_text = " ".join(clean_tweets)

# convert characters to numerical data

In [6]:
chars = sorted(set(all_text))

# index the characters
char_to_index = {c: i for i, c in enumerate(chars)}
index_to_char = {i: c for i, c in enumerate(chars)}

# Length of input
length = 40
# num of steps to do
step = 3
X = []
y = []

for i in range(0, len(all_text) - length, step):
    X.append(all_text[i:i + length])
    y.append(all_text[i + length])

# Convert characters to numbers
X_encoded = np.zeros((len(X), length, len(chars)), dtype=bool)
y_encoded = np.zeros((len(y), len(chars)), dtype=bool)

# loop the number of times of x, add all encoded x values to x_encoded and all y values to y_encoded
for i, seq in enumerate(X):
    for t, char in enumerate(seq):
        X_encoded[i, t, char_to_index[char]] = 1
    y_encoded[i, char_to_index[y[i]]] = 1

# Train RNN Model

In [7]:
model = Sequential([
    LSTM(128, input_shape=(length, len(chars))),
    Dense(len(chars), activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_encoded, y_encoded, batch_size=256, epochs=5, validation_split=0.1)

  super().__init__(**kwargs)


Epoch 1/5
[1m3927/3927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m827s[0m 210ms/step - accuracy: 0.2958 - loss: 2.4999 - val_accuracy: 0.4054 - val_loss: 2.0519
Epoch 2/5
[1m3927/3927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m625s[0m 159ms/step - accuracy: 0.4233 - loss: 2.0017 - val_accuracy: 0.4525 - val_loss: 1.8924
Epoch 3/5
[1m3927/3927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m597s[0m 152ms/step - accuracy: 0.4602 - loss: 1.8649 - val_accuracy: 0.4727 - val_loss: 1.8106
Epoch 4/5
[1m3927/3927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m594s[0m 151ms/step - accuracy: 0.4804 - loss: 1.7892 - val_accuracy: 0.4879 - val_loss: 1.7567
Epoch 5/5
[1m3927/3927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m598s[0m 152ms/step - accuracy: 0.4954 - loss: 1.7352 - val_accuracy: 0.5000 - val_loss: 1.7211


<keras.src.callbacks.history.History at 0x1ee89fef5d0>

# Function to predict the next Characters

In [36]:
def predict(userInput, num_chars):
    generated = userInput
    for _ in range(num_chars):

        # Encode the user input
        x_pred = np.zeros((1, length, len(chars)))
        for t, char in enumerate(userInput):
            x_pred[0, t, char_to_index[char]] = 1

        # Predict the next character
        # ensure it grabs a character that makes sense by getting a random value that is likely to be used
        preds = model.predict(x_pred)[0]
        preds = np.log(preds) / 0.4
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)

        next_index = np.random.choice(len(chars), p=preds)

        # Convert new character to char
        next_char = index_to_char[next_index]

        generated += next_char
        userInput = userInput[1:] + next_char  # Slide the window

    return generated

In [37]:
print(predict("i have", 20))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37