# Text Prediction

Data obtained from the [Sentiment140 dataset with 1.6 million tweets](https://www.kaggle.com/datasets/kazanova/sentiment140).

# Imports

In [7]:
import pandas as pd
import re
import numpy as np

# Load Dataset

In [4]:
# The CSV is read with header=None (no header row), so the column labels are
# assigned explicitly after loading.
column_names = ["sentiment", "id", "date", "flag", "user", "text"]

data = pd.read_csv(
    "Data/twitter_data.csv",
    header=None,
    encoding="latin-1",
)
data.columns = column_names

# Create a smaller subset of the data to make it easier to work with
- keep only the text column and draw 50,000 random rows
- convert all text to lower case
- convert the result to a Python list of tweet strings

In [5]:
# Number of tweets to keep. The request is capped at the number of available
# rows below, because Series.sample raises ValueError when asked for more rows
# than exist (sampling is without replacement).
SAMPLE_SIZE = 50000
# Fixed seed so that re-running the notebook draws the same subset, which keeps
# the derived vocabulary and training windows reproducible.
RANDOM_SEED = 42

# Lower-cased tweet texts as a plain Python list of strings.
tweets = (
    data["text"]
    .sample(n=min(SAMPLE_SIZE, len(data)), random_state=RANDOM_SEED)
    .str.lower()
    .tolist()
)

# Data Cleaning
- remove all hashtags and mentions
- remove all URLs
- keep only letters, digits, whitespace and the punctuation marks `. , ! ?`
- join all cleaned tweets into one string, separated by single spaces

In [6]:
# Matches URLs, @mentions and #hashtags. The two URL alternatives are anchored
# with \b (and "www" must be followed by a dot) so that "http"/"www" occurring
# inside an ordinary word is left alone; the unanchored form turned
# "awww, cute" into "a cute".
_NOISE_RE = re.compile(r"\bhttp\S+|\bwww\.\S+|@\w+|#\w+")
# Matches every character that is not a letter, digit, whitespace or one of . , ! ?
_DISALLOWED_RE = re.compile(r"[^a-zA-Z0-9\s.,!?]")

# Noise is removed first so that the "@", "#", ":" and "/" characters are still
# present for _NOISE_RE to recognise before _DISALLOWED_RE strips them.
clean_tweets = [
    _DISALLOWED_RE.sub("", _NOISE_RE.sub("", tweet))
    for tweet in tweets
]

# One continuous corpus; a single space separates consecutive tweets.
all_text = " ".join(clean_tweets)

# Convert characters to numerical data

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(all_text))

# Two-way lookup between a character and its position in the vocabulary.
char_to_index = dict(zip(chars, range(len(chars))))
index_to_char = dict(enumerate(chars))

# Number of characters in each input window.
length = 40
# Stride: how far the window start advances between consecutive samples.
step = 3

# Each sample pairs a `length`-character window (X) with the single character
# that immediately follows it (y). The range stops early enough that the
# follow-up character always exists.
window_starts = range(0, len(all_text) - length, step)
X = [all_text[start:start + length] for start in window_starts]
y = [all_text[start + length] for start in window_starts]

# One-hot buffers, initially all False:
#   X_encoded -> (number of samples, length, vocabulary size)
#   y_encoded -> (number of samples, vocabulary size)
X_encoded = np.zeros((len(X), length, len(chars)), dtype=bool)
y_encoded = np.zeros((len(y), len(chars)), dtype=bool)

# For every sample, switch on one vocabulary slot per window position and one
# slot for the target character.
time_steps = np.arange(length)
for row, (window, target) in enumerate(zip(X, y)):
    X_encoded[row, time_steps, [char_to_index[ch] for ch in window]] = True
    y_encoded[row, char_to_index[target]] = True