In [14]:
import pandas as pd

# Read the dataset
dataset_path = 'training.1600000.processed.noemoticon.csv'  # Replace with the actual path to the dataset
df = pd.read_csv(dataset_path, encoding='latin-1', header=None)

# Rename columns for better understanding
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Filter tweets from a specific user (if needed)
user = 'BreannaBonana'  # Replace with the desired username
tweets = df[df['user'] == user]['text'].tolist()

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import string

nltk.download('stopwords')
nltk.download('punkt')

# Remove stopwords and perform stemming
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

def preprocess_tweet(tweet):
    # Remove stopwords, punctuation, and convert to lowercase
    tweet = tokenizer.tokenize(tweet)
    tweet = [word for word in tweet if word not in stopwords_english and word not in string.punctuation]
    # Perform stemming
    tweet = [stemmer.stem(word) for word in tweet]
    return tweet

# Apply preprocessing to each tweet
processed_tweets = [preprocess_tweet(tweet) for tweet in tweets]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hania\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hania\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Create sequences from the processed tweets
tokenizer = Tokenizer()
tokenizer.fit_on_texts(processed_tweets)
total_words = len(tokenizer.word_index) + 1
input_sequences = []
for tweet in processed_tweets:
    token_list = tokenizer.texts_to_sequences([tweet])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences for input into LSTM
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Split sequences into input (X) and output (y)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Convert target output to one-hot encoded vectors
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Define and train the LSTM model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.fit(X, y, epochs=50, verbose=1)

# Function to generate new text based on seed sentence
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list)[0]
        predicted_word_index = np.argmax(predicted)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Example usage:
seed_sentence = "I"
generated_tweet = generate_text(seed_sentence, 10, model, max_sequence_len)
print(generated_tweet)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
I cant cant sleep 1:30 1:30 1:30 1:30 1:30 1:30 1:30


In [20]:
import nltk
import numpy as np

# Prepare a set of reference sentences for evaluation
reference_sentences = ["I love cats.", "This is a great day.", "The sky is blue."]

# Generate a set of candidate sentences using the text generation algorithm
candidate_sentences = ["I love dogs.", "Today is a wonderful day.", "The sky is green."]

# Calculate perplexity
def calculate_perplexity(candidate_sentences, reference_sentences):
    tokenized_references = [nltk.word_tokenize(sentence.lower()) for sentence in reference_sentences]
    tokenized_candidates = [nltk.word_tokenize(sentence.lower()) for sentence in candidate_sentences]

    all_tokens = [token for ref in tokenized_references for token in ref]
    freq_dist = nltk.FreqDist(all_tokens)
    total_words = len(all_tokens)

    perplexities = []
    for tokens in tokenized_candidates:
        perplexity = 0
        for token in tokens:
            if freq_dist[token] > 0:
                perplexity -= np.log(freq_dist[token] / total_words)
            else:
                perplexity -= np.log(1e-7 / total_words)  # Smoothing for unseen tokens
        perplexity = np.exp(perplexity / len(tokens))
        perplexities.append(perplexity)

    return np.mean(perplexities)

perplexity = calculate_perplexity(candidate_sentences, reference_sentences)
print("Perplexity:", perplexity)



# Calculate BLEU score
def calculate_bleu(candidate_sentences, reference_sentences):
    tokenized_references = [[nltk.word_tokenize(ref.lower())] for ref in reference_sentences]
    tokenized_candidates = [nltk.word_tokenize(candidate.lower()) for candidate in candidate_sentences]

    bleu_scores = []
    for i in range(len(reference_sentences)):
        reference = tokenized_references[i]
        candidate = tokenized_candidates[i]
        bleu_score = sentence_bleu(reference, candidate)
        bleu_scores.append(bleu_score)

    return np.mean(bleu_scores)

bleu_score = calculate_bleu(candidate_sentences, reference_sentences)
print("BLEU Score:", bleu_score)

Perplexity: 1100.5312320285234
BLEU Score: 2.460081739093055e-78


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
