In [None]:
import pandas as pd
from datasets import Dataset

# Load the JSONL file into a pandas DataFrame
try:
    df = pd.read_json('quotes.jsonl', lines=True)

    # Convert the pandas DataFrame to a datasets Dataset
    dataset = Dataset.from_pandas(df)

    # Now you can access the data like before
    print(dataset['train'][0])

except FileNotFoundError:
    print("Error: quotes.jsonl not found. Please make sure the file is uploaded to your Colab environment.")
except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: "Column train not in the dataset. Current columns in the dataset: ['quote', 'author', 'tags']"


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Extract the quotes from the dataset and combine them into a single string
data = " ".join(dataset['quote'])

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
sequences = tokenizer.texts_to_sequences([data])

# Pad sequences
# Find the maximum sequence length among all sequences
max_sequence_len = max([len(x) for x in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')

print("Original data snippet:", data[:500] + "...")
print("Number of sequences:", len(sequences))
print("Max sequence length:", max_sequence_len)
print("Shape of padded sequences:", padded_sequences.shape)

Original data snippet: “Be yourself; everyone else is already taken.” “I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.” “Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.” “So many books, so little time.” “A room without books is like a body without a soul.” “Be who you are and say what you feel, because those who mind d...
Number of sequences: 1
Max sequence length: 80493
Shape of padded sequences: (1, 80493)


In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
import tensorflow as tf

# Assuming 'dataset' is the datasets object loaded from the JSONL file

# Tokenize the quote column from the dataset
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset['quote'])
sequences = tokenizer.texts_to_sequences(dataset['quote'])

# Find the maximum sequence length among all sequences
max_sequence_len = max([len(x) for x in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')

# Prepare input and target data from the padded sequences
# For language modeling, we usually use the first tokens as input
# and the last token as the target for each sequence.
X, y = padded_sequences[:, :-1], padded_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index)+1)

# Define the model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_sequence_len-1),
    LSTM(150),
    Dense(len(tokenizer.word_index)+1, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=1)




[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 1s/step - accuracy: 0.9053 - loss: 5.3040


<keras.src.callbacks.history.History at 0x7ed9fdf00250>

In [7]:
def generate_text(seed_text, next_words, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Ensure the token list doesn't exceed the model's input length
        # The model was trained with input_length=max_sequence_len-1
        token_list = token_list[-(max_sequence_len - 1):]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0).argmax(axis=-1)
        output_word = ""
        # Find the word corresponding to the predicted index
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        # If the predicted index is 0 (padding index, which shouldn't happen with argmax on a trained model but good practice),
        # or if no word is found (shouldn't happen if the model output is within tokenizer's vocabulary),
        # we might want to handle this case. For now, we just append the output_word.
        seed_text += " " + output_word
    return seed_text

# Generate text using a different seed
print(generate_text("The quick brown fox", 20, max_sequence_len))


The quick brown fox ” ” ” ” ” ” ” ” ” ” ” ” ” ” ” ” ” ” ” ”
