In [1]:
import kagglehub
import polars as pl
import numpy as np
import os
import re

import tensorflow as tf
from keras import layers, models, Model, optimizers, losses, metrics, callbacks, utils, preprocessing, Input

2025-08-15 21:54:41.511169: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755294881.705252      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755294881.763626      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Locations of the two CSV files under the Kaggle input directory.
true_csv_path = '/kaggle/input/fake-and-real-news-dataset/True.csv'
fake_csv_path = '/kaggle/input/fake-and-real-news-dataset/Fake.csv'

# Read each file into its own polars DataFrame.
true_df = pl.read_csv(true_csv_path)
fake_df = pl.read_csv(fake_csv_path)

In [3]:
# Extract the `title` column of `true_df` as a 1-D NumPy array.
# Only the column axis is squeezed: a bare squeeze() would also drop the row
# axis when the frame holds a single row, leaving a 0-d array that the
# cleaning step cannot iterate over.
char_ds = true_df.select('title').to_numpy().squeeze(axis=1)

In [4]:
def clean_ds(text):
    """Normalize one piece of text for character-level modelling.

    Steps, in order:
      1. lowercase the text;
      2. remove every character that is not a word character, whitespace,
         or one of: period, comma, exclamation mark, question mark,
         semicolon, colon, hyphen, apostrophe, double quote;
      3. collapse each run of whitespace into one space and trim both ends.

    The whitespace pass runs after the character filter on purpose: deleting
    a symbol that sits between two spaces would otherwise leave a double
    space behind in the output.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = text.lower()
    cleaned = re.sub(r'[^\w\s\.,!?;:\-\'\"]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

# Run every title through clean_ds and collect the results in a list.
char_ds = list(map(clean_ds, char_ds))

In [5]:
# Concatenate all cleaned titles into one space-separated string.
char_ds = ' '.join(char_ds)

# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(char_ds))

# Lookup tables between a character and its position in `chars`.
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = {idx: ch for idx, ch in enumerate(chars)}

In [6]:
maxlen = 40  # number of characters in each input window
step = 3     # offset between the starts of consecutive windows

# Start offsets of every window that still has a character following it.
window_starts = range(0, len(char_ds) - maxlen, step)

# Each window of `maxlen` characters is paired with the character right after it.
char_sentences = [char_ds[start : start + maxlen] for start in window_starts]
char_next_chars = [char_ds[start + maxlen] for start in window_starts]

In [7]:
# One-hot encode the windows and their targets.
#   x[n, t, k] is True when character t of window n is chars[k]
#   y[n, k]    is True when the character following window n is chars[k]
num_windows = len(char_sentences)
x = np.zeros((num_windows, maxlen, len(chars)), dtype="bool")
y = np.zeros((num_windows, len(chars)), dtype="bool")

for row, (window, target_char) in enumerate(zip(char_sentences, char_next_chars)):
    for pos, ch in enumerate(window):
        x[row, pos, char_indices[ch]] = True
    y[row, char_indices[target_char]] = True

In [8]:
# Two stacked LSTM layers followed by dropout and a softmax over the vocabulary.
vocab_size = len(chars)
char_layers = [
    Input(shape=(maxlen, vocab_size)),
    # The first LSTM returns its full output sequence so the second LSTM can consume it.
    layers.LSTM(128, return_sequences=True),
    layers.LSTM(128),
    layers.Dropout(0.5),
    # One output unit per vocabulary character.
    layers.Dense(vocab_size, activation="softmax"),
]
char_model = models.Sequential(char_layers)
char_model.summary()

I0000 00:00:1755294903.169710      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [9]:
# Training configuration: Adam optimizer, categorical cross-entropy loss,
# and categorical accuracy as the reported metric.
adam_optimizer = optimizers.Adam(learning_rate=0.001)
crossentropy_loss = losses.CategoricalCrossentropy()
accuracy_metric = metrics.CategoricalAccuracy()

char_model.compile(
    optimizer=adam_optimizer,
    loss=crossentropy_loss,
    metrics=[accuracy_metric],
)

In [10]:
# Train on the encoded windows; validation_split=0.2 asks Keras to hold out
# 20% of the samples for validation. The returned history object is kept in
# `char_history`.
training_options = {
    "batch_size": 128,
    "epochs": 15,
    "validation_split": 0.2,
}
char_history = char_model.fit(x=x, y=y, **training_options)

Epoch 1/15


I0000 00:00:1755294909.718562      67 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m2929/2929[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 11ms/step - categorical_accuracy: 0.2277 - loss: 2.7002 - val_categorical_accuracy: 0.3886 - val_loss: 2.0625
Epoch 2/15
[1m2929/2929[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 11ms/step - categorical_accuracy: 0.4269 - loss: 1.9474 - val_categorical_accuracy: 0.4683 - val_loss: 1.8010
Epoch 3/15
[1m2929/2929[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 11ms/step - categorical_accuracy: 0.4904 - loss: 1.7396 - val_categorical_accuracy: 0.4997 - val_loss: 1.6793
Epoch 4/15
[1m2929/2929[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 11ms/step - categorical_accuracy: 0.5208 - loss: 1.6263 - val_categorical_accuracy: 0.5201 - val_loss: 1.6098
Epoch 5/15
[1m2929/2929[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 11ms/step - categorical_accuracy: 0.5368 - loss: 1.5627 - val_categorical_accuracy: 0.5297 - val_loss: 1.5686
Epoch 6/15
[1m2929/2929[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [11]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Each probability p is re-weighted to p ** (1 / temperature) and the
    result is renormalized before a single multinomial draw. Temperatures
    below 1 sharpen the distribution, temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Scaling factor. A value of 0 or less selects the most
            probable index deterministically (greedy decoding) instead of
            dividing by zero.

    Returns:
        The sampled index as a NumPy integer.
    """
    preds = np.asarray(preds).astype("float64")

    if temperature <= 0:
        # Limit of temperature -> 0: all probability mass moves to the arg-max.
        return np.argmax(preds)

    # log(0) == -inf is the intended value for zero-probability entries
    # (they stay at probability zero), so the divide warning is just noise.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature

    # Shift so the largest logit is 0 before exponentiating. The shift cancels
    # in the normalization but stops every term underflowing to 0 (and the
    # resulting 0/0) when the temperature is very small.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_text(model, start_string, num_chars_to_generate, maxlen, chars, char_indices, indices_char, temperature=1.0):
    """Extend a seed string one sampled character at a time.

    At every step the last `maxlen` characters are one-hot encoded, passed to
    `model.predict`, and the next character is drawn from the returned
    distribution with `sample`.

    Args:
        model: Object whose `predict(x, verbose=0)` returns a batch of
            next-character distributions for a (1, maxlen, len(chars)) input.
        start_string: Seed text. Every character in the context window must
            be a key of `char_indices`, otherwise a KeyError is raised.
        num_chars_to_generate: Number of characters to append.
        maxlen: Length of the model's input window.
        chars: The vocabulary; only its length is used here.
        char_indices: Mapping from character to one-hot position.
        indices_char: Mapping from one-hot position back to character.
        temperature: Sampling temperature forwarded to `sample`.

    Returns:
        `start_string` followed by the generated characters.
    """
    generated_text = start_string

    # The model input is exactly `maxlen` characters wide. A seed longer than
    # that would index past the end of x_pred, so only its tail is used as
    # context; a shorter seed is left-padded with spaces.
    if maxlen > 0:
        sentence = start_string[-maxlen:].rjust(maxlen)
    else:
        sentence = ""

    for _ in range(num_chars_to_generate):
        # One-hot encode the current context window.
        x_pred = np.zeros((1, maxlen, len(chars)), dtype=np.float32)
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = indices_char[next_index]

        generated_text += next_char
        # Slide the window: drop the oldest character, append the new one.
        sentence = sentence[1:] + next_char

    return generated_text


# Generate 100 characters from the same seed at several sampling temperatures,
# printing each result under a header that names the temperature.
generation_kwargs = dict(
    maxlen=maxlen,
    chars=chars,
    char_indices=char_indices,
    indices_char=indices_char,
)
for temp in (0.2, 0.5, 0.6, 0.8, 1.0, 1.2):
    print(f'Temp:{temp}')
    text = generate_text(char_model, "united states", 100, temperature=temp, **generation_kwargs)
    print(text)

Temp:0.2
united states republican party seeks to senate panel trump to state trump to seek republican senator considers to
Temp:0.5
united states congress says presidential report to u.s. house speaker ryan as attack adviser warns trump transgen
Temp:0.6
united states of meath manafort says new york urges u.s. senate confirms from host says trump calls to leave in r
Temp:0.8
united states: manafort's daare justice department about state to russia's capital case vote in saudi says with c
Temp:1.0
united states good fuil: saudi cartierd nypory april grout of tax regal pateride sanders macrans decenble takes b
Temp:1.2
united states state for trecitune' trump, rejoint necanaa honoutwan enerminate than sody pence to help cort per. 


### Hey everybody, thanks for reading my notebook! This is my first attempt at anything NLP, and I had a really cool time learning about different methods to perform text generation. However, if anybody knows how to perform text generation using Keras or TensorFlow better than this — maybe using a different dataset type or model architecture — please do comment down below. I really do appreciate it!