In [1]:
import numpy as np
import zipfile
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam


In [2]:
!mkdir ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d adityakharosekar2/guardian-news-articles

mkdir: cannot create directory ‘/root/.kaggle’: File exists
guardian-news-articles.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# Specify the path to the zip file
zip_file_path = '/content/guardian-news-articles.zip'

# Open the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all the files in the zip file to a folder
    zip_ref.extractall('/content')

In [4]:
# Load and preprocess data
data = pd.read_csv('/content/guardian_articles.csv')

In [5]:
# Identify the two most frequent genre/section types in the dataset
section_counts = data['sectionName'].value_counts()
top_2_sections = section_counts.nlargest(2).index.tolist()
print("Two most frequent genres: ", top_2_sections)

Two most frequent genres:  ['World news', 'Opinion']


In [6]:
data

Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id
0,us-news/2016/jan/31/iowa-caucus-underdog-candi...,US news,Iowa underdogs put on brave faces despite all ...,https://www.theguardian.com/us-news/2016/jan/3...,As polling day looms and the cameras turn only...,2016-01-31T23:53:37Z,1
1,us-news/2016/jan/31/iowa-caucus-worlds-most-pa...,US news,Iowa caucus: hologram eagle and Jesus star on ...,https://www.theguardian.com/us-news/2016/jan/3...,"In Des Moines on Sunday, the Guardian was give...",2016-01-31T23:46:28Z,2
2,world/2016/jan/31/tanzania-britsh-helicopter-p...,World news,British pilot in Tanzania 'manoeuvred ​to save...,https://www.theguardian.com/world/2016/jan/31/...,A British pilot who was shot dead by an elepha...,2016-01-31T23:43:48Z,3
3,football/2016/jan/31/late-winner-gets-usa-off-...,Football,USA 3-2 Iceland | International friendly match...,https://www.theguardian.com/football/2016/jan/...,USA took a step toward shaking off the ghosts ...,2016-01-31T23:30:49Z,4
4,football/2016/jan/31/blackburn-paul-lambert-ox...,Football,Reinvigorated Paul Lambert reflects after impr...,https://www.theguardian.com/football/2016/jan/...,"The clean-shaven, spectacle free and suspiciou...",2016-01-31T22:30:10Z,5
...,...,...,...,...,...,...,...
149834,world/2022/jun/21/marble-head-of-hercules-pull...,World news,Marble head of Hercules pulled up from Roman s...,https://www.theguardian.com/world/2022/jun/21/...,"For archaeologists, it’s the underwater find t...",2022-06-21T17:31:32Z,149835
149835,music/2022/jun/22/i-got-sick-of-talking-about-...,Music,‘I got sick of talking about myself’: Spacey J...,https://www.theguardian.com/music/2022/jun/22/...,"From under a mop of curls, Caleb Harper – Spac...",2022-06-21T17:30:09Z,149836
149836,australia-news/2022/jun/22/the-small-town-with...,Australia news,The small town with a big potato that inspired...,https://www.theguardian.com/australia-news/202...,"Robertson is a small, pretty town perched on t...",2022-06-21T17:30:09Z,149837
149837,australia-news/2022/jun/22/power-to-ban-citize...,Australia news,Power to ban citizens from re-entering Austral...,https://www.theguardian.com/australia-news/202...,A high court decision striking down the home a...,2022-06-21T17:30:08Z,149838


In [7]:
data = data[data['sectionName'].isin(['World news', 'Opinion'])]
data = data.dropna()

In [8]:
data = data.sample(frac=0.15, random_state=42)

In [21]:
n_words = len(data['bodyContent'])
unique_words = len(set(data['bodyContent']))

print('Total Words: %d' % n_words)
print('Unique Words: %d' % unique_words)

Total Words: 4116
Unique Words: 4116


In [None]:
vocab_size = len(tokenizer.word_index) + 1
word_2_index = tokenizer.word_index

In [9]:
# Create a character-level tokenizer
tokenizer = Tokenizer(char_level=True, oov_token="<OOV>")
tokenizer.fit_on_texts(data['bodyContent'])

In [10]:
input_sequences = []

max_sequences_per_line = 50
for line in data['bodyContent']:
    token_list = tokenizer.texts_to_sequences([line])[0]
    sequences_added = 0
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
        sequences_added += 1
        if sequences_added >= max_sequences_per_line:
            break


In [11]:
# Pad sequences
max_sequence_length = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre'))

In [12]:
# Create training data
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)


In [13]:
# Build the LSTM model
state_size = 128
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, state_size, input_length=max_sequence_length - 1))
model.add(LSTM(state_size, return_sequences=True))
model.add(LSTM(state_size))
model.add(Dense(len(tokenizer.word_index) + 1, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer=Adam(), metrics=["accuracy"])

In [14]:
# Train the model
epochs = 2
batch_size = 64
model.fit(X, y, epochs=epochs, batch_size=batch_size)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4fb1c90580>

In [15]:
# Generate text using the trained model
def generate_text(seed_text, model, tokenizer, max_sequence_length, num_turns):
    generated_text = seed_text

    for _ in range(num_turns):
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = tokenizer.index_word[predicted[0]]
        generated_text += output_word

    return generated_text

In [27]:
seed_text = "world news and opinion new though is:"
generated_text = generate_text(seed_text, model, tokenizer, max_sequence_length, 10)
print(generated_text)

world news and opinion new though is: the conte
