In [29]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, RNN, GRUCell, Dense, TimeDistributed, AdditiveAttention, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [30]:
df = pd.read_csv("CNN_Articles.csv", index_col=0)


In [31]:
df = df[['text','headline']].dropna()
df.rename(columns={'text':'article'}, inplace=True)

def clean_text_keep_symbols(text):
    text = str(text).lower()
    text = text.replace('\n',' ')
    text = ' '.join(text.split())
    return text

df['article'] = df['article'].apply(clean_text_keep_symbols)
df['headline'] = df['headline'].apply(clean_text_keep_symbols)

In [32]:
MAX_ARTICLE_LEN = 400
MAX_HEADLINE_LEN = 20

train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

article_tokenizer = Tokenizer(oov_token="<OOV>")
article_tokenizer.fit_on_texts(train_df['article'])
headline_tokenizer = Tokenizer(oov_token="<OOV>")
headline_tokenizer.fit_on_texts(train_df['headline'])

In [33]:
X_train = pad_sequences(article_tokenizer.texts_to_sequences(train_df['article']),
                        maxlen=MAX_ARTICLE_LEN, padding='post')
X_val = pad_sequences(article_tokenizer.texts_to_sequences(val_df['article']),
                      maxlen=MAX_ARTICLE_LEN, padding='post')

y_train = pad_sequences(headline_tokenizer.texts_to_sequences(train_df['headline']),
                        maxlen=MAX_HEADLINE_LEN, padding='post')
y_val = pad_sequences(headline_tokenizer.texts_to_sequences(val_df['headline']),
                      maxlen=MAX_HEADLINE_LEN, padding='post')

In [34]:
y_train_in = np.zeros_like(y_train)
y_train_in[:,1:] = y_train[:,:-1]
y_train_in[:,0] = headline_tokenizer.word_index['<OOV>']

y_val_in = np.zeros_like(y_val)
y_val_in[:,1:] = y_val[:,:-1]
y_val_in[:,0] = headline_tokenizer.word_index['<OOV>']

In [35]:
embedding_dim = 256
hidden_units = 256
article_vocab_size = len(article_tokenizer.word_index)+1
headline_vocab_size = len(headline_tokenizer.word_index)+1

In [36]:
encoder_inputs = Input(shape=(MAX_ARTICLE_LEN,))
enc_emb = Embedding(article_vocab_size, embedding_dim)(encoder_inputs)

encoder_rnn = RNN(GRUCell(hidden_units), return_sequences=True, return_state=True)
encoder_outputs, state_h = encoder_rnn(enc_emb)

In [37]:
decoder_inputs = Input(shape=(MAX_HEADLINE_LEN,))
dec_emb = Embedding(headline_vocab_size, embedding_dim)(decoder_inputs)

decoder_rnn = RNN(GRUCell(hidden_units), return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_rnn(dec_emb, initial_state=state_h)

In [38]:
attention = AdditiveAttention()([decoder_outputs, encoder_outputs])
decoder_combined_context = Concatenate()([decoder_outputs, attention])

In [39]:
outputs = TimeDistributed(Dense(headline_vocab_size, activation='softmax'))(decoder_combined_context)


In [40]:
model = Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer=Adam(1e-3),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

In [41]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True,verbose=1)


In [42]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

checkpoint = ModelCheckpoint(
    filepath='headline_model_best.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)
history = model.fit(
    [X_train, y_train_in],
    y_train[..., np.newaxis],
    validation_data=([X_val, y_val_in], y_val[..., np.newaxis]),
    epochs=10,
    batch_size=64,
    callbacks=[early_stop, checkpoint]
)

Epoch 1/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - accuracy: 0.3416 - loss: 6.2688
Epoch 1: val_loss improved from inf to 4.89961, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 234ms/step - accuracy: 0.3418 - loss: 6.2615 - val_accuracy: 0.3933 - val_loss: 4.8996
Epoch 2/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.3924 - loss: 4.7716
Epoch 2: val_loss improved from 4.89961 to 4.75575, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 184ms/step - accuracy: 0.3924 - loss: 4.7715 - val_accuracy: 0.3957 - val_loss: 4.7557
Epoch 3/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 0.4039 - loss: 4.5323
Epoch 3: val_loss improved from 4.75575 to 4.50288, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 176ms/step - accuracy: 0.4039 - loss: 4.5322 - val_accuracy: 0.4059 - val_loss: 4.5029
Epoch 4/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.4128 - loss: 4.1720
Epoch 4: val_loss improved from 4.50288 to 4.19236, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 141ms/step - accuracy: 0.4129 - loss: 4.1715 - val_accuracy: 0.4181 - val_loss: 4.1924
Epoch 5/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 0.4328 - loss: 3.6939
Epoch 5: val_loss improved from 4.19236 to 3.85704, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 137ms/step - accuracy: 0.4329 - loss: 3.6936 - val_accuracy: 0.4397 - val_loss: 3.8570
Epoch 6/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.4719 - loss: 3.1758
Epoch 6: val_loss improved from 3.85704 to 3.49503, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 138ms/step - accuracy: 0.4720 - loss: 3.1754 - val_accuracy: 0.4696 - val_loss: 3.4950
Epoch 7/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.5330 - loss: 2.6466
Epoch 7: val_loss improved from 3.49503 to 3.15371, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 135ms/step - accuracy: 0.5330 - loss: 2.6463 - val_accuracy: 0.5080 - val_loss: 3.1537
Epoch 8/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.6067 - loss: 2.1726
Epoch 8: val_loss improved from 3.15371 to 2.84016, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 136ms/step - accuracy: 0.6067 - loss: 2.1723 - val_accuracy: 0.5526 - val_loss: 2.8402
Epoch 9/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.6775 - loss: 1.7544
Epoch 9: val_loss improved from 2.84016 to 2.55938, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 137ms/step - accuracy: 0.6775 - loss: 1.7542 - val_accuracy: 0.6003 - val_loss: 2.5594
Epoch 10/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.7433 - loss: 1.3988
Epoch 10: val_loss improved from 2.55938 to 2.31198, saving model to headline_model_best.h5




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 136ms/step - accuracy: 0.7434 - loss: 1.3987 - val_accuracy: 0.6419 - val_loss: 2.3120


In [43]:
import pickle
import os

os.makedirs("models", exist_ok=True)

model.save("models/headline_model_best.h5")

with open("models/article_tokenizer.pkl", "wb") as f:
    pickle.dump(article_tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

with open("models/headline_tokenizer.pkl", "wb") as f:
    pickle.dump(headline_tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

print("✅ Model and tokenizers saved successfully!")




✅ Model and tokenizers saved successfully!
