In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
# 1. Prepare Data
# Only rows missing one of the two columns read below are discarded. A bare
# dropna() would also throw away usable samples whose only gap is in a column
# this notebook never touches.
required_columns = ["text", "domain"]
df = pd.read_csv("synthetic_summarization_dataset_3000.csv").dropna(subset=required_columns)
X = df["text"].astype(str)    # model input: raw text, forced to str
y = df["domain"].astype(str)  # classification target: domain label, forced to str

In [3]:
# 2. Tokenization & Padding
max_words, max_len = 10000, 200  # vocabulary cap, max length of an article

# Words that fall outside the vocabulary cap are represented by "<OOV>".
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X)

# Turn every text into a list of word ids, then pad/truncate each list so
# that all rows end up with exactly max_len entries.
sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(sequences, maxlen=max_len)

In [4]:
# 3. Encode Labels
# Map each domain string to an integer class id. The fitted encoder stays in
# `label_encoder` so ids can later be translated back to domain names.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# One class per distinct domain value seen while fitting.
num_classes = label_encoder.classes_.shape[0]

In [6]:
# 4. Build RNN (LSTM) Model
# Seed Python, NumPy and TensorFlow so that weight initialisation, dropout
# masks and the shuffle below are repeatable after Restart & Run All.
seed = 42
tf.keras.utils.set_random_seed(seed)

model = Sequential([
    # An explicit Input layer fixes the sequence length; it replaces the
    # Embedding `input_length` argument, which Keras 3 has deprecated.
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    Embedding(max_words, 128),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')  # one probability per domain
])

# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split slices the validation set off the END of the arrays before
# Keras does any shuffling. Permute the rows first so the held-out 20% is a
# random sample rather than whatever happens to be last in the CSV.
shuffle_idx = np.random.default_rng(seed).permutation(len(X_padded))

# NOTE(review): the run recorded in this notebook stayed at ~0.10 accuracy
# with loss ~2.30 on every epoch, i.e. chance level -- confirm that `domain`
# is actually predictable from `text` in this dataset.
# Keeping the History object also stops its repr from being echoed as the
# cell output.
history = model.fit(
    X_padded[shuffle_idx],
    y_encoded[shuffle_idx],
    epochs=5,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 59ms/step - accuracy: 0.0913 - loss: 2.3036 - val_accuracy: 0.0983 - val_loss: 2.3035
Epoch 2/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.1067 - loss: 2.3032 - val_accuracy: 0.0967 - val_loss: 2.3038
Epoch 3/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - accuracy: 0.1029 - loss: 2.3033 - val_accuracy: 0.0967 - val_loss: 2.3033
Epoch 4/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.1021 - loss: 2.3019 - val_accuracy: 0.0983 - val_loss: 2.3044
Epoch 5/5
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.1021 - loss: 2.3020 - val_accuracy: 0.0967 - val_loss: 2.3053


<keras.src.callbacks.history.History at 0x1a5a3b205e0>

In [7]:
# 5. Save everything for Streamlit
# The trained network is written to an HDF5 file. The fitted tokenizer and
# label encoder are pickled together so the same preprocessing objects can be
# reloaded alongside the model.
model.save("rnn_domain_model.h5")

assets = {"tokenizer": tokenizer, "label_encoder": label_encoder}
with open("assets.pkl", "wb") as assets_file:
    pickle.dump(assets, assets_file)

