News Article Classification - Inference Script
==============================================

This .py script loads a pre-trained Bidirectional LSTM (Bi-LSTM) model with GloVe embeddings
to classify news articles into one of four categories: World, Sports, Business, or Sci/Tech.

Key Features:
-------------
- Uses a cleaned and tokenized AG News dataset vocabulary.
- Reconstructs the tokenizer on the same training data (AG News).
- Preprocesses input text with lemmatization and stopword removal.
- Runs predictions using the trained model (no retraining required).
- Provides a professional and interactive Gradio web interface.

Usage:
------
- Run the script to launch a web app.
- Paste a news article into the input box.
- Click "Submit" to see the predicted category, its confidence, and the probabilities for all four categories.

Dependencies:
-------------
- TensorFlow / Keras
- NumPy and pandas
- Hugging Face `datasets`
- NLTK (with the `stopwords` and `wordnet` corpora)
- Gradio

In [None]:
import numpy as np
import re
from datasets import load_dataset
import pandas as pd

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [None]:
# Load the pre-trained classifier from disk. Nothing is trained in this script;
# `model` is the object the prediction functions call `.predict()` on.
# NOTE(review): the path is hard-coded and looks like a Kaggle input mount —
# adjust it when running anywhere else.
model_path = '/kaggle/input/best-agnews-bi-lstm-glove-model-keras/best_agnews_bi_lstm_glove_model.keras'
model = load_model(model_path)

In [None]:
# Load the AG News dataset. Only its training split is used here, to rebuild
# the tokenizer vocabulary; the model itself is not retrained.
dataset = load_dataset("ag_news")
train_df = dataset['train'].to_pandas()

# Extract the raw (uncleaned) article text column
X_train_raw = train_df['text']

# Recreate the Tokenizer and fit it on the training text so the word -> index
# mapping is available at inference time.
# NOTE(review): the tokenizer is fitted on the RAW text here, while inference
# input is passed through preprocess_text() before texts_to_sequences().
# Confirm that training fitted the tokenizer on the same form of the text
# (raw vs. cleaned); otherwise the indices fed to the model will not match the
# ones it was trained with.
tokenizer = Tokenizer(num_words=20000, oov_token="<unk>")  # presumably the same parameters as during training — verify
tokenizer.fit_on_texts(X_train_raw)  # Builds the vocabulary from the training split

In [None]:
# Preprocessing Function

def _ensure_nltk_resource(resource_path, package):
    """Download an NLTK data package only if it is not already installed.

    Args:
        resource_path: Resource path handed to ``nltk.data.find``
            (e.g. ``'corpora/stopwords'``).
        package: Package identifier handed to ``nltk.download``
            (e.g. ``'stopwords'``).
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Only reached when the resource is missing, so environments that
        # already ship the corpora never touch the network.
        nltk.download(package, quiet=True)


# The stopword list and the WordNet lemmatizer both need corpora that are not
# bundled with the nltk package itself; without them the lines below (or the
# first lemmatize() call) fail with LookupError.
_ensure_nltk_resource('corpora/stopwords', 'stopwords')
_ensure_nltk_resource('corpora/wordnet', 'wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Compiled once at import time; the patterns are the same ones applied to every
# article.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def preprocess_text(text):
    """Clean a raw news article into a space-separated string of lemmas.

    Steps, in order: lowercase; strip HTML tags; strip URLs; drop every
    character that is not a lowercase letter or whitespace; split on
    whitespace; discard English stopwords and single-character tokens;
    lemmatize what remains.

    Args:
        text: The raw article text (a ``str``).

    Returns:
        The cleaned text with tokens joined by single spaces. This is an empty
        string when no token survives the filtering.
    """
    text = text.lower()
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    # Characters are deleted rather than replaced by a space, so hyphenated
    # words are fused ("last-minute" -> "lastminute"). Kept as-is because the
    # cleaning presumably has to match what the model was trained on.
    text = _NON_LETTER_RE.sub('', text)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words and len(word) > 1
    )

In [None]:
# Prediction Function

# Length every tokenized article is padded/truncated to before prediction.
# NOTE(review): presumably the same value used during training — verify.
MAX_SEQUENCE_LENGTH = 200
# Model output index -> human-readable category name.
LABEL_MAP = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}


def predict_news_category(news_text):
    """Classify a single news article with the pre-trained model.

    Args:
        news_text: Raw article text.

    Returns:
        A ``(predicted_category, confidence)`` tuple: the ``LABEL_MAP`` name of
        the highest-probability class and that probability as a Python float.
    """
    # 1. Clean the raw text the same way for every input
    processed_text = preprocess_text(news_text)

    # 2. Map words to integer indices with the fitted tokenizer
    sequence_text = tokenizer.texts_to_sequences([processed_text])

    # 3. Pad/truncate at the end of the sequence to the fixed model input length
    padded_text = pad_sequences(sequence_text, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

    # 4. Predict class probabilities; verbose=0 keeps the Keras progress bar
    #    out of the caller's output, matching classify_news
    prediction_probs = model.predict(padded_text, verbose=0)[0]

    # 5. Pick the most probable class; convert NumPy scalars to native types so
    #    callers receive a plain int-indexed label and a plain float
    predicted_class_idx = int(np.argmax(prediction_probs))
    confidence = float(prediction_probs[predicted_class_idx])

    return LABEL_MAP[predicted_class_idx], confidence

In [None]:
# Example usage: classify a few hand-written articles and print the results.
# The trailing comment on each article is the category a reader would expect.
new_articles_for_prediction = [
    "Scientists announce breakthrough in quantum computing, promising faster processors.",  # Sci/Tech
    "New research shows promising results for a vaccine against a rare disease.",  # Sci/Tech
    "Messi scored a last-minute goal to win the FIFA World Cup for Argentina. The stadium erupted in cheers as fans celebrated the historic victory in Qatar.",  # Sports
    "Political tensions rise in the Middle East after recent border skirmishes.",  # World
]

# Run each article through the model and report one block of output per article
for i, article in enumerate(new_articles_for_prediction):
    category, confidence = predict_news_category(article)
    print(
        f"\n--- Article {i+1} ---",
        f"Text: '{article}'",
        f"Predicted Category: {category} (Confidence: {confidence:.2f})",
        sep="\n",
    )

In [None]:
pip install gradio

In [None]:
import gradio as gr

# Prediction Function for Gradio 

def _classification_error(message):
    """Build the result tuple shown in the UI when no prediction is made.

    The confidence slot uses the same two-decimal string format as a
    successful prediction, and the probabilities slot is an empty dict.
    """
    return message, "0.00", {}


def classify_news(news_text):
    """Predict the category of a news article for the Gradio interface.

    Args:
        news_text: Raw article text from the input box.

    Returns:
        A 3-tuple ``(category_or_message, confidence, probabilities)``:
        the predicted ``LABEL_MAP`` name (or a message when no prediction is
        made), the confidence formatted as a two-decimal string, and a dict
        mapping every category name to its probability as a float (empty when
        no prediction is made).
    """
    # Reject empty input up front: it would otherwise reach the model as an
    # all-padding sequence and be reported as a confident prediction.
    if not news_text or not news_text.strip():
        return _classification_error("Please enter a news article to classify.")

    # Defensive checks: every module-level object used below must exist,
    # e.g. when cells were run out of order.
    if 'model' not in globals() or model is None:
        return _classification_error("Error: Model not loaded. Please ensure training completed successfully.")
    if 'tokenizer' not in globals() or tokenizer is None:
        return _classification_error("Error: Tokenizer not available. Please ensure it was fitted.")
    if 'MAX_SEQUENCE_LENGTH' not in globals():
        return _classification_error("Error: MAX_SEQUENCE_LENGTH not defined.")
    if 'LABEL_MAP' not in globals():
        return _classification_error("Error: LABEL_MAP not defined.")
    if 'preprocess_text' not in globals():
        return _classification_error("Error: preprocess_text function not defined.")

    # 1. Clean the raw text
    processed_text = preprocess_text(news_text)
    if not processed_text:
        # Everything was filtered out (stopwords, digits, punctuation only)
        return _classification_error("No classifiable words found in the input text.")

    # 2. Map words to integer indices with the fitted tokenizer
    sequence_text = tokenizer.texts_to_sequences([processed_text])

    # 3. Pad/truncate at the end of the sequence to the fixed model input length
    padded_text = pad_sequences(sequence_text, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

    # 4. Predict class probabilities; verbose=0 hides the Keras progress bar
    prediction_probs = model.predict(padded_text, verbose=0)[0]

    # 5. Pick the most probable class, using native Python types
    predicted_class_idx = int(np.argmax(prediction_probs))
    confidence = float(prediction_probs[predicted_class_idx])

    # 6. Probabilities for all categories, in the dict form gr.Label displays
    all_probs_dict = {LABEL_MAP[i]: float(prob) for i, prob in enumerate(prediction_probs)}

    # Confidence is returned as a string because its output component is a Textbox
    return LABEL_MAP[predicted_class_idx], f"{confidence:.2f}", all_probs_dict

# --- Build and launch the Gradio interface ---
# The app is only started when both the model and the tokenizer are available.
if model is None or tokenizer is None:
    print("\nGradio app cannot be launched because the model or tokenizer failed to load (check previous cells).")
else:
    print("\nBuilding Gradio Interface...")

    # Input: one multi-line text box for the article
    news_input = gr.Textbox(lines=5, placeholder="Enter a news article here...", label="News Article Text")

    # Outputs, in the order classify_news returns them
    result_outputs = [
        gr.Textbox(label="Predicted Category"),
        gr.Textbox(label="Confidence"),
        gr.Label(label="All Category Probabilities"),
    ]

    # Clickable sample articles; each example is a one-element list because the
    # interface has a single input
    example_articles = [
        [sample]
        for sample in (
            "Global leaders meet to discuss climate change initiatives and renewable energy policies.",
            "Messi scores hat-trick as Barcelona wins the Champions League final.",
            "Tech giants report record quarterly earnings driven by AI investments and cloud services.",
            "Scientists announce breakthrough in quantum computing, promising faster processors.",
            "The stock market experienced a sharp decline following unexpected inflation data.",
        )
    ]

    iface = gr.Interface(
        fn=classify_news,
        inputs=news_input,
        outputs=result_outputs,
        title="News Article Classifier (Bi-LSTM + GloVe)",
        description=(
            "Enter a news article and let the Bidirectional LSTM model classify it into one of four categories: "
            "World, Sports, Business, or Sci/Tech. The model uses GloVe pre-trained word embeddings."
        ),
        examples=example_articles,
    )

    print("\nLaunching Gradio App...")
    # share=True asks Gradio for a public link in addition to the local one
    iface.launch(share=True)
    print("Gradio app launched. Look for the public URL above.")