In [None]:
# Install required packages
print("Installing packages...")
!pip install newsapi-python -q
print("Packages installed.")

# Import libraries
import numpy as np
import os
import re
from multiprocessing import Pool, cpu_count
import os

import re
import nltk
import zipfile
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
# print("Checking NLTK wordnet resource...")
# wordnet_zip = '/kaggle/working/nltk_data/corpora/wordnet.zip'
# if os.path.exists(wordnet_zip):
#     print(f"Unzipping wordnet from {wordnet_zip}...")
#     with zipfile.ZipFile(wordnet_zip, 'r') as zip_ref:
#         zip_ref.extractall('/kaggle/working/nltk_data/corpora')
#     os.remove(wordnet_zip)
#     print("Wordnet unzipped and zip file removed.")
# else:
#     print("Wordnet already unzipped or not downloaded as zip.")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import custom_object_scope
import pickle
from newsapi import NewsApiClient
import requests
from bs4 import BeautifulSoup
import string

# Define a minimal Cast layer as a fallback (only if needed)
class Cast(tf.keras.layers.Layer):
    """Keras layer that casts its inputs to the layer's compute dtype.

    The layer holds no weights or extra configuration; it exists so that a
    layer named ``Cast`` can be resolved when it is supplied as a custom
    object during model loading.
    """

    def __init__(self, **kwargs):
        # Nothing layer-specific to store; forward everything to the base Layer.
        super().__init__(**kwargs)

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from the dict produced by ``get_config``."""
        return cls(**config)

    def get_config(self):
        """Return the base layer configuration unchanged."""
        base_config = super().get_config()
        return base_config

    def call(self, inputs):
        """Cast ``inputs`` to the compute dtype of this layer's dtype policy."""
        target_dtype = self.dtype_policy.compute_dtype
        return tf.cast(inputs, dtype=target_dtype)

# Robust file checking and loading
def load_file(file_path, load_func, desc):
    """Load an artifact from disk, returning ``None`` instead of raising.

    Args:
        file_path: Path that is checked for existence and then handed to
            ``load_func``.
        load_func: Callable that takes the path and returns the loaded object.
        desc: Human-readable label used in the printed diagnostics.

    Returns:
        The value returned by ``load_func``, or ``None`` when the file does
        not exist or ``load_func`` raises an ``Exception`` (a message is
        printed in both failure cases).
    """
    # Guard clause: a missing file is reported and short-circuits the load.
    if not os.path.exists(file_path):
        print(f"{desc} file not found at {file_path}.")
        return None

    try:
        loaded = load_func(file_path)
    except Exception as e:
        print(f"Error loading {desc} from {file_path}: {e}")
        return None
    return loaded

# Load model and tools with error handling
print("Loading model and tools...")
model_path = 'output/lstm_model.h5'
tokenizer_path = 'output/tokenizer.pkl'
scaler_path = 'output/scaler.pkl'


def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file afterwards."""
    # NOTE(security): unpickling can execute arbitrary code. Only load
    # artifacts that were produced by a trusted training run.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# 'Cast' is registered as a custom object in case the saved model refers to
# a layer of that name during deserialization.
with custom_object_scope({'Cast': Cast}):
    model = load_file(model_path, tf.keras.models.load_model, "Model")
tokenizer = load_file(tokenizer_path, _load_pickle, "Tokenizer")
scaler = load_file(scaler_path, _load_pickle, "Scaler")

# load_file returns None on any failure, so a single None means the pipeline
# cannot run; stop here instead of failing later inside prediction.
if model is None or tokenizer is None or scaler is None:
    print("Critical files missing or corrupted. Please ensure training script ran successfully.")
    raise SystemExit(1)
print("Model, tokenizer, and scaler loaded successfully.")

# Constants
MAX_SEQUENCE_LENGTH = 100  # Length every token sequence is padded/truncated to
NUM_FEATURES = 10  # Size of the vector built by extract_numerical_features
# NOTE(security): a credential committed to source should be treated as
# leaked and rotated. Set the NEWSAPI_KEY environment variable to supply your
# own key; the literal below is kept only as a backward-compatible fallback.
API_KEY = os.environ.get('NEWSAPI_KEY', '20a033afa85e4b72af903562634d7f6d')
# 'not' is removed from the stop-word set so it survives preprocessing.
stop_words = set(stopwords.words('english')) - {'not'}
lemmatizer = WordNetLemmatizer()
sia = nltk.sentiment.vader.SentimentIntensityAnalyzer()

# Preprocessing and feature extraction with fallbacks
def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is replaced by a space, stop words are dropped (the token
    ``'not'`` is always kept) and the remaining tokens are lemmatized.

    Args:
        text: Raw input; anything that is not a non-blank ``str`` yields "".

    Returns:
        The cleaned text, or "" for blank/non-string input or when any step
        raises (the error is printed).
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return ""

    try:
        letters_only = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
        kept = []
        for token in letters_only.split():
            if token == 'not' or token not in stop_words:
                kept.append(lemmatizer.lemmatize(token))
        return " ".join(kept)
    except Exception as e:
        print(f"Error preprocessing text: {e}")
        return ""

def extract_numerical_features(text):
    """Build the numeric feature vector for one piece of text.

    The first 50 characters of ``text`` stand in for a title. The vector is,
    in order: title word count, text word count, title length, text length,
    title upper-case ratio, text upper-case ratio, title punctuation count,
    text punctuation count, title VADER compound score, text VADER compound
    score.

    Args:
        text: Raw text; non-string input yields an all-zero vector.

    Returns:
        A NumPy array of the ten features, or ``np.zeros(NUM_FEATURES)`` for
        non-string input or when extraction raises (the error is printed).
    """
    if not isinstance(text, str):
        return np.zeros(NUM_FEATURES)

    try:
        title = text[:50]  # Rough title approximation
        title_len = len(title)
        text_len = len(text)

        title_upper = sum(map(str.isupper, title))
        text_upper = sum(map(str.isupper, text))
        # Ratios fall back to 0 for empty strings to avoid dividing by zero.
        title_upper_ratio = title_upper / title_len if title_len > 0 else 0
        text_upper_ratio = text_upper / text_len if text_len > 0 else 0

        punctuation = string.punctuation
        title_punct = sum(1 for ch in title if ch in punctuation)
        text_punct = sum(1 for ch in text if ch in punctuation)

        title_sentiment = sia.polarity_scores(title)['compound']
        text_sentiment = sia.polarity_scores(text)['compound']

        features = [
            len(title.split()),
            len(text.split()),
            title_len,
            text_len,
            title_upper_ratio,
            text_upper_ratio,
            title_punct,
            text_punct,
            title_sentiment,
            text_sentiment,
        ]
        return np.array(features)
    except Exception as e:
        print(f"Error extracting features: {e}")
        return np.zeros(NUM_FEATURES)

# Batch prediction with GPU and error handling
def predict_batch(texts):
    """Classify a batch of raw texts with the loaded model.

    Each text is preprocessed, tokenized and padded to ``MAX_SEQUENCE_LENGTH``;
    numeric features are extracted from the raw texts and passed through the
    scaler. Both inputs are fed to the model and the outputs are thresholded
    at 0.5.

    Args:
        texts: Sequence of raw text strings.

    Returns:
        A flat integer NumPy array with one 0/1 label per text. If any step
        raises, the error is printed and an all-zero array of length
        ``len(texts)`` is returned (label 0 is reported as 'Fake' by callers).
    """
    try:
        cleaned = list(map(preprocess_text, texts))
        token_ids = tokenizer.texts_to_sequences(cleaned)
        padded = pad_sequences(
            token_ids,
            maxlen=MAX_SEQUENCE_LENGTH,
            padding='post',
            truncating='post',
        )
        raw_features = np.array([extract_numerical_features(t) for t in texts])
        scaled_features = scaler.transform(raw_features)
        with tf.device('/GPU:0'):
            probabilities = model.predict(
                [padded, scaled_features], batch_size=256, verbose=0
            )
        labels = (probabilities >= 0.5).astype(int)
        return labels.flatten()
    except Exception as e:
        print(f"Prediction error: {e}")
        return np.zeros(len(texts), dtype=int)  # Fallback to all 'Fake' if prediction fails

# Fetch news with robust error handling
def fetch_news(topic):
    """Fetch articles about ``topic`` from NewsAPI and scrape their body text.

    Up to 10 English results are requested. For each result the article URL
    is downloaded (5 second timeout) and the text of all ``<p>`` elements is
    joined with spaces; the article title, when present, is prepended.

    Args:
        topic: Search query passed to NewsAPI.

    Returns:
        A list of "title text" strings, one per article whose page could be
        downloaded successfully and contained paragraph text. An empty list
        is returned when the NewsAPI call (or any unexpected step) fails.
    """
    newsapi = NewsApiClient(api_key=API_KEY)
    try:
        response = newsapi.get_everything(q=topic, language='en', page_size=10)
        articles = []
        for article in response.get('articles') or []:
            url = article.get('url')
            if not url:
                # One entry without a URL should not abort the whole fetch.
                continue
            try:
                resp = requests.get(url, timeout=5)
                # Reject 4xx/5xx responses so error pages are not classified
                # as if they were article text. HTTPError is a
                # RequestException, so it is handled just below.
                resp.raise_for_status()
            except requests.RequestException as e:
                print(f"Failed to fetch article {article.get('url', 'unknown')}: {e}")
                continue
            soup = BeautifulSoup(resp.text, 'html.parser')
            text = " ".join(p.get_text() for p in soup.find_all('p'))
            if text.strip():
                # A missing/None title must not become the literal "None".
                title = article.get('title')
                articles.append(f"{title} {text}" if title else text)
        print(f"Fetched {len(articles)} articles for '{topic}'.")
        return articles
    except Exception as e:
        print(f"Error fetching news: {e}")
        return []

# Interactive workflow with robustness
while True:
    try:
        # Keep the text exactly as typed: the numeric features include
        # upper-case ratios, so lower-casing the input before classification
        # would zero them out (fetched news articles are also classified with
        # their original casing). Only the command comparison ignores case.
        choice = input("Enter 'news' for news fetch or text to classify (or 'exit'): ").strip()
        command = choice.lower()
        if command == 'exit':
            break
        if not choice:
            # Nothing was entered; there is no text to classify.
            print("No input provided. Please enter text, 'news', or 'exit'.")
            continue
        if command == 'news':
            topic = input("Enter news topic (e.g., Chandrayaan): ").strip()
            if topic:
                articles = fetch_news(topic)
                if articles:
                    labels = predict_batch(articles)
                    for text, label in zip(articles, labels):
                        print(f"Text: '{text[:50]}...' -> Predicted: {'Real' if label else 'Fake'}")
                else:
                    print("No articles fetched. Try another topic or check API key.")
        else:
            labels = predict_batch([choice])
            print(f"Input: '{choice[:50]}...' -> Predicted: {'Real' if labels[0] else 'Fake'}")
    except (KeyboardInterrupt, EOFError):
        # EOFError is raised by input() when stdin is closed/non-interactive;
        # without this it would be caught below and the loop would spin forever.
        print("\nExiting gracefully...")
        break
    except Exception as e:
        print(f"Workflow error: {e}. Continuing...")

print("Program terminated.")

Installing packages...



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\SJ\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Packages installed.


[nltk_data] Downloading package stopwords to C:\Users\SJ/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\SJ/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\SJ/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Loading model and tools...



https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Model, tokenizer, and scaler loaded successfully.
Input: 'the washington post tried to compare sen. elizabet...' -> Predicted: Fake
Input: 'the washington post tried to compare sen. elizabet...' -> Predicted: Fake
Input: 'in preparation for the scheduled 2018 launch of th...' -> Predicted: Fake
Input: 'posted on october 30, 2016 by sean adl-tabatabai i...' -> Predicted: Real
Input: '...' -> Predicted: Real
Input: 'donald trump...' -> Predicted: Real
Fetched 10 articles for 'donald trump'.
Text: 'The United States of Elon Musk Inc. Where do Elon ...' -> Predicted: Fake
Text: 'Donald Trump wants to delete ‘climate’ from federa...' -> Predicted: Fake
Text: 'Fired Democratic FTC commissioners sue Trump ﻿Kell...' -> Predicted: Fake
Text: 'Donald Trump Bought a $90,000 Tesla With 37 Recall...' -> Predicted: Fake
Text: 'The US Solar Power Industry Is Trying to Rebrand a...' -> Predicted: Fake
Text: 'People Are Paying Millions to Dine With Donald Tru...' -> Predicted: Fake
Text: 'Donald Trump 