In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Install required packages
print("Installing packages...")
!pip install swifter gensim newsapi-python -q
print("Packages installed.")

# Import libraries
import numpy as np
import pandas as pd
import os
import re
from multiprocessing import Pool, cpu_count
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(f"Found input file: {os.path.join(dirname, filename)}")

import re
import nltk
import zipfile
nltk.download('stopwords')
nltk.download('wordnet', download_dir='/kaggle/working/nltk_data')
nltk.download('vader_lexicon')
nltk.data.path.append('/kaggle/working/nltk_data')
print("Checking NLTK wordnet resource...")
wordnet_zip = '/kaggle/working/nltk_data/corpora/wordnet.zip'
if os.path.exists(wordnet_zip):
    print(f"Unzipping wordnet from {wordnet_zip}...")
    with zipfile.ZipFile(wordnet_zip, 'r') as zip_ref:
        zip_ref.extractall('/kaggle/working/nltk_data/corpora')
    os.remove(wordnet_zip)
    print("Wordnet unzipped and zip file removed.")
else:
    print("Wordnet already unzipped or not downloaded as zip.")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import swifter
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Dense, Dropout, Concatenate, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.mixed_precision import set_global_policy
from gensim.models import Word2Vec
import pickle
from IPython.display import FileLink
import gc

# Enable mixed precision and multi-GPU
set_global_policy('mixed_float16')
strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
print(f"GPUs in use: {strategy.num_replicas_in_sync}")

# Load and clean dataset
print("Loading WELFake dataset...")
df = pd.read_csv("/kaggle/input/fake-news-classification/WELFake_Dataset.csv")
df = df.dropna(subset=['title', 'text', 'label'])
df['text'] = df['title'].astype(str) + " " + df['text'].astype(str)
print(f"Cleaned dataset shape: {df.shape}")

# Feature engineering with swifter
print("Engineering features...")
stop_words = set(stopwords.words('english')) - {'not'}
lemmatizer = WordNetLemmatizer()
sia = nltk.sentiment.vader.SentimentIntensityAnalyzer()

def count_words(text): return len(text.split()) if isinstance(text, str) else 0
def count_chars(text): return len(text) if isinstance(text, str) else 0
def uppercase_ratio(text): return sum(1 for c in text if c.isupper()) / len(text) if isinstance(text, str) and len(text) > 0 else 0
def punctuation_count(text): return sum(1 for c in text if c in string.punctuation) if isinstance(text, str) else 0
def sentiment_polarity(text): return sia.polarity_scores(text)['compound'] if isinstance(text, str) else 0

for col in ['title', 'text']:
    df[f'{col}_word_count'] = df[col].swifter.apply(count_words)
    df[f'{col}_char_count'] = df[col].swifter.apply(count_chars)
    df[f'{col}_uppercase_ratio'] = df[col].swifter.apply(uppercase_ratio)
    df[f'{col}_punctuation_count'] = df[col].swifter.apply(punctuation_count)
    df[f'{col}_sentiment'] = df[col].swifter.apply(sentiment_polarity)

numerical_features = ['title_word_count', 'text_word_count', 'title_char_count', 'text_char_count',
                      'title_uppercase_ratio', 'text_uppercase_ratio', 'title_punctuation_count',
                      'text_punctuation_count', 'title_sentiment', 'text_sentiment']
print("Features engineered.")

# Preprocessing with multiprocessing
def preprocess_text(text):
    if not isinstance(text, str) or not text.strip(): return ""
    text = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
    words = text.split()
    return " ".join(lemmatizer.lemmatize(word) for word in words if word not in stop_words or word == 'not')

print("Preprocessing texts with multiprocessing...")
with Pool(cpu_count()) as pool:
    df['preprocessed_text'] = pool.map(preprocess_text, df['text'].tolist())
tokenized_texts = [text.split() for text in df['preprocessed_text'].tolist()]
print("Preprocessing completed.")

# Train-test split
X_texts = df['preprocessed_text'].tolist()
X_num = df[numerical_features].values
y = df['label'].values
X_train_texts, X_test_texts, X_train_num, X_test_num, y_train, y_test = train_test_split(
    X_texts, X_num, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train size: {len(X_train_texts)}, Test size: {len(X_test_texts)}")

# Export preprocessed data
print("Exporting preprocessed data...")
os.makedirs('/kaggle/working/output', exist_ok=True)
train_df = pd.DataFrame({'preprocessed_text': X_train_texts, 'label': y_train})
train_df[numerical_features] = X_train_num
test_df = pd.DataFrame({'preprocessed_text': X_test_texts, 'label': y_test})
test_df[numerical_features] = X_test_num
train_df.to_csv('/kaggle/working/output/train_preprocessed.csv', index=False)
test_df.to_csv('/kaggle/working/output/test_preprocessed.csv', index=False)
print("Datasets saved.")
display(FileLink('/kaggle/working/output/train_preprocessed.csv'))
display(FileLink('/kaggle/working/output/test_preprocessed.csv'))

# Tokenization and padding
max_num_words = 20000
max_sequence_length = 100
tokenizer = Tokenizer(num_words=max_num_words, lower=True)
tokenizer.fit_on_texts(X_train_texts)
X_train_seq = tokenizer.texts_to_sequences(X_train_texts)
X_test_seq = tokenizer.texts_to_sequences(X_test_texts)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")
with open('/kaggle/working/output/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
    
print("Tokenizer saved.")

# Normalize numerical features
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)
with open('/kaggle/working/output/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Scaler saved.")

# Train Word2Vec
print("Training Word2Vec...")
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=cpu_count(), sg=1, seed=42)
embedding_dim = 100
num_words = min(max_num_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_num_words: continue
    if word in w2v_model.wv: embedding_matrix[i] = w2v_model.wv[word]
print("Word2Vec trained.")

# Build model with multi-GPU
def build_model():
    text_input = Input(shape=(max_sequence_length,), name="text_input")
    embedding = Embedding(input_dim=num_words, output_dim=embedding_dim, weights=[embedding_matrix],
                          input_length=max_sequence_length, trainable=False)(text_input)
    x = SpatialDropout1D(0.2)(embedding)
    x = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(x)
    num_input = Input(shape=(X_train_num.shape[1],), name="num_input")
    num_x = Dense(32, activation="relu")(num_input)
    num_x = BatchNormalization()(num_x)
    num_x = Dropout(0.2)(num_x)
    merged = Concatenate()([x, num_x])
    output = Dense(1, activation="sigmoid", dtype='float32')(merged)
    return Model(inputs=[text_input, num_input], outputs=output)

with strategy.scope():
    model = build_model()
    model.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Data pipeline
batch_size = 128 * strategy.num_replicas_in_sync
train_dataset = tf.data.Dataset.from_tensor_slices(((X_train_pad, X_train_num), y_train)).shuffle(10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices(((X_test_pad, X_test_num), y_test)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Train
print("Training model...")
history = model.fit(
    train_dataset, epochs=10,
    validation_data=test_dataset,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
               ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)],
    verbose=1
)
model.save('/kaggle/working/output/lstm_model.h5')
print("Model saved.")
display(FileLink('/kaggle/working/output/lstm_model.h5'))

# Evaluate
y_pred_prob = model.predict(test_dataset)
y_pred = (y_pred_prob >= 0.5).astype(int).flatten()
print(f"\nWord2Vec + LSTM Model Accuracy: {model.evaluate(test_dataset, verbose=0)[1]:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Clean up
del df, X_train_texts, X_test_texts, X_train_num, X_test_num, X_train_pad, X_test_pad, tokenized_texts
gc.collect()

Installing packages...
Packages installed.
Found input file: /kaggle/input/fake-news-classification/WELFake_Dataset.csv
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Checking NLTK wordnet resource...
Unzipping wordnet from /kaggle/working/nltk_data/corpora/wordnet.zip...
Wordnet unzipped and zip file removed.
GPUs in use: 2
Loading WELFake dataset...
Cleaned dataset shape: (71537, 4)
Engineering features...


Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71537 [00:00<?, ?it/s]

Features engineered.
Preprocessing texts with multiprocessing...
Preprocessing completed.
Train size: 57229, Test size: 14308
Exporting preprocessed data...
Datasets saved.


Found 173238 unique tokens.
Tokenizer saved.
Scaler saved.
Training Word2Vec...
Word2Vec trained.




Training model...
Epoch 1/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 177ms/step - accuracy: 0.7837 - loss: 0.4190 - val_accuracy: 0.9273 - val_loss: 0.1886 - learning_rate: 0.0010
Epoch 2/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 177ms/step - accuracy: 0.9190 - loss: 0.2025 - val_accuracy: 0.9241 - val_loss: 0.1816 - learning_rate: 0.0010
Epoch 3/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 176ms/step - accuracy: 0.9286 - loss: 0.1810 - val_accuracy: 0.9402 - val_loss: 0.1510 - learning_rate: 0.0010
Epoch 4/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 177ms/step - accuracy: 0.9345 - loss: 0.1645 - val_accuracy: 0.9455 - val_loss: 0.1448 - learning_rate: 0.0010
Epoch 5/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 178ms/step - accuracy: 0.9407 - loss: 0.1552 - val_accuracy: 0.9535 - val_loss: 0.1238 - learning_rate: 0.0010
Epoch 6/10
[1m224/224[0m [32m━━━━━

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 63ms/step

Word2Vec + LSTM Model Accuracy: 0.9596

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      7006
           1       0.97      0.95      0.96      7302

    accuracy                           0.96     14308
   macro avg       0.96      0.96      0.96     14308
weighted avg       0.96      0.96      0.96     14308



30528

In [3]:
import os
import zipfile
from IPython.display import FileLink

# Define the output directory and ZIP file name
output_dir = '/kaggle/working/output'
zip_filename = '/kaggle/working/output_files.zip'

# Function to zip the entire folder
def zip_folder(folder_path, output_zip):
    # Create a ZipFile object in write mode
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the directory
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                # Create the full file path
                file_path = os.path.join(root, file)
                # Calculate the relative path for the ZIP structure
                relative_path = os.path.relpath(file_path, folder_path)
                # Add the file to the ZIP with its relative path
                zipf.write(file_path, os.path.join(os.path.basename(folder_path), relative_path))
    print(f"Created ZIP file: {output_zip}")

# Check if the output directory exists and has files
if os.path.exists(output_dir) and os.listdir(output_dir):
    print(f"Zipping contents of {output_dir}...")
    zip_folder(output_dir, zip_filename)
    
    # Display a download link
    if os.path.exists(zip_filename):
        display(FileLink(zip_filename))
    else:
        print("Error: ZIP file was not created.")
else:
    print(f"Error: Directory {output_dir} is empty or does not exist.")

Zipping contents of /kaggle/working/output...
Created ZIP file: /kaggle/working/output_files.zip


In [None]:
# Install required packages
print("Installing packages...")
!pip install newsapi-python -q
print("Packages installed.")

# Import libraries
import numpy as np
import os
import re
from multiprocessing import Pool, cpu_count
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(f"Found input file: {os.path.join(dirname, filename)}")

import re
import nltk
import zipfile
nltk.download('stopwords')
nltk.download('wordnet', download_dir='/kaggle/working/nltk_data')
nltk.download('vader_lexicon')
nltk.data.path.append('/kaggle/working/nltk_data')
print("Checking NLTK wordnet resource...")
wordnet_zip = '/kaggle/working/nltk_data/corpora/wordnet.zip'
if os.path.exists(wordnet_zip):
    print(f"Unzipping wordnet from {wordnet_zip}...")
    with zipfile.ZipFile(wordnet_zip, 'r') as zip_ref:
        zip_ref.extractall('/kaggle/working/nltk_data/corpora')
    os.remove(wordnet_zip)
    print("Wordnet unzipped and zip file removed.")
else:
    print("Wordnet already unzipped or not downloaded as zip.")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import custom_object_scope
import pickle
from newsapi import NewsApiClient
import requests
from bs4 import BeautifulSoup
import string

# Define a minimal Cast layer as a fallback (only if needed)
class Cast(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Cast, self).__init__(**kwargs)
    
    def call(self, inputs):
        # Let TensorFlow infer dtype from inputs or model context
        return tf.cast(inputs, dtype=self.dtype_policy.compute_dtype)
    
    def get_config(self):
        return super(Cast, self).get_config()
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

# Robust file checking and loading
def load_file(file_path, load_func, desc):
    if os.path.exists(file_path):
        try:
            return load_func(file_path)
        except Exception as e:
            print(f"Error loading {desc} from {file_path}: {e}")
            return None
    else:
        print(f"{desc} file not found at {file_path}.")
        return None

# Load model and tools with error handling
print("Loading model and tools...")
model_path = '/kaggle/input/fake/tensorflow2/default/1/lstm_model.h5'
tokenizer_path = '/kaggle/input/fake/tensorflow2/default/1/tokenizer.pkl'
scaler_path = '/kaggle/input/fake/tensorflow2/default/1/scaler.pkl'

with custom_object_scope({'Cast': Cast}):
    model = load_file(model_path, lambda p: tf.keras.models.load_model(p), "Model")
tokenizer = load_file(tokenizer_path, lambda p: pickle.load(open(p, 'rb')), "Tokenizer")
scaler = load_file(scaler_path, lambda p: pickle.load(open(p, 'rb')), "Scaler")

if model is None or tokenizer is None or scaler is None:
    print("Critical files missing or corrupted. Please ensure training script ran successfully.")
    raise SystemExit(1)
print("Model, tokenizer, and scaler loaded successfully.")

# Constants
MAX_SEQUENCE_LENGTH = 100
NUM_FEATURES = 10
API_KEY = '20a033afa85e4b72af903562634d7f6d'  # Replace with your NewsAPI key
stop_words = set(stopwords.words('english')) - {'not'}
lemmatizer = WordNetLemmatizer()
sia = nltk.sentiment.vader.SentimentIntensityAnalyzer()

# Preprocessing and feature extraction with fallbacks
def preprocess_text(text):
    if not isinstance(text, str) or not text.strip():
        return ""
    try:
        text = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
        words = text.split()
        return " ".join(lemmatizer.lemmatize(word) for word in words if word not in stop_words or word == 'not')
    except Exception as e:
        print(f"Error preprocessing text: {e}")
        return ""

def extract_numerical_features(text):
    if not isinstance(text, str):
        return np.zeros(NUM_FEATURES)
    try:
        words = text.split()
        title = text[:50]  # Rough title approximation
        return np.array([
            len(title.split()), len(words), len(title), len(text),
            sum(1 for c in title if c.isupper()) / len(title) if len(title) > 0 else 0,
            sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0,
            sum(1 for c in title if c in string.punctuation), sum(1 for c in text if c in string.punctuation),
            sia.polarity_scores(title)['compound'], sia.polarity_scores(text)['compound']
        ])
    except Exception as e:
        print(f"Error extracting features: {e}")
        return np.zeros(NUM_FEATURES)

# Batch prediction with GPU and error handling
def predict_batch(texts):
    try:
        processed_texts = [preprocess_text(t) for t in texts]
        seqs = tokenizer.texts_to_sequences(processed_texts)
        padded_seqs = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
        num_features = scaler.transform(np.array([extract_numerical_features(t) for t in texts]))
        with tf.device('/GPU:0'):
            preds = model.predict([padded_seqs, num_features], batch_size=256, verbose=0)
        return (preds >= 0.5).astype(int).flatten()
    except Exception as e:
        print(f"Prediction error: {e}")
        return np.zeros(len(texts), dtype=int)  # Fallback to all 'Fake' if prediction fails

# Fetch news with robust error handling
def fetch_news(topic):
    newsapi = NewsApiClient(api_key=API_KEY)
    try:
        response = newsapi.get_everything(q=topic, language='en', page_size=10)
        articles = []
        for article in response['articles']:
            try:
                resp = requests.get(article['url'], timeout=5)
                soup = BeautifulSoup(resp.text, 'html.parser')
                text = " ".join(p.get_text() for p in soup.find_all('p'))
                if text.strip():
                    articles.append(f"{article['title']} {text}")
            except requests.RequestException as e:
                print(f"Failed to fetch article {article.get('url', 'unknown')}: {e}")
                continue
        print(f"Fetched {len(articles)} articles for '{topic}'.")
        return articles
    except Exception as e:
        print(f"Error fetching news: {e}")
        return []

# Interactive workflow with robustness
while True:
    try:
        choice = input("Enter 'news' for news fetch or text to classify (or 'exit'): ").strip().lower()
        if choice == 'exit':
            break
        elif choice == 'news':
            topic = input("Enter news topic (e.g., Chandrayaan): ").strip()
            if topic:
                articles = fetch_news(topic)
                if articles:
                    labels = predict_batch(articles)
                    for text, label in zip(articles, labels):
                        print(f"Text: '{text[:50]}...' -> Predicted: {'Real' if label else 'Fake'}")
                else:
                    print("No articles fetched. Try another topic or check API key.")
        else:
            labels = predict_batch([choice])
            print(f"Input: '{choice[:50]}...' -> Predicted: {'Real' if labels[0] else 'Fake'}")
    except KeyboardInterrupt:
        print("\nExiting gracefully...")
        break
    except Exception as e:
        print(f"Workflow error: {e}. Continuing...")

print("Program terminated.")