In [4]:
# Final Fixed Fake News Detection System
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
import pickle
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except:
    pass

class FinalFakeNewsDetector:
    def __init__(self):
        """Initialize the Fixed Fake News Detection System"""
        self.tfidf_vectorizer = None
        self.scaler = None
        self.models = {}
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.feature_names = []
        
    def create_expanded_dataset(self):
        """Create a larger, balanced dataset for better training"""
        
        # Fake news examples with clear patterns
        fake_news = [
            "BREAKING: Scientists HATE this one weird trick that cures cancer instantly!",
            "SHOCKING revelation: Government hiding alien technology for decades!",
            "You won't BELIEVE what doctors found in this man's stomach!",
            "URGENT: New study proves vaccines contain mind control chips!",
            "INCREDIBLE: Local mom discovers miracle weight loss secret!",
            "AMAZING: This simple trick will make you rich overnight!",
            "UNBELIEVABLE: Child predicts future with 100% accuracy!",
            "SECRET government plan to control weather revealed by whistleblower!",
            "MIRACLE cure found in grandmother's attic eliminates all diseases!",
            "TERRIFYING: 5G towers causing mass bird deaths worldwide!",
            "EXCLUSIVE: Celebrity admits to being controlled by illuminati!",
            "EXPOSED: Hospitals hiding cure for diabetes to make profits!",
            "STUNNING: Man lives 200 years using this one simple method!",
            "OUTRAGEOUS: Schools teaching children to worship Satan!",
            "BOMBSHELL: Water fluoridation linked to zombie outbreak!"
        ]
        
        # Real news examples with credible patterns
        real_news = [
            "Researchers at Stanford University published findings on renewable energy efficiency in peer-reviewed journal Nature.",
            "The Federal Reserve announced a 0.25% interest rate increase following comprehensive economic data analysis.",
            "Local hospital reports 15% decrease in COVID-19 cases over two-week period according to health officials.",
            "NASA's James Webb telescope captures detailed images of distant galaxy formation in unprecedented detail.",
            "City council approves $2.3 million budget for infrastructure improvements and public services enhancement.",
            "University study shows meditation may reduce stress levels in clinical trial with 200 participants.",
            "Environmental Protection Agency releases new guidelines for air quality monitoring in urban areas.",
            "Medical researchers report promising results in early-stage cancer treatment trials at Johns Hopkins.",
            "Economic indicators suggest moderate growth in manufacturing sector this quarter, analysts report.",
            "Scientists at MIT develop new battery technology for electric vehicle applications in laboratory setting.",
            "Health officials recommend updated vaccination schedules based on current epidemiological data.",
            "Archaeological team discovers 3,000-year-old artifacts in systematic excavation near ancient city.",
            "Climate research team publishes study on Arctic ice patterns in Journal of Climate Science.",
            "University hospital announces successful organ transplant program expansion with improved outcomes.",
            "Technology conference showcases advances in artificial intelligence applications for medical diagnosis."
        ]
        
        # Create balanced dataset
        texts = fake_news + real_news
        labels = [0] * len(fake_news) + [1] * len(real_news)  # 0 = Fake, 1 = Real
        
        df = pd.DataFrame({
            'text': texts,
            'label': labels
        })
        
        return df
    
    def preprocess_text(self, text):
        """Enhanced text preprocessing"""
        if pd.isna(text):
            return ""
            
        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs, emails, and special patterns
        text = re.sub(r'http\S+|www.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s!?.,]', '', text)
        
        # Tokenize
        tokens = word_tokenize(text)
        
        # Remove stopwords and stem (preserve some indicators)
        processed_tokens = []
        for token in tokens:
            if token not in self.stop_words or token in ['!', '?']:
                if token.isalpha():
                    processed_tokens.append(self.stemmer.stem(token))
                else:
                    processed_tokens.append(token)
        
        return ' '.join(processed_tokens)
    
    def extract_linguistic_features(self, text):
        """Extract comprehensive linguistic features"""
        features = {}
        
        if not text or len(text.strip()) == 0:
            # Return zero features for empty text
            feature_names = [
                'char_count', 'word_count', 'sentence_count', 'avg_word_length',
                'avg_sentence_length', 'exclamation_count', 'question_count',
                'capital_ratio', 'exclamation_ratio', 'emotional_word_count',
                'emotional_word_ratio', 'credibility_indicators', 'clickbait_count', 'caps_word_count'
            ]
            return {name: 0.0 for name in feature_names}
        
        # Basic text statistics
        words = text.split()
        sentences = sent_tokenize(text)
        
        features['char_count'] = float(len(text))
        features['word_count'] = float(len(words))
        features['sentence_count'] = float(max(len(sentences), 1))
        features['avg_word_length'] = float(np.mean([len(word) for word in words]) if words else 0)
        features['avg_sentence_length'] = float(len(words) / max(len(sentences), 1))
        
        # Punctuation and style features
        features['exclamation_count'] = float(text.count('!'))
        features['question_count'] = float(text.count('?'))
        features['capital_ratio'] = float(sum(1 for c in text if c.isupper()) / max(len(text), 1))
        features['exclamation_ratio'] = float(features['exclamation_count'] / max(len(sentences), 1))
        
        # Emotional and sensational language
        emotional_words = [
            'amazing', 'shocking', 'unbelievable', 'incredible', 'outrageous', 
            'stunning', 'terrifying', 'miracle', 'secret', 'exposed', 'breaking',
            'urgent', 'exclusive', 'revealed', 'discovered', 'bombshell'
        ]
        emotional_count = sum(1 for word in emotional_words if word in text.lower())
        features['emotional_word_count'] = float(emotional_count)
        features['emotional_word_ratio'] = float(emotional_count / max(len(words), 1))
        
        # Credibility indicators
        credible_phrases = [
            'according to', 'study shows', 'research indicates', 'scientists found',
            'university', 'published', 'peer-reviewed', 'clinical trial', 'data suggests'
        ]
        features['credibility_indicators'] = float(sum(1 for phrase in credible_phrases if phrase in text.lower()))
        
        # Clickbait indicators
        clickbait_words = ['you won\'t believe', 'doctors hate', 'one weird trick', 'this will shock']
        features['clickbait_count'] = float(sum(1 for phrase in clickbait_words if phrase in text.lower()))
        
        # All caps words (shouting)
        features['caps_word_count'] = float(sum(1 for word in words if word.isupper() and len(word) > 1))
        
        return features
    
    def prepare_features(self, df):
        """Prepare features ensuring non-negative values"""
        print("Preprocessing and extracting features...")
        
        # Preprocess text
        df['cleaned_text'] = df['text'].apply(self.preprocess_text)
        
        # Extract linguistic features
        linguistic_features = []
        for text in df['text']:
            features = self.extract_linguistic_features(text)
            linguistic_features.append(features)
        
        linguistic_df = pd.DataFrame(linguistic_features)
        self.feature_names = list(linguistic_df.columns)
        
        # Handle any NaN values
        linguistic_df = linguistic_df.fillna(0.0)
        
        # Ensure all linguistic features are non-negative (they should be by design)
        linguistic_df = linguistic_df.abs()
        
        print(f"Linguistic features shape: {linguistic_df.shape}")
        print(f"Linguistic features min: {linguistic_df.min().min()}")
        print(f"Linguistic features max: {linguistic_df.max().max()}")
        
        # TF-IDF vectorization
        if self.tfidf_vectorizer is None:
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=500,  # Reduced for stability
                ngram_range=(1, 2),
                min_df=1,
                max_df=0.95,
                stop_words='english'
            )
            tfidf_features = self.tfidf_vectorizer.fit_transform(df['cleaned_text'])
        else:
            tfidf_features = self.tfidf_vectorizer.transform(df['cleaned_text'])
        
        tfidf_dense = tfidf_features.toarray()
        print(f"TF-IDF features shape: {tfidf_dense.shape}")
        print(f"TF-IDF features min: {tfidf_dense.min()}")
        print(f"TF-IDF features max: {tfidf_dense.max()}")
        
        # Scale linguistic features to [0,1] range
        if self.scaler is None:
            self.scaler = MinMaxScaler()
            linguistic_scaled = self.scaler.fit_transform(linguistic_df.values)
        else:
            linguistic_scaled = self.scaler.transform(linguistic_df.values)
        
        print(f"Scaled linguistic features min: {linguistic_scaled.min()}")
        print(f"Scaled linguistic features max: {linguistic_scaled.max()}")
        
        # Combine features (both are now guaranteed to be non-negative)
        combined_features = np.hstack([tfidf_dense, linguistic_scaled])
        
        print(f"Combined features shape: {combined_features.shape}")
        print(f"Combined features min: {combined_features.min()}")
        print(f"Combined features max: {combined_features.max()}")
        
        # Final check for negative values
        if combined_features.min() < 0:
            print("WARNING: Negative values detected, applying absolute transformation")
            combined_features = np.abs(combined_features)
        
        return combined_features, df['label'].values
    
    def train_models(self, X_train, y_train):
        """Train models with error handling"""
        print("Training models...")
        
        # Models optimized for the dataset
        models = {
            'Naive Bayes': MultinomialNB(alpha=1.0),
            'Logistic Regression': LogisticRegression(
                random_state=42, 
                C=1.0,
                max_iter=1000,
                solver='liblinear'
            ),
            'Random Forest': RandomForestClassifier(
                n_estimators=50,
                random_state=42,
                max_depth=10,
                min_samples_split=2,
                min_samples_leaf=1
            )
        }
        
        # Train models with error handling
        for name, model in models.items():
            print(f"\nTraining {name}...")
            
            try:
                # Train model
                model.fit(X_train, y_train)
                self.models[name] = model
                
                # Quick accuracy check on training data
                train_pred = model.predict(X_train)
                train_acc = accuracy_score(y_train, train_pred)
                print(f"{name} training accuracy: {train_acc:.3f}")
                print(f"{name} training successful!")
                
            except Exception as e:
                print(f"Error training {name}: {str(e)}")
                continue
                
        print(f"\nModel training completed! Successfully trained {len(self.models)} models.")
        if len(self.models) == 0:
            print("ERROR: No models were successfully trained!")
            return False
        return True
    
    def evaluate_models(self, X_test, y_test):
        """Evaluate model performance"""
        print("\nEvaluating models on test set...")
        
        if not self.models:
            print("No models available for evaluation!")
            return {}
        
        results = {}
        
        for name, model in self.models.items():
            try:
                # Make predictions
                y_pred = model.predict(X_test)
                y_pred_proba = model.predict_proba(X_test)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred)
                
                results[name] = {
                    'accuracy': accuracy,
                    'predictions': y_pred,
                    'probabilities': y_pred_proba,
                    'classification_report': classification_report(y_test, y_pred, zero_division=0)
                }
                
                print(f"\n{name} Results:")
                print(f"Accuracy: {accuracy:.4f}")
                print("Classification Report:")
                print(classification_report(y_test, y_pred, zero_division=0))
                
                # Confusion Matrix
                cm = confusion_matrix(y_test, y_pred)
                print(f"Confusion Matrix:\n{cm}")
                
            except Exception as e:
                print(f"Error evaluating {name}: {str(e)}")
                continue
        
        return results
    
    def predict_single_article(self, text):
        """Make predictions on a single article"""
        if not self.models:
            return {"error": "No models available for prediction"}
        
        try:
            # Create temporary dataframe for preprocessing
            temp_df = pd.DataFrame({'text': [text], 'label': [0]})  # Dummy label
            
            # Prepare features
            X, _ = self.prepare_features(temp_df)
            
            # Get predictions from all models
            predictions = {}
            votes = {'Fake': 0, 'Real': 0}
            
            for name, model in self.models.items():
                try:
                    pred = model.predict(X)[0]
                    proba = model.predict_proba(X)[0]
                    
                    prediction_label = 'Real' if pred == 1 else 'Fake'
                    confidence = float(max(proba))
                    
                    predictions[name] = {
                        'prediction': prediction_label,
                        'confidence': confidence
                    }
                    
                    votes[prediction_label] += 1
                    
                except Exception as e:
                    print(f"Error in prediction with {name}: {str(e)}")
                    continue
            
            # Ensemble prediction
            if votes['Fake'] + votes['Real'] > 0:
                ensemble_prediction = 'Fake' if votes['Fake'] > votes['Real'] else 'Real'
                ensemble_confidence = float(max(votes.values()) / len(predictions))
                
                predictions['Ensemble'] = {
                    'prediction': ensemble_prediction,
                    'confidence': ensemble_confidence
                }
            
            return predictions
            
        except Exception as e:
            return {"error": f"Prediction failed: {str(e)}"}
    
    def save_model(self, filepath):
        """Save trained models and preprocessors"""
        model_data = {
            'models': self.models,
            'tfidf_vectorizer': self.tfidf_vectorizer,
            'scaler': self.scaler,
            'feature_names': self.feature_names
        }
        
        try:
            with open(filepath, 'wb') as f:
                pickle.dump(model_data, f)
            print(f"Models saved to {filepath}")
            return True
        except Exception as e:
            print(f"Error saving models: {str(e)}")
            return False
    
    def load_model(self, filepath):
        """Load pre-trained models"""
        try:
            with open(filepath, 'rb') as f:
                model_data = pickle.load(f)
            
            self.models = model_data['models']
            self.tfidf_vectorizer = model_data['tfidf_vectorizer']
            self.scaler = model_data['scaler']
            self.feature_names = model_data.get('feature_names', [])
            
            print(f"Models loaded from {filepath}")
            return True
        except Exception as e:
            print(f"Error loading models: {str(e)}")
            return False

def main():
    """Main execution with comprehensive error handling"""
    print("=== FINAL FIXED FAKE NEWS DETECTION SYSTEM ===\n")
    
    try:
        # Initialize detector
        detector = FinalFakeNewsDetector()
        
        # Create expanded dataset
        print("Creating expanded dataset...")
        df = detector.create_expanded_dataset()
        print(f"Dataset created: {len(df)} articles")
        print(f"Real news: {sum(df['label'])}, Fake news: {len(df) - sum(df['label'])}")
        
        # Prepare features
        X, y = detector.prepare_features(df)
        
        # Split data with stratification
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, 
            test_size=0.3, 
            random_state=42, 
            stratify=y
        )
        
        print(f"\nData split completed:")
        print(f"Training set: {len(X_train)} samples")
        print(f"Test set: {len(X_test)} samples")
        print(f"Training labels: Fake={list(y_train).count(0)}, Real={list(y_train).count(1)}")
        print(f"Test labels: Fake={list(y_test).count(0)}, Real={list(y_test).count(1)}")
        
        # Train models
        success = detector.train_models(X_train, y_train)
        
        if not success:
            print("Training failed. Exiting...")
            return None
        
        # Evaluate models
        results = detector.evaluate_models(X_test, y_test)
        
        # Test with sample articles
        print("\n" + "="*60)
        print("TESTING WITH SAMPLE ARTICLES")
        print("="*60)
        
        test_articles = [
            "Scientists at MIT have developed a revolutionary new battery technology according to peer-reviewed research published in Nature journal.",
            "BREAKING: Government officials confirm alien contact, world leaders to make SHOCKING announcement tomorrow!!!",
            "Local hospital reports successful treatment outcomes with new COVID-19 therapy in controlled clinical trial."
        ]
        
        for i, article in enumerate(test_articles, 1):
            print(f"\nTest Article {i}: {article[:80]}...")
            predictions = detector.predict_single_article(article)
            
            if "error" in predictions:
                print(f"Error: {predictions['error']}")
            else:
                for model_name, result in predictions.items():
                    print(f"{model_name}: {result['prediction']} (Confidence: {result['confidence']:.3f})")
        
        # Save models
        detector.save_model('final_fake_news_detector.pkl')
        
        print("\n" + "="*60)
        print("SYSTEM TRAINING AND TESTING COMPLETE!")
        print("="*60)
        
        return detector
        
    except Exception as e:
        print(f"System error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    detector = main()

=== FINAL FIXED FAKE NEWS DETECTION SYSTEM ===

Creating expanded dataset...
Dataset created: 30 articles
Real news: 15, Fake news: 15
Preprocessing and extracting features...
Linguistic features shape: (30, 14)
Linguistic features min: 0.0
Linguistic features max: 117.0
TF-IDF features shape: (30, 439)
TF-IDF features min: 0.0
TF-IDF features max: 0.42921607226022546
Scaled linguistic features min: 0.0
Scaled linguistic features max: 1.0
Combined features shape: (30, 453)
Combined features min: 0.0
Combined features max: 1.0

Data split completed:
Training set: 21 samples
Test set: 9 samples
Training labels: Fake=10, Real=11
Test labels: Fake=5, Real=4
Training models...

Training Naive Bayes...
Naive Bayes training accuracy: 1.000
Naive Bayes training successful!

Training Logistic Regression...
Logistic Regression training accuracy: 1.000
Logistic Regression training successful!

Training Random Forest...
Random Forest training accuracy: 1.000
Random Forest training successful!

Mod

In [6]:
# Enhanced Fake News Detection System v2.0
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import pickle
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import json
from textstat import flesch_reading_ease, flesch_kincaid_grade
from collections import Counter
import logging
import os
from pathlib import Path

warnings.filterwarnings('ignore')

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except:
    pass

class EnhancedFakeNewsDetector:
    def __init__(self, config_file=None):
        """Initialize the Enhanced Fake News Detection System"""
        self.config = self._load_config(config_file)
        self.tfidf_vectorizer = None
        self.count_vectorizer = None
        self.scaler = None
        self.pca = None
        self.models = {}
        self.ensemble_model = None
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.feature_names = []
        self.feature_importance = {}
        self.training_history = []
        
        # Advanced feature extractors
        self.sentiment_analyzer = None
        self._init_sentiment_analyzer()
    
    def _load_config(self, config_file):
        """Load configuration settings"""
        default_config = {
            'tfidf_max_features': 1000,
            'tfidf_ngram_range': (1, 3),
            'use_pca': True,
            'pca_components': 0.95,
            'cross_validation_folds': 5,
            'random_state': 42,
            'test_size': 0.2,
            'min_word_length': 2,
            'max_word_length': 20
        }
        
        if config_file and os.path.exists(config_file):
            with open(config_file, 'r') as f:
                user_config = json.load(f)
            default_config.update(user_config)
        
        return default_config
    
    def _init_sentiment_analyzer(self):
        """Initialize sentiment analysis tools"""
        try:
            from nltk.sentiment import SentimentIntensityAnalyzer
            self.sentiment_analyzer = SentimentIntensityAnalyzer()
        except:
            logger.warning("VADER sentiment analyzer not available")
    
    def create_comprehensive_dataset(self):
        """Create a more comprehensive and diverse dataset"""
        
        # Expanded fake news examples with various patterns
        fake_news = [
            "BREAKING: Scientists HATE this one weird trick that cures cancer instantly!",
            "SHOCKING revelation: Government hiding alien technology for decades!",
            "You won't BELIEVE what doctors found in this man's stomach!",
            "URGENT: New study proves vaccines contain mind control chips!",
            "INCREDIBLE: Local mom discovers miracle weight loss secret!",
            "AMAZING: This simple trick will make you rich overnight!",
            "UNBELIEVABLE: Child predicts future with 100% accuracy!",
            "SECRET government plan to control weather revealed by whistleblower!",
            "MIRACLE cure found in grandmother's attic eliminates all diseases!",
            "TERRIFYING: 5G towers causing mass bird deaths worldwide!",
            "EXCLUSIVE: Celebrity admits to being controlled by illuminati!",
            "EXPOSED: Hospitals hiding cure for diabetes to make profits!",
            "STUNNING: Man lives 200 years using this one simple method!",
            "OUTRAGEOUS: Schools teaching children to worship Satan!",
            "BOMBSHELL: Water fluoridation linked to zombie outbreak!",
            "DOCTORS DON'T WANT YOU TO KNOW this ancient herbal remedy!",
            "VIRAL: Woman cures arthritis with kitchen spice, pharma companies furious!",
            "LEAKED: Government documents prove chemtrails are mind control experiments!",
            "FORBIDDEN knowledge: How to unlock your psychic powers in 3 days!",
            "CONSPIRACY: Major news outlets controlled by reptilian overlords!",
            "ULTIMATE secret to eternal youth discovered by 90-year-old farmer!",
            "MYSTERIOUS object found in Antarctica changes everything we know!",
            "APOCALYPSE warning: Planet X approaching Earth, NASA covers up!",
            "REVOLUTIONARY: Tap water turns toxic, only this filter can save you!",
            "INCREDIBLE discovery: Lost city of Atlantis found using Google Maps!"
        ]
        
        # Expanded real news examples with credible patterns
        real_news = [
            "Researchers at Stanford University published findings on renewable energy efficiency in peer-reviewed journal Nature.",
            "The Federal Reserve announced a 0.25% interest rate increase following comprehensive economic data analysis.",
            "Local hospital reports 15% decrease in COVID-19 cases over two-week period according to health officials.",
            "NASA's James Webb telescope captures detailed images of distant galaxy formation in unprecedented detail.",
            "City council approves $2.3 million budget for infrastructure improvements and public services enhancement.",
            "University study shows meditation may reduce stress levels in clinical trial with 200 participants.",
            "Environmental Protection Agency releases new guidelines for air quality monitoring in urban areas.",
            "Medical researchers report promising results in early-stage cancer treatment trials at Johns Hopkins.",
            "Economic indicators suggest moderate growth in manufacturing sector this quarter, analysts report.",
            "Scientists at MIT develop new battery technology for electric vehicle applications in laboratory setting.",
            "Health officials recommend updated vaccination schedules based on current epidemiological data.",
            "Archaeological team discovers 3,000-year-old artifacts in systematic excavation near ancient city.",
            "Climate research team publishes study on Arctic ice patterns in Journal of Climate Science.",
            "University hospital announces successful organ transplant program expansion with improved outcomes.",
            "Technology conference showcases advances in artificial intelligence applications for medical diagnosis.",
            "Federal Trade Commission investigates merger between two major telecommunications companies for antitrust concerns.",
            "International study involving 15 countries examines effectiveness of renewable energy policies over decade-long period.",
            "Department of Education releases standardized test scores showing mixed results across different demographic groups.",
            "Pharmaceutical company announces Phase III clinical trial results for new Alzheimer's treatment in medical journal.",
            "Metropolitan transportation authority approves funding for electric bus fleet expansion in urban areas.",
            "Agricultural researchers develop drought-resistant wheat varieties through selective breeding techniques at state university.",
            "Housing market analysis shows regional variations in home prices according to National Association of Realtors.",
            "Public health officials track seasonal flu patterns using data from hospital networks across multiple states.",
            "Engineering team at technical institute creates more efficient solar panel design through materials research.",
            "Central bank releases quarterly economic outlook based on employment statistics and inflation measurements."
        ]
        
        # Create balanced dataset
        texts = fake_news + real_news
        labels = [0] * len(fake_news) + [1] * len(real_news)  # 0 = Fake, 1 = Real
        
        df = pd.DataFrame({
            'text': texts,
            'label': labels
        })
        
        # Shuffle the dataset
        df = df.sample(frac=1, random_state=self.config['random_state']).reset_index(drop=True)
        
        logger.info(f"Dataset created: {len(df)} articles")
        logger.info(f"Real news: {sum(df['label'])}, Fake news: {len(df) - sum(df['label'])}")
        
        return df
    
    def advanced_text_preprocessing(self, text):
        """Enhanced text preprocessing with multiple techniques"""
        if pd.isna(text) or not isinstance(text, str):
            return ""
        
        # Store original for some features
        original_text = text
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs, emails, and social media handles
        text = re.sub(r'http\S+|www\.\S+|@\w+|#\w+', '', text)
        
        # Remove excessive punctuation but preserve some patterns
        text = re.sub(r'[!]{2,}', '!!', text)  # Normalize multiple exclamations
        text = re.sub(r'[?]{2,}', '??', text)  # Normalize multiple questions
        text = re.sub(r'\.{3,}', '...', text)  # Normalize ellipses
        
        # Remove special characters but keep important punctuation
        text = re.sub(r'[^\w\s!?.,;:\-\'"]', ' ', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        
        # Tokenize and filter
        tokens = word_tokenize(text)
        
        # Advanced filtering
        processed_tokens = []
        for token in tokens:
            # Filter by length
            if (len(token) >= self.config['min_word_length'] and 
                len(token) <= self.config['max_word_length']):
                
                if token.isalpha():
                    # Only stem if not a potential indicator word
                    if token not in ['breaking', 'urgent', 'shocking', 'amazing', 'incredible']:
                        if token not in self.stop_words:
                            processed_tokens.append(self.stemmer.stem(token))
                        elif token in ['not', 'no', 'never']:  # Keep important negations
                            processed_tokens.append(token)
                    else:
                        processed_tokens.append(token)
                elif token in ['!', '?', '!!', '??']:
                    processed_tokens.append(token)
        
        return ' '.join(processed_tokens)
    
    def extract_advanced_features(self, text, original_text=None):
        """Extract comprehensive linguistic and stylistic features"""
        if original_text is None:
            original_text = text
            
        features = {}
        
        if not text or len(text.strip()) == 0:
            # Return zero features for empty text
            feature_names = [
                'char_count', 'word_count', 'sentence_count', 'avg_word_length',
                'avg_sentence_length', 'exclamation_count', 'question_count',
                'capital_ratio', 'exclamation_ratio', 'emotional_word_count',
                'emotional_word_ratio', 'credibility_indicators', 'clickbait_count',
                'caps_word_count', 'reading_ease', 'reading_grade', 'sentiment_pos',
                'sentiment_neg', 'sentiment_neu', 'sentiment_compound', 'url_count',
                'number_count', 'quoted_text_ratio', 'first_person_count',
                'second_person_count', 'superlative_count', 'temporal_urgency',
                'all_caps_ratio', 'punctuation_density'
            ]
            return {name: 0.0 for name in feature_names}
        
        words = text.split()
        sentences = sent_tokenize(original_text)
        
        # Basic statistics
        features['char_count'] = float(len(original_text))
        features['word_count'] = float(len(words))
        features['sentence_count'] = float(max(len(sentences), 1))
        features['avg_word_length'] = float(np.mean([len(word) for word in words]) if words else 0)
        features['avg_sentence_length'] = float(len(words) / max(len(sentences), 1))
        
        # Punctuation and style
        features['exclamation_count'] = float(original_text.count('!'))
        features['question_count'] = float(original_text.count('?'))
        features['capital_ratio'] = float(sum(1 for c in original_text if c.isupper()) / max(len(original_text), 1))
        features['exclamation_ratio'] = float(features['exclamation_count'] / max(len(sentences), 1))
        
        # Emotional and sensational language (expanded)
        emotional_words = [
            'amazing', 'shocking', 'unbelievable', 'incredible', 'outrageous', 
            'stunning', 'terrifying', 'miracle', 'secret', 'exposed', 'breaking',
            'urgent', 'exclusive', 'revealed', 'discovered', 'bombshell', 'viral',
            'forbidden', 'ultimate', 'revolutionary', 'mysterious', 'leaked',
            'conspiracy', 'apocalypse', 'shocking', 'terrifying', 'incredible'
        ]
        emotional_count = sum(1 for word in emotional_words if word in text.lower())
        features['emotional_word_count'] = float(emotional_count)
        features['emotional_word_ratio'] = float(emotional_count / max(len(words), 1))
        
        # Credibility indicators (expanded)
        credible_phrases = [
            'according to', 'study shows', 'research indicates', 'scientists found',
            'university', 'published', 'peer-reviewed', 'clinical trial', 'data suggests',
            'researchers report', 'analysis reveals', 'evidence indicates', 'findings show',
            'journal', 'institute', 'department', 'official', 'government', 'agency'
        ]
        features['credibility_indicators'] = float(sum(1 for phrase in credible_phrases if phrase in text.lower()))
        
        # Clickbait indicators (expanded)
        clickbait_patterns = [
            'you won\'t believe', 'doctors hate', 'one weird trick', 'this will shock',
            'what happened next', 'the results will surprise', 'number 7 will amaze',
            'this simple trick', 'they don\'t want you to know', 'will blow your mind'
        ]
        features['clickbait_count'] = float(sum(1 for phrase in clickbait_patterns if phrase in text.lower()))
        
        # All caps words
        features['caps_word_count'] = float(sum(1 for word in words if word.isupper() and len(word) > 1))
        features['all_caps_ratio'] = float(features['caps_word_count'] / max(len(words), 1))
        
        # Readability scores
        try:
            features['reading_ease'] = float(flesch_reading_ease(original_text))
            features['reading_grade'] = float(flesch_kincaid_grade(original_text))
        except:
            features['reading_ease'] = 50.0  # Average readability
            features['reading_grade'] = 8.0   # 8th grade level
        
        # Sentiment analysis
        if self.sentiment_analyzer:
            try:
                scores = self.sentiment_analyzer.polarity_scores(original_text)
                features['sentiment_pos'] = float(scores['pos'])
                features['sentiment_neg'] = float(scores['neg'])
                features['sentiment_neu'] = float(scores['neu'])
                features['sentiment_compound'] = float(scores['compound'])
            except:
                features.update({
                    'sentiment_pos': 0.0, 'sentiment_neg': 0.0,
                    'sentiment_neu': 1.0, 'sentiment_compound': 0.0
                })
        else:
            features.update({
                'sentiment_pos': 0.0, 'sentiment_neg': 0.0,
                'sentiment_neu': 1.0, 'sentiment_compound': 0.0
            })
        
        # Additional features
        features['url_count'] = float(len(re.findall(r'http\S+|www\.\S+', original_text)))
        features['number_count'] = float(len(re.findall(r'\d+', original_text)))
        features['quoted_text_ratio'] = float(original_text.count('"') / max(len(original_text), 1))
        
        # Personal pronouns
        first_person = ['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']
        second_person = ['you', 'your', 'yours']
        features['first_person_count'] = float(sum(1 for word in words if word.lower() in first_person))
        features['second_person_count'] = float(sum(1 for word in words if word.lower() in second_person))
        
        # Superlatives
        superlatives = ['best', 'worst', 'most', 'least', 'greatest', 'biggest', 'smallest']
        features['superlative_count'] = float(sum(1 for word in words if word.lower() in superlatives))
        
        # Temporal urgency
        urgent_temporal = ['now', 'today', 'immediately', 'urgent', 'breaking', 'just in']
        features['temporal_urgency'] = float(sum(1 for phrase in urgent_temporal if phrase in text.lower()))
        
        # Punctuation density
        punctuation_chars = '!?.,;:'
        punctuation_count = sum(1 for char in original_text if char in punctuation_chars)
        features['punctuation_density'] = float(punctuation_count / max(len(original_text), 1))
        
        return features
    
    def prepare_enhanced_features(self, df):
        """Prepare comprehensive feature set with advanced techniques"""
        logger.info("Preprocessing and extracting enhanced features...")
        
        # Advanced preprocessing
        df['cleaned_text'] = df['text'].apply(self.advanced_text_preprocessing)
        
        # Extract advanced linguistic features
        linguistic_features = []
        for idx, row in df.iterrows():
            features = self.extract_advanced_features(row['cleaned_text'], row['text'])
            linguistic_features.append(features)
        
        linguistic_df = pd.DataFrame(linguistic_features)
        self.feature_names = list(linguistic_df.columns)
        
        # Handle NaN values
        linguistic_df = linguistic_df.fillna(0.0)
        
        # Robust scaling for linguistic features
        if self.scaler is None:
            self.scaler = RobustScaler()
            linguistic_scaled = self.scaler.fit_transform(linguistic_df.values)
        else:
            linguistic_scaled = self.scaler.transform(linguistic_df.values)
        
        # TF-IDF vectorization
        if self.tfidf_vectorizer is None:
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=self.config['tfidf_max_features'],
                ngram_range=self.config['tfidf_ngram_range'],
                min_df=2,
                max_df=0.8,
                stop_words='english',
                sublinear_tf=True,
                norm='l2'
            )
            tfidf_features = self.tfidf_vectorizer.fit_transform(df['cleaned_text'])
        else:
            tfidf_features = self.tfidf_vectorizer.transform(df['cleaned_text'])
        
        tfidf_dense = tfidf_features.toarray()
        
        # Count vectorization for additional features
        if self.count_vectorizer is None:
            self.count_vectorizer = CountVectorizer(
                max_features=500,
                ngram_range=(1, 2),
                min_df=2,
                max_df=0.8,
                stop_words='english'
            )
            count_features = self.count_vectorizer.fit_transform(df['cleaned_text'])
        else:
            count_features = self.count_vectorizer.transform(df['cleaned_text'])
        
        count_dense = count_features.toarray()
        
        # Combine all features
        combined_features = np.hstack([tfidf_dense, count_dense, linguistic_scaled])
        
        # Apply PCA for dimensionality reduction if configured
        if self.config['use_pca']:
            if self.pca is None:
                self.pca = PCA(
                    n_components=self.config['pca_components'],
                    random_state=self.config['random_state']
                )
                combined_features = self.pca.fit_transform(combined_features)
            else:
                combined_features = self.pca.transform(combined_features)
            
            logger.info(f"PCA applied: {combined_features.shape[1]} components retained")
        
        logger.info(f"Final feature shape: {combined_features.shape}")
        
        return combined_features, df['label'].values
    
    def create_advanced_models(self):
        """Create advanced model ensemble with hyperparameter tuning"""
        models = {
            'Multinomial_NB': MultinomialNB(),
            'Logistic_Regression': LogisticRegression(
                random_state=self.config['random_state'],
                max_iter=2000,
                class_weight='balanced'
            ),
            'Random_Forest': RandomForestClassifier(
                n_estimators=100,
                random_state=self.config['random_state'],
                class_weight='balanced',
                max_depth=15
            ),
            'SVM': SVC(
                random_state=self.config['random_state'],
                class_weight='balanced',
                probability=True,
                kernel='rbf'
            ),
            'Gradient_Boosting': GradientBoostingClassifier(
                n_estimators=100,
                random_state=self.config['random_state'],
                learning_rate=0.1
            )
        }
        
        return models
    
    def train_with_cross_validation(self, X_train, y_train):
        """Train models with cross-validation and hyperparameter tuning"""
        logger.info("Training models with cross-validation...")
        
        models = self.create_advanced_models()
        
        for name, model in models.items():
            logger.info(f"Training {name}...")
            
            try:
                # Cross-validation
                cv_scores = cross_val_score(
                    model, X_train, y_train,
                    cv=self.config['cross_validation_folds'],
                    scoring='accuracy',
                    n_jobs=-1
                )
                
                # Train final model
                model.fit(X_train, y_train)
                self.models[name] = model
                
                # Store training metrics
                train_accuracy = accuracy_score(y_train, model.predict(X_train))
                
                training_info = {
                    'model': name,
                    'cv_mean': cv_scores.mean(),
                    'cv_std': cv_scores.std(),
                    'train_accuracy': train_accuracy
                }
                self.training_history.append(training_info)
                
                logger.info(f"{name} - CV: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
                
            except Exception as e:
                logger.error(f"Error training {name}: {str(e)}")
                continue
        
        # Create ensemble model
        if len(self.models) >= 3:
            self._create_ensemble_model()
        
        return len(self.models) > 0
    
    def _create_ensemble_model(self):
        """Create voting ensemble from trained models"""
        try:
            estimators = [(name, model) for name, model in self.models.items()]
            
            self.ensemble_model = VotingClassifier(
                estimators=estimators,
                voting='soft'
            )
            
            # Note: ensemble is already "fitted" since individual models are fitted
            logger.info("Ensemble model created successfully")
            
        except Exception as e:
            logger.error(f"Error creating ensemble: {str(e)}")
    
    def evaluate_comprehensive(self, X_test, y_test):
        """Comprehensive model evaluation with multiple metrics"""
        logger.info("Comprehensive model evaluation...")
        
        if not self.models:
            logger.error("No models available for evaluation!")
            return {}
        
        results = {}
        
        for name, model in self.models.items():
            try:
                y_pred = model.predict(X_test)
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                
                # Multiple metrics
                accuracy = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                auc_score = roc_auc_score(y_test, y_pred_proba)
                
                results[name] = {
                    'accuracy': accuracy,
                    'f1_score': f1,
                    'auc_score': auc_score,
                    'predictions': y_pred,
                    'probabilities': y_pred_proba,
                    'classification_report': classification_report(y_test, y_pred, zero_division=0)
                }
                
                logger.info(f"{name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}, AUC: {auc_score:.4f}")
                
            except Exception as e:
                logger.error(f"Error evaluating {name}: {str(e)}")
                continue
        
        # Evaluate ensemble if available
        if self.ensemble_model:
            try:
                y_pred = self.ensemble_model.predict(X_test)
                y_pred_proba = self.ensemble_model.predict_proba(X_test)[:, 1]
                
                accuracy = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                auc_score = roc_auc_score(y_test, y_pred_proba)
                
                results['Ensemble'] = {
                    'accuracy': accuracy,
                    'f1_score': f1,
                    'auc_score': auc_score,
                    'predictions': y_pred,
                    'probabilities': y_pred_proba,
                    'classification_report': classification_report(y_test, y_pred, zero_division=0)
                }
                
                logger.info(f"Ensemble - Accuracy: {accuracy:.4f}, F1: {f1:.4f}, AUC: {auc_score:.4f}")
                
            except Exception as e:
                logger.error(f"Error evaluating ensemble: {str(e)}")
        
        return results
    
    def predict_with_confidence(self, text):
        """Make predictions with confidence intervals and explanations"""
        if not self.models:
            return {"error": "No models available for prediction"}
        
        try:
            # Prepare features
            temp_df = pd.DataFrame({'text': [text], 'label': [0]})
            X, _ = self.prepare_enhanced_features(temp_df)
            
            predictions = {}
            all_probabilities = []
            
            # Individual model predictions
            for name, model in self.models.items():
                try:
                    pred = model.predict(X)[0]
                    proba = model.predict_proba(X)[0]
                    
                    prediction_label = 'Real' if pred == 1 else 'Fake'
                    confidence = float(max(proba))
                    fake_prob = float(proba[0])
                    real_prob = float(proba[1])
                    
                    predictions[name] = {
                        'prediction': prediction_label,
                        'confidence': confidence,
                        'fake_probability': fake_prob,
                        'real_probability': real_prob
                    }
                    
                    all_probabilities.append([fake_prob, real_prob])
                    
                except Exception as e:
                    logger.error(f"Error in prediction with {name}: {str(e)}")
                    continue
            
            # Ensemble prediction if available
            if self.ensemble_model:
                try:
                    pred = self.ensemble_model.predict(X)[0]
                    proba = self.ensemble_model.predict_proba(X)[0]
                    
                    prediction_label = 'Real' if pred == 1 else 'Fake'
                    confidence = float(max(proba))
                    fake_prob = float(proba[0])
                    real_prob = float(proba[1])
                    
                    predictions['Ensemble'] = {
                        'prediction': prediction_label,
                        'confidence': confidence,
                        'fake_probability': fake_prob,
                        'real_probability': real_prob
                    }
                    
                except Exception as e:
                    logger.error(f"Error in ensemble prediction: {str(e)}")
            
            # Calculate consensus
            if all_probabilities:
                avg_probs = np.mean(all_probabilities, axis=0)
                consensus_pred = 'Real' if avg_probs[1] > avg_probs[0] else 'Fake'
                consensus_confidence = float(max(avg_probs))
                
                predictions['Consensus'] = {
                    'prediction': consensus_pred,
                    'confidence': consensus_confidence,
                    'fake_probability': float(avg_probs[0]),
                    'real_probability': float(avg_probs[1])
                }
            
            # Add feature analysis
            try:
                features = self.extract_advanced_features(
                    self.advanced_text_preprocessing(text), text
                )
                predictions['feature_analysis'] = self._analyze_key_features(features)
            except:
                pass
            
            return predictions
            
        except Exception as e:
            return {"error": f"Prediction failed: {str(e)}"}
    
    def _analyze_key_features(self, features):
        """Analyze key features that influence the prediction"""
        analysis = {}
        
        # Analyze key indicators
        if features['emotional_word_ratio'] > 0.1:
            analysis['high_emotional_content'] = True
        
        if features['clickbait_count'] > 0:
            analysis['clickbait_detected'] = True
        
        if features['credibility_indicators'] > 2:
            analysis['high_credibility_indicators'] = True
        
        if features['caps_word_count'] > 3:
            analysis['excessive_capitalization'] = True
        
        if features['exclamation_ratio'] > 1:
            analysis['excessive_exclamations'] = True
        
        # Readability analysis
        if features['reading_ease'] < 30:
            analysis['very_difficult_to_read'] = True
        elif features['reading_ease'] > 90:
            analysis['very_easy_to_read'] = True
        
        # Sentiment analysis
        if features['sentiment_compound'] > 0.5:
            analysis['highly_positive_sentiment'] = True
        elif features['sentiment_compound'] < -0.5:
            analysis['highly_negative_sentiment'] = True
        
        return analysis
    
    def generate_explanation(self, text, predictions):
        """Generate human-readable explanation for the prediction"""
        try:
            explanation = {
                'summary': '',
                'key_indicators': [],
                'confidence_level': '',
                'recommendation': ''
            }
            
            # Get consensus prediction
            if 'Consensus' in predictions:
                main_pred = predictions['Consensus']['prediction']
                main_conf = predictions['Consensus']['confidence']
            elif 'Ensemble' in predictions:
                main_pred = predictions['Ensemble']['prediction']
                main_conf = predictions['Ensemble']['confidence']
            else:
                # Use most common prediction
                preds = [p['prediction'] for p in predictions.values() if 'prediction' in p]
                main_pred = max(set(preds), key=preds.count) if preds else 'Unknown'
                main_conf = 0.5
            
            # Summary
            explanation['summary'] = f"This article is classified as {main_pred.upper()} news with {main_conf:.1%} confidence."
            
            # Confidence level
            if main_conf >= 0.9:
                explanation['confidence_level'] = 'Very High'
            elif main_conf >= 0.75:
                explanation['confidence_level'] = 'High'
            elif main_conf >= 0.6:
                explanation['confidence_level'] = 'Moderate'
            else:
                explanation['confidence_level'] = 'Low'
            
            # Key indicators from feature analysis
            if 'feature_analysis' in predictions:
                analysis = predictions['feature_analysis']
                
                if analysis.get('high_emotional_content'):
                    explanation['key_indicators'].append("Contains high emotional/sensational language")
                
                if analysis.get('clickbait_detected'):
                    explanation['key_indicators'].append("Shows clickbait patterns")
                
                if analysis.get('high_credibility_indicators'):
                    explanation['key_indicators'].append("Contains credible source references")
                
                if analysis.get('excessive_capitalization'):
                    explanation['key_indicators'].append("Uses excessive capitalization")
                
                if analysis.get('excessive_exclamations'):
                    explanation['key_indicators'].append("Contains excessive exclamation marks")
            
            # Recommendation
            if main_pred == 'Fake' and main_conf > 0.7:
                explanation['recommendation'] = "Exercise extreme caution. Verify through multiple reliable sources before sharing."
            elif main_pred == 'Fake':
                explanation['recommendation'] = "Be skeptical. Cross-check with established news sources."
            elif main_pred == 'Real' and main_conf > 0.8:
                explanation['recommendation'] = "Appears to be legitimate news, but still verify if sharing important information."
            else:
                explanation['recommendation'] = "Uncertain classification. Verify through reliable sources."
            
            return explanation
            
        except Exception as e:
            return {"error": f"Could not generate explanation: {str(e)}"}
    
    def save_enhanced_model(self, filepath):
        """Save all components of the enhanced model"""
        model_data = {
            'models': self.models,
            'ensemble_model': self.ensemble_model,
            'tfidf_vectorizer': self.tfidf_vectorizer,
            'count_vectorizer': self.count_vectorizer,
            'scaler': self.scaler,
            'pca': self.pca,
            'feature_names': self.feature_names,
            'config': self.config,
            'training_history': self.training_history,
            'feature_importance': self.feature_importance
        }
        
        try:
            # Create directory if it doesn't exist
            Path(filepath).parent.mkdir(parents=True, exist_ok=True)
            
            with open(filepath, 'wb') as f:
                pickle.dump(model_data, f, protocol=pickle.HIGHEST_PROTOCOL)
            
            logger.info(f"Enhanced model saved to {filepath}")
            
            # Save metadata separately
            metadata_path = filepath.replace('.pkl', '_metadata.json')
            metadata = {
                'timestamp': datetime.now().isoformat(),
                'model_count': len(self.models),
                'feature_count': len(self.feature_names),
                'config': self.config,
                'training_history': self.training_history
            }
            
            with open(metadata_path, 'w') as f:
                json.dump(metadata, f, indent=2, default=str)
            
            return True
            
        except Exception as e:
            logger.error(f"Error saving enhanced model: {str(e)}")
            return False
    
    def load_enhanced_model(self, filepath):
        """Load the enhanced model with all components"""
        try:
            with open(filepath, 'rb') as f:
                model_data = pickle.load(f)
            
            self.models = model_data.get('models', {})
            self.ensemble_model = model_data.get('ensemble_model')
            self.tfidf_vectorizer = model_data.get('tfidf_vectorizer')
            self.count_vectorizer = model_data.get('count_vectorizer')
            self.scaler = model_data.get('scaler')
            self.pca = model_data.get('pca')
            self.feature_names = model_data.get('feature_names', [])
            self.config.update(model_data.get('config', {}))
            self.training_history = model_data.get('training_history', [])
            self.feature_importance = model_data.get('feature_importance', {})
            
            logger.info(f"Enhanced model loaded from {filepath}")
            logger.info(f"Loaded {len(self.models)} individual models")
            
            return True
            
        except Exception as e:
            logger.error(f"Error loading enhanced model: {str(e)}")
            return False
    
    def create_visualization_report(self, results, output_dir='reports'):
        """Create comprehensive visualization report"""
        try:
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            
            # Model comparison plot
            fig, axes = plt.subplots(2, 2, figsize=(15, 12))
            
            # Accuracy comparison
            models = list(results.keys())
            accuracies = [results[model]['accuracy'] for model in models]
            
            axes[0, 0].bar(models, accuracies, color='skyblue')
            axes[0, 0].set_title('Model Accuracy Comparison')
            axes[0, 0].set_ylabel('Accuracy')
            axes[0, 0].tick_params(axis='x', rotation=45)
            
            # F1-Score comparison
            f1_scores = [results[model]['f1_score'] for model in models]
            axes[0, 1].bar(models, f1_scores, color='lightcoral')
            axes[0, 1].set_title('Model F1-Score Comparison')
            axes[0, 1].set_ylabel('F1-Score')
            axes[0, 1].tick_params(axis='x', rotation=45)
            
            # AUC comparison
            auc_scores = [results[model]['auc_score'] for model in models]
            axes[1, 0].bar(models, auc_scores, color='lightgreen')
            axes[1, 0].set_title('Model AUC Score Comparison')
            axes[1, 0].set_ylabel('AUC Score')
            axes[1, 0].tick_params(axis='x', rotation=45)
            
            # Training history (if available)
            if self.training_history:
                cv_means = [item['cv_mean'] for item in self.training_history]
                cv_stds = [item['cv_std'] for item in self.training_history]
                model_names = [item['model'] for item in self.training_history]
                
                axes[1, 1].errorbar(range(len(model_names)), cv_means, yerr=cv_stds, 
                                  marker='o', capsize=5, capthick=2)
                axes[1, 1].set_title('Cross-Validation Scores')
                axes[1, 1].set_ylabel('CV Score')
                axes[1, 1].set_xticks(range(len(model_names)))
                axes[1, 1].set_xticklabels(model_names, rotation=45)
            
            plt.tight_layout()
            plt.savefig(f'{output_dir}/model_comparison.png', dpi=300, bbox_inches='tight')
            plt.close()
            
            # Generate HTML report
            self._generate_html_report(results, output_dir)
            
            logger.info(f"Visualization report saved to {output_dir}/")
            return True
            
        except Exception as e:
            logger.error(f"Error creating visualization report: {str(e)}")
            return False
    
    def _generate_html_report(self, results, output_dir):
        """Generate HTML report with detailed analysis"""
        html_content = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Fake News Detection Report</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 40px; }}
                .header {{ background-color: #f0f8ff; padding: 20px; border-radius: 10px; }}
                .model-results {{ margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }}
                .metric {{ display: inline-block; margin: 10px; padding: 10px; background-color: #f9f9f9; border-radius: 5px; }}
                .timestamp {{ color: #666; font-style: italic; }}
            </style>
        </head>
        <body>
            <div class="header">
                <h1>Enhanced Fake News Detection System Report</h1>
                <p class="timestamp">Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            </div>
            
            <h2>Model Performance Summary</h2>
        """
        
        for model_name, result in results.items():
            html_content += f"""
            <div class="model-results">
                <h3>{model_name}</h3>
                <div class="metric">
                    <strong>Accuracy:</strong> {result['accuracy']:.4f}
                </div>
                <div class="metric">
                    <strong>F1-Score:</strong> {result['f1_score']:.4f}
                </div>
                <div class="metric">
                    <strong>AUC Score:</strong> {result['auc_score']:.4f}
                </div>
            </div>
            """
        
        html_content += """
            <h2>Training Configuration</h2>
            <ul>
        """
        
        for key, value in self.config.items():
            html_content += f"<li><strong>{key}:</strong> {value}</li>"
        
        html_content += """
            </ul>
            </body>
            </html>
        """
        
        with open(f'{output_dir}/report.html', 'w') as f:
            f.write(html_content)

def main_enhanced():
    """Enhanced main execution with comprehensive features"""
    print("=" * 60)
    print("ENHANCED FAKE NEWS DETECTION SYSTEM V2.0")
    print("=" * 60)
    
    try:
        # Initialize enhanced detector
        detector = EnhancedFakeNewsDetector()
        
        # Create comprehensive dataset
        logger.info("Creating comprehensive dataset...")
        df = detector.create_comprehensive_dataset()
        
        # Prepare enhanced features
        X, y = detector.prepare_enhanced_features(df)
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=detector.config['test_size'],
            random_state=detector.config['random_state'],
            stratify=y
        )
        
        logger.info(f"Training set: {len(X_train)} samples")
        logger.info(f"Test set: {len(X_test)} samples")
        
        # Train with cross-validation
        success = detector.train_with_cross_validation(X_train, y_train)
        
        if not success:
            logger.error("Training failed. Exiting...")
            return None
        
        # Comprehensive evaluation
        results = detector.evaluate_comprehensive(X_test, y_test)
        
        # Create visualization report
        detector.create_visualization_report(results)
        
        # Test with enhanced predictions
        print("\n" + "="*60)
        print("TESTING WITH ENHANCED PREDICTIONS")
        print("="*60)
        
        test_articles = [
            "Scientists at MIT have developed a revolutionary new battery technology according to peer-reviewed research published in Nature journal.",
            "BREAKING: Government officials confirm alien contact, world leaders to make SHOCKING announcement tomorrow!!!",
            "Local hospital reports successful treatment outcomes with new COVID-19 therapy in controlled clinical trial.",
            "DOCTORS DON'T WANT YOU TO KNOW this ancient herbal remedy cures everything instantly!",
            "Federal Reserve announces interest rate changes following comprehensive economic analysis by board members."
        ]
        
        for i, article in enumerate(test_articles, 1):
            print(f"\nTest Article {i}: {article[:80]}...")
            predictions = detector.predict_with_confidence(article)
            explanation = detector.generate_explanation(article, predictions)
            
            if "error" in predictions:
                print(f"Error: {predictions['error']}")
            else:
                print("\nPredictions:")
                for model_name, result in predictions.items():
                    if 'prediction' in result:
                        print(f"  {model_name}: {result['prediction']} "
                              f"(Confidence: {result['confidence']:.3f})")
                
                print(f"\nExplanation: {explanation.get('summary', 'N/A')}")
                print(f"Confidence Level: {explanation.get('confidence_level', 'N/A')}")
                if explanation.get('key_indicators'):
                    print("Key Indicators:")
                    for indicator in explanation['key_indicators']:
                        print(f"  - {indicator}")
                print(f"Recommendation: {explanation.get('recommendation', 'N/A')}")
        
        # Save enhanced model
        detector.save_enhanced_model('enhanced_fake_news_detector_v2.pkl')
        
        print("\n" + "="*60)
        print("ENHANCED SYSTEM TRAINING AND TESTING COMPLETE!")
        print("="*60)
        
        return detector
        
    except Exception as e:
        logger.error(f"System error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    detector = main_enhanced()

2025-09-06 23:12:36,601 - INFO - Creating comprehensive dataset...
2025-09-06 23:12:36,606 - INFO - Dataset created: 50 articles
2025-09-06 23:12:36,607 - INFO - Real news: 25, Fake news: 25
2025-09-06 23:12:36,609 - INFO - Preprocessing and extracting enhanced features...


ENHANCED FAKE NEWS DETECTION SYSTEM V2.0


2025-09-06 23:12:38,999 - INFO - PCA applied: 30 components retained
2025-09-06 23:12:39,000 - INFO - Final feature shape: (50, 30)
2025-09-06 23:12:39,002 - INFO - Training set: 40 samples
2025-09-06 23:12:39,003 - INFO - Test set: 10 samples
2025-09-06 23:12:39,004 - INFO - Training models with cross-validation...
2025-09-06 23:12:39,005 - INFO - Training Multinomial_NB...
2025-09-06 23:12:44,255 - ERROR - Error training Multinomial_NB: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\admin\anaconda3\envs\Gen_AI\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\admin\anaconda3\env


TESTING WITH ENHANCED PREDICTIONS

Test Article 1: Scientists at MIT have developed a revolutionary new battery technology accordin...

Predictions:
  Logistic_Regression: Real (Confidence: 0.992)
  Random_Forest: Real (Confidence: 0.710)
  SVM: Real (Confidence: 0.984)
  Gradient_Boosting: Real (Confidence: 1.000)
  Consensus: Real (Confidence: 0.921)

Explanation: This article is classified as REAL news with 92.1% confidence.
Confidence Level: Very High
Recommendation: Appears to be legitimate news, but still verify if sharing important information.

Test Article 2: BREAKING: Government officials confirm alien contact, world leaders to make SHOC...

Predictions:
  Logistic_Regression: Fake (Confidence: 0.990)
  Random_Forest: Fake (Confidence: 0.760)
  SVM: Fake (Confidence: 0.980)
  Gradient_Boosting: Fake (Confidence: 1.000)
  Consensus: Fake (Confidence: 0.933)

Explanation: This article is classified as FAKE news with 93.3% confidence.
Confidence Level: Very High
Key Indicators:

In [1]:
# Step 1: Install required packages and run the enhanced system
# Copy this code and run it in your Python environment

# First, install additional required packages:
"""
pip install textstat
pip install matplotlib
pip install seaborn
"""

# Then run the enhanced system
from enhanced_fake_news_detector_v2 import main_enhanced

if __name__ == "__main__":
    print("🚀 Running Enhanced Fake News Detection System v2.0...")
    detector = main_enhanced()
    
    if detector:
        print("\n✅ System initialized successfully!")
        print("📊 Check the 'reports' folder for visualization reports")
        print("💾 Model saved as 'enhanced_fake_news_detector_v2.pkl'")
    else:
        print("❌ System failed to initialize")

2025-09-06 23:53:33,350 - INFO - Creating comprehensive dataset...
2025-09-06 23:53:33,353 - INFO - Dataset created: 50 articles
2025-09-06 23:53:33,354 - INFO - Real news: 25, Fake news: 25
2025-09-06 23:53:33,355 - INFO - Preprocessing and extracting enhanced features...


🚀 Running Enhanced Fake News Detection System v2.0...
ENHANCED FAKE NEWS DETECTION SYSTEM V2.0


2025-09-06 23:53:34,314 - INFO - PCA applied: 30 components retained
2025-09-06 23:53:34,315 - INFO - Final feature shape: (50, 30)
2025-09-06 23:53:34,318 - INFO - Training set: 40 samples
2025-09-06 23:53:34,319 - INFO - Test set: 10 samples
2025-09-06 23:53:34,320 - INFO - Training models with cross-validation...
2025-09-06 23:53:34,321 - INFO - Training Multinomial_NB...
2025-09-06 23:53:41,131 - ERROR - Error training Multinomial_NB: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\admin\anaconda3\envs\Gen_AI\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\admin\anaconda3\env


TESTING WITH ENHANCED PREDICTIONS

Test Article 1: Scientists at MIT have developed a revolutionary new battery technology accordin...

Predictions:
  Logistic_Regression: Real (Confidence: 0.992)
  Random_Forest: Real (Confidence: 0.710)
  SVM: Real (Confidence: 0.984)
  Gradient_Boosting: Real (Confidence: 1.000)
  Consensus: Real (Confidence: 0.921)

Explanation: This article is classified as REAL news with 92.1% confidence.
Confidence Level: Very High
Recommendation: Appears to be legitimate news, but still verify if sharing important information.

Test Article 2: BREAKING: Government officials confirm alien contact, world leaders to make SHOC...

Predictions:
  Logistic_Regression: Fake (Confidence: 0.990)
  Random_Forest: Fake (Confidence: 0.760)
  SVM: Fake (Confidence: 0.980)
  Gradient_Boosting: Fake (Confidence: 1.000)
  Consensus: Fake (Confidence: 0.933)

Explanation: This article is classified as FAKE news with 93.3% confidence.
Confidence Level: Very High
Key Indicators:

In [3]:
# Step 2: Test the enhanced system with real articles
from enhanced_fake_news_detector_v2 import EnhancedFakeNewsDetector
import json

def test_with_real_articles():
    """Test the system with various types of articles"""
    
    # Load the trained model
    detector = EnhancedFakeNewsDetector()
    success = detector.load_enhanced_model('enhanced_fake_news_detector_v2.pkl')
    
    if not success:
        print("❌ Could not load the model. Please run the training first.")
        return
    
    # Test articles from different categories
    test_articles = [
        {
            'title': 'Credible Scientific Article',
            'text': 'Researchers at Harvard Medical School published a peer-reviewed study in the Journal of Medical Research showing that a Mediterranean diet may reduce cardiovascular disease risk by 30% in a randomized controlled trial with 7,500 participants over five years.'
        },
        {
            'title': 'Obvious Fake News',
            'text': 'BREAKING!!! Scientists SHOCKED by this ONE WEIRD TRICK that INSTANTLY cures diabetes! Doctors HATE this simple method discovered by a grandmother! Big Pharma is trying to HIDE this MIRACLE cure! Click NOW before it\'s BANNED!!!'
        },
        {
            'title': 'Neutral News Report',
            'text': 'The Federal Reserve announced today a decision to maintain current interest rates at 5.25% following their monthly meeting. The decision was based on recent economic indicators including employment data and inflation measurements, according to Fed Chairman statements.'
        },
        {
            'title': 'Suspicious Health Claim',
            'text': 'INCREDIBLE discovery: This ancient herb from the Amazon rainforest can allegedly cure ANY disease within 24 hours! Thousands of people are claiming AMAZING results! The medical establishment doesn\'t want you to know about this POWERFUL secret!'
        },
        {
            'title': 'Legitimate Technology News',
            'text': 'Apple Inc. reported quarterly earnings that exceeded analyst expectations, with iPhone sales contributing to a 12% revenue increase compared to the same period last year. The company also announced plans to expand manufacturing operations in Southeast Asia.'
        }
    ]
    
    print("🔍 Testing Enhanced Fake News Detection System")
    print("=" * 60)
    
    for i, article in enumerate(test_articles, 1):
        print(f"\n📰 Test Article {i}: {article['title']}")
        print(f"Text: {article['text'][:100]}...")
        
        # Get predictions
        predictions = detector.predict_with_confidence(article['text'])
        explanation = detector.generate_explanation(article['text'], predictions)
        
        if "error" in predictions:
            print(f"❌ Error: {predictions['error']}")
            continue
        
        # Display results
        print("\n🤖 AI Predictions:")
        for model_name, result in predictions.items():
            if 'prediction' in result:
                emoji = "✅" if result['prediction'] == 'Real' else "⚠️"
                print(f"  {emoji} {model_name}: {result['prediction']} "
                      f"({result['confidence']:.1%} confidence)")
        
        # Display explanation
        if explanation and not 'error' in explanation:
            print(f"\n💡 Analysis: {explanation.get('summary', 'N/A')}")
            print(f"🎯 Confidence Level: {explanation.get('confidence_level', 'N/A')}")
            
            if explanation.get('key_indicators'):
                print("🔍 Key Indicators:")
                for indicator in explanation['key_indicators']:
                    print(f"   • {indicator}")
            
            print(f"💬 Recommendation: {explanation.get('recommendation', 'N/A')}")
        
        print("-" * 60)

def interactive_testing():
    """Interactive testing where user can input their own articles"""
    
    detector = EnhancedFakeNewsDetector()
    success = detector.load_enhanced_model('enhanced_fake_news_detector_v2.pkl')
    
    if not success:
        print("❌ Could not load the model. Please run the training first.")
        return
    
    print("\n🎮 Interactive Fake News Detection")
    print("Enter articles to analyze (type 'quit' to exit)")
    print("=" * 50)
    
    while True:
        user_input = input("\n📝 Enter article text: ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q']:
            break
        
        if len(user_input) < 10:
            print("⚠️ Please enter a longer article (at least 10 characters)")
            continue
        
        # Analyze the article
        predictions = detector.predict_with_confidence(user_input)
        explanation = detector.generate_explanation(user_input, predictions)
        
        if "error" in predictions:
            print(f"❌ Error: {predictions['error']}")
            continue
        
        # Show results
        print("\n🔍 Analysis Results:")
        
        # Get consensus prediction
        if 'Consensus' in predictions:
            main_result = predictions['Consensus']
            emoji = "✅" if main_result['prediction'] == 'Real' else "⚠️"
            print(f"{emoji} Overall Assessment: {main_result['prediction']} "
                  f"({main_result['confidence']:.1%} confidence)")
        
        # Show explanation
        if explanation and 'error' not in explanation:
            print(f"💡 {explanation.get('summary', '')}")
            print(f"💬 {explanation.get('recommendation', '')}")

if __name__ == "__main__":
    # Run automated tests
    test_with_real_articles()
    
    # Uncomment the line below for interactive testing
    # interactive_testing()

2025-09-06 23:54:07,223 - INFO - Enhanced model loaded from enhanced_fake_news_detector_v2.pkl
2025-09-06 23:54:07,225 - INFO - Loaded 4 individual models
2025-09-06 23:54:07,227 - INFO - Preprocessing and extracting enhanced features...
2025-09-06 23:54:07,237 - INFO - PCA applied: 30 components retained
2025-09-06 23:54:07,238 - INFO - Final feature shape: (1, 30)
2025-09-06 23:54:07,249 - ERROR - Error in ensemble prediction: This VotingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
2025-09-06 23:54:07,252 - INFO - Preprocessing and extracting enhanced features...
2025-09-06 23:54:07,259 - INFO - PCA applied: 30 components retained
2025-09-06 23:54:07,260 - INFO - Final feature shape: (1, 30)
2025-09-06 23:54:07,272 - ERROR - Error in ensemble prediction: This VotingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
2025-09-06 23:54:07,275 - INFO - Preprocessing and extr

🔍 Testing Enhanced Fake News Detection System

📰 Test Article 1: Credible Scientific Article
Text: Researchers at Harvard Medical School published a peer-reviewed study in the Journal of Medical Rese...

🤖 AI Predictions:
  ✅ Logistic_Regression: Real (100.0% confidence)
  ✅ Random_Forest: Real (76.0% confidence)
  ✅ SVM: Real (77.1% confidence)
  ✅ Gradient_Boosting: Real (100.0% confidence)
  ✅ Consensus: Real (88.3% confidence)

💡 Analysis: This article is classified as REAL news with 88.3% confidence.
🎯 Confidence Level: High
💬 Recommendation: Appears to be legitimate news, but still verify if sharing important information.
------------------------------------------------------------

📰 Test Article 2: Obvious Fake News
Text: BREAKING!!! Scientists SHOCKED by this ONE WEIRD TRICK that INSTANTLY cures diabetes! Doctors HATE t...

🤖 AI Predictions:
  ⚠️ Logistic_Regression: Fake (99.9% confidence)
  ⚠️ Random_Forest: Fake (58.0% confidence)
  ⚠️ SVM: Fake (78.3% confidence)
  ⚠️ Grad

In [4]:
from enhanced_fake_news_detector_v2 import main_enhanced

print("🚀 Running Enhanced Fake News Detection System v2.0...")
detector = main_enhanced()

if detector:
    print("\n✅ System initialized successfully!")
else:
    print("❌ System failed to initialize")


2025-09-06 23:54:28,393 - INFO - Creating comprehensive dataset...
2025-09-06 23:54:28,396 - INFO - Dataset created: 50 articles
2025-09-06 23:54:28,397 - INFO - Real news: 25, Fake news: 25
2025-09-06 23:54:28,398 - INFO - Preprocessing and extracting enhanced features...
2025-09-06 23:54:28,510 - INFO - PCA applied: 30 components retained
2025-09-06 23:54:28,512 - INFO - Final feature shape: (50, 30)
2025-09-06 23:54:28,517 - INFO - Training set: 40 samples
2025-09-06 23:54:28,518 - INFO - Test set: 10 samples
2025-09-06 23:54:28,520 - INFO - Training models with cross-validation...
2025-09-06 23:54:28,522 - INFO - Training Multinomial_NB...
2025-09-06 23:54:28,565 - ERROR - Error training Multinomial_NB: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with 

🚀 Running Enhanced Fake News Detection System v2.0...
ENHANCED FAKE NEWS DETECTION SYSTEM V2.0


2025-09-06 23:54:28,567 - INFO - Training Logistic_Regression...
2025-09-06 23:54:28,609 - INFO - Logistic_Regression - CV: 1.0000 (±0.0000)
2025-09-06 23:54:28,611 - INFO - Training Random_Forest...
2025-09-06 23:54:29,749 - INFO - Random_Forest - CV: 1.0000 (±0.0000)
2025-09-06 23:54:29,751 - INFO - Training SVM...
2025-09-06 23:54:29,797 - INFO - SVM - CV: 1.0000 (±0.0000)
2025-09-06 23:54:29,798 - INFO - Training Gradient_Boosting...
2025-09-06 23:54:30,192 - INFO - Gradient_Boosting - CV: 1.0000 (±0.0000)
2025-09-06 23:54:30,193 - INFO - Ensemble model created successfully
2025-09-06 23:54:30,195 - INFO - Comprehensive model evaluation...
2025-09-06 23:54:30,208 - INFO - Logistic_Regression - Accuracy: 1.0000, F1: 1.0000, AUC: 1.0000
2025-09-06 23:54:30,234 - INFO - Random_Forest - Accuracy: 1.0000, F1: 1.0000, AUC: 1.0000
2025-09-06 23:54:30,251 - INFO - SVM - Accuracy: 1.0000, F1: 1.0000, AUC: 1.0000
2025-09-06 23:54:30,265 - INFO - Gradient_Boosting - Accuracy: 1.0000, F1: 1.00


TESTING WITH ENHANCED PREDICTIONS

Test Article 1: Scientists at MIT have developed a revolutionary new battery technology accordin...

Predictions:
  Logistic_Regression: Real (Confidence: 0.992)
  Random_Forest: Real (Confidence: 0.710)
  SVM: Real (Confidence: 0.984)
  Gradient_Boosting: Real (Confidence: 1.000)
  Consensus: Real (Confidence: 0.921)

Explanation: This article is classified as REAL news with 92.1% confidence.
Confidence Level: Very High
Recommendation: Appears to be legitimate news, but still verify if sharing important information.

Test Article 2: BREAKING: Government officials confirm alien contact, world leaders to make SHOC...

Predictions:
  Logistic_Regression: Fake (Confidence: 0.990)
  Random_Forest: Fake (Confidence: 0.760)
  SVM: Fake (Confidence: 0.980)
  Gradient_Boosting: Fake (Confidence: 1.000)
  Consensus: Fake (Confidence: 0.933)

Explanation: This article is classified as FAKE news with 93.3% confidence.
Confidence Level: Very High
Key Indicators: