In [1]:
# Fake News Detection System - End-to-End Implementation
# Step-by-step code for detecting AI-generated fake news

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import requests
from bs4 import BeautifulSoup
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

class FakeNewsDetector:
    def __init__(self):
        """Initialize the Fake News Detection System"""
        self.tfidf_vectorizer = None
        self.models = {}
        self.bert_classifier = None
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        
    def load_data(self, file_path=None, create_sample=True):
        """
        Step 1: Load and prepare the dataset
        """
        print("Step 1: Loading Dataset...")
        
        if create_sample and file_path is None:
            # Create sample data if no file provided
            sample_data = {
                'text': [
                    "Scientists have discovered a new planet that could support life",
                    "Breaking: Aliens have landed in New York City according to unnamed sources",
                    "COVID-19 vaccine shows 95% efficacy in clinical trials",
                    "Miracle cure found that eliminates all diseases instantly",
                    "Stock market reaches new highs amid economic recovery",
                    "Secret government plan to control minds through 5G towers revealed",
                    "Research shows meditation can reduce stress and anxiety",
                    "Local man claims he can predict the future with 100% accuracy"
                ],
                'label': [1, 0, 1, 0, 1, 0, 1, 0]  # 1 = Real, 0 = Fake
            }
            df = pd.DataFrame(sample_data)
        else:
            # Load from CSV file
            df = pd.read_csv(file_path)
            
        print(f"Dataset loaded: {len(df)} articles")
        print(f"Real news: {sum(df['label'])}, Fake news: {len(df) - sum(df['label'])}")
        
        return df
    
    def preprocess_text(self, text):
        """
        Step 2: Text preprocessing and feature extraction
        """
        if pd.isna(text):
            return ""
            
        # Convert to lowercase
        text = text.lower()
        
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Tokenize
        tokens = word_tokenize(text)
        
        # Remove stopwords and stem
        tokens = [self.stemmer.stem(token) for token in tokens if token not in self.stop_words]
        
        return ' '.join(tokens)
    
    def extract_linguistic_features(self, text):
        """
        Step 3: Extract linguistic and stylistic features
        """
        features = {}
        
        # Basic text statistics
        features['char_count'] = len(text)
        features['word_count'] = len(text.split())
        features['sentence_count'] = len(sent_tokenize(text))
        features['avg_word_length'] = np.mean([len(word) for word in text.split()])
        
        # Punctuation features
        features['exclamation_count'] = text.count('!')
        features['question_count'] = text.count('?')
        features['capital_ratio'] = sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
        
        # Emotional indicators
        emotional_words = ['amazing', 'shocking', 'unbelievable', 'incredible', 'outrageous']
        features['emotional_word_count'] = sum(1 for word in emotional_words if word in text.lower())
        
        # Credibility indicators
        source_words = ['according to', 'sources say', 'reports suggest', 'study shows']
        features['source_mention_count'] = sum(1 for phrase in source_words if phrase in text.lower())
        
        return features
    
    def prepare_features(self, df):
        """
        Step 4: Prepare features for machine learning
        """
        print("Step 2-4: Preprocessing and Feature Extraction...")
        
        # Preprocess text
        df['cleaned_text'] = df['text'].apply(self.preprocess_text)
        
        # Extract linguistic features
        linguistic_features = []
        for text in df['text']:
            features = self.extract_linguistic_features(text)
            linguistic_features.append(features)
        
        linguistic_df = pd.DataFrame(linguistic_features)
        
        # TF-IDF vectorization
        if self.tfidf_vectorizer is None:
            self.tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
            tfidf_features = self.tfidf_vectorizer.fit_transform(df['cleaned_text'])
        else:
            tfidf_features = self.tfidf_vectorizer.transform(df['cleaned_text'])
        
        # Combine TF-IDF with linguistic features
        tfidf_dense = tfidf_features.toarray()
        combined_features = np.hstack([tfidf_dense, linguistic_df.values])
        
        return combined_features, df['label']
    
    def train_traditional_models(self, X_train, y_train):
        """
        Step 5: Train traditional ML models
        """
        print("Step 5: Training Traditional ML Models...")
        
        # Initialize models
        models = {
            'Naive Bayes': MultinomialNB(),
            'Logistic Regression': LogisticRegression(random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
        }
        
        # Train models
        for name, model in models.items():
            print(f"Training {name}...")
            model.fit(X_train, y_train)
            self.models[name] = model
            
        print("Traditional models trained successfully!")
    
    def setup_bert_classifier(self):
        """
        Step 6: Setup BERT-based classifier
        """
        print("Step 6: Setting up BERT Classifier...")
        
        try:
            # Use a pre-trained model for fake news detection
            model_name = "distilbert-base-uncased"
            self.bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
            
            # Create a text classification pipeline
            self.bert_classifier = pipeline(
                "text-classification",
                model=model_name,
                tokenizer=self.bert_tokenizer,
                device=0 if torch.cuda.is_available() else -1
            )
            print("BERT classifier setup complete!")
            
        except Exception as e:
            print(f"BERT setup failed: {e}")
            print("Continuing with traditional models only...")
    
    def evaluate_models(self, X_test, y_test):
        """
        Step 7: Evaluate model performance
        """
        print("Step 7: Evaluating Models...")
        
        results = {}
        
        for name, model in self.models.items():
            # Make predictions
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            results[name] = {
                'accuracy': accuracy,
                'predictions': y_pred,
                'classification_report': classification_report(y_test, y_pred)
            }
            
            print(f"\n{name} Results:")
            print(f"Accuracy: {accuracy:.4f}")
            print("Classification Report:")
            print(classification_report(y_test, y_pred))
        
        return results
    
    def predict_single_article(self, text):
        """
        Step 8: Make predictions on new articles
        """
        # Preprocess the text
        cleaned_text = self.preprocess_text(text)
        
        # Extract linguistic features
        linguistic_features = self.extract_linguistic_features(text)
        linguistic_array = np.array(list(linguistic_features.values())).reshape(1, -1)
        
        # TF-IDF features
        tfidf_features = self.tfidf_vectorizer.transform([cleaned_text])
        
        # Combine features
        combined_features = np.hstack([tfidf_features.toarray(), linguistic_array])
        
        # Get predictions from all models
        predictions = {}
        for name, model in self.models.items():
            pred = model.predict(combined_features)[0]
            prob = model.predict_proba(combined_features)[0]
            predictions[name] = {
                'prediction': 'Real' if pred == 1 else 'Fake',
                'confidence': max(prob)
            }
        
        return predictions
    
    def scrape_news_article(self, url):
        """
        Step 9: Web scraping utility for real-time detection
        """
        try:
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract text from common news article tags
            article_text = ""
            for tag in ['article', 'div[class*="content"]', 'p']:
                elements = soup.select(tag)
                for element in elements:
                    article_text += element.get_text() + " "
            
            return article_text.strip()
        
        except Exception as e:
            print(f"Error scraping article: {e}")
            return None
    
    def save_model(self, filepath):
        """
        Step 10: Save trained models
        """
        model_data = {
            'models': self.models,
            'tfidf_vectorizer': self.tfidf_vectorizer
        }
        
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        
        print(f"Models saved to {filepath}")
    
    def load_model(self, filepath):
        """
        Step 10: Load pre-trained models
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
        
        self.models = model_data['models']
        self.tfidf_vectorizer = model_data['tfidf_vectorizer']
        
        print(f"Models loaded from {filepath}")

# Main execution pipeline
def main():
    """
    Complete pipeline execution
    """
    print("=== FAKE NEWS DETECTION SYSTEM ===\n")
    
    # Initialize detector
    detector = FakeNewsDetector()
    
    # Step 1: Load data
    df = detector.load_data(create_sample=True)
    
    # Step 2-4: Prepare features
    X, y = detector.prepare_features(df)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Step 5: Train models
    detector.train_traditional_models(X_train, y_train)
    
    # Step 6: Setup BERT (optional)
    detector.setup_bert_classifier()
    
    # Step 7: Evaluate models
    results = detector.evaluate_models(X_test, y_test)
    
    # Step 8: Test with sample articles
    print("\n" + "="*50)
    print("TESTING WITH SAMPLE ARTICLES")
    print("="*50)
    
    test_articles = [
        "Scientists at MIT have developed a revolutionary new battery technology that could change everything",
        "BREAKING: Government officials confirm alien contact, world leaders to make announcement tomorrow",
        "Local hospital reports successful treatment of COVID-19 patients with new therapy"
    ]
    
    for i, article in enumerate(test_articles, 1):
        print(f"\nTest Article {i}: {article[:100]}...")
        predictions = detector.predict_single_article(article)
        
        for model_name, result in predictions.items():
            print(f"{model_name}: {result['prediction']} (Confidence: {result['confidence']:.3f})")
    
    # Step 10: Save models
    detector.save_model('fake_news_detector.pkl')
    
    print("\n" + "="*50)
    print("FAKE NEWS DETECTION SYSTEM COMPLETE!")
    print("="*50)

if __name__ == "__main__":
    main()


# Additional utility functions for deployment

class FakeNewsAPI:
    """
    Step 11: API wrapper for deployment
    """
    def __init__(self, model_path):
        self.detector = FakeNewsDetector()
        self.detector.load_model(model_path)
    
    def analyze_text(self, text):
        """Analyze text and return detailed results"""
        predictions = self.detector.predict_single_article(text)
        
        # Calculate ensemble prediction
        fake_votes = sum(1 for pred in predictions.values() if pred['prediction'] == 'Fake')
        real_votes = len(predictions) - fake_votes
        
        ensemble_prediction = 'Fake' if fake_votes > real_votes else 'Real'
        confidence = max(fake_votes, real_votes) / len(predictions)
        
        return {
            'text': text,
            'ensemble_prediction': ensemble_prediction,
            'confidence': confidence,
            'individual_models': predictions,
            'timestamp': datetime.now().isoformat()
        }
    
    def analyze_url(self, url):
        """Analyze article from URL"""
        text = self.detector.scrape_news_article(url)
        if text:
            return self.analyze_text(text)
        else:
            return {'error': 'Unable to scrape article content'}

# Example usage for deployment
"""
# Initialize API
api = FakeNewsAPI('fake_news_detector.pkl')

# Analyze text
result = api.analyze_text("Your news article text here...")
print(result)

# Analyze URL
result = api.analyze_url("https://example.com/news-article")
print(result)
"""

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== FAKE NEWS DETECTION SYSTEM ===

Step 1: Loading Dataset...
Dataset loaded: 8 articles
Real news: 4, Fake news: 4
Step 2-4: Preprocessing and Feature Extraction...
Step 5: Training Traditional ML Models...
Training Naive Bayes...
Training Logistic Regression...
Training Random Forest...
Traditional models trained successfully!
Step 6: Setting up BERT Classifier...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/distilbert-base-uncased/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


BERT classifier setup complete!
Step 7: Evaluating Models...

Naive Bayes Results:
Accuracy: 0.0000
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0


Logistic Regression Results:
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Random Forest Results:
Accuracy: 0.0000
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00 

'\n# Initialize API\napi = FakeNewsAPI(\'fake_news_detector.pkl\')\n\n# Analyze text\nresult = api.analyze_text("Your news article text here...")\nprint(result)\n\n# Analyze URL\nresult = api.analyze_url("https://example.com/news-article")\nprint(result)\n'

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

In [5]:
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data] Error downloading 'punkt_tab' from
[nltk_data]     <https://raw.githubusercontent.com/nltk/nltk_data/gh-
[nltk_data]     pages/packages/tokenizers/punkt_tab.zip>:   <urlopen
[nltk_data]     error [Errno 11001] getaddrinfo failed>


False

In [6]:
import nltk
print(nltk.data.path)


['C:\\Users\\admin/nltk_data', 'C:\\Users\\admin\\anaconda3\\envs\\Gen_AI\\nltk_data', 'C:\\Users\\admin\\anaconda3\\envs\\Gen_AI\\share\\nltk_data', 'C:\\Users\\admin\\anaconda3\\envs\\Gen_AI\\lib\\nltk_data', 'C:\\Users\\admin\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [7]:
import nltk

def ensure_nltk_data():
    resources = ["stopwords", "punkt", "punkt_tab"]
    for resource in resources:
        try:
            nltk.data.find(f"corpora/{resource}") if resource == "stopwords" else nltk.data.find(f"tokenizers/{resource}")
        except LookupError:
            print(f"Downloading {resource}...")
            nltk.download(resource)

# Call this at the start of your script
ensure_nltk_data()


Downloading punkt_tab...


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_name = "distilbert-base-uncased"
AutoTokenizer.from_pretrained(model_name, cache_dir="./models")
AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir="./models")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/distilbert-base-uncased/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /distilbert-base-uncased/resolve/main/model.safetensors (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001DF54C78CE0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 58358363-3d98-4502-bf8e-075c0abcf12a)')