In [None]:
# 1️⃣ Import Required Libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import warnings

# Ignore UserWarning-category warnings (meant for sklearn's) so output stays readable
warnings.filterwarnings(action='ignore', category=UserWarning)

# Step 2: choose the dataset file, writing a small built-in sample if it is absent
data_file = 'combined_news_data_cleaned.csv'

if not os.path.exists(data_file):
    print(f"Dataset '{data_file}' not found. Creating sample data...")

    # (headline, label) pairs: six 'real' and four 'fake' examples
    sample_rows = [
        ("Government announces new policies for education sector.", 'real'),
        ("Scientists discover breakthrough in renewable energy technology.", 'real'),
        ("Local community comes together to support flood victims.", 'real'),
        ("Breaking: Aliens have landed in downtown!", 'fake'),
        ("Shocking: Local man turns into werewolf every full moon.", 'fake'),
        ("Miracle cure discovered: Eating chocolate prevents all diseases.", 'fake'),
        ("Celebrity spotted at local grocery store buying groceries.", 'real'),
        ("New research shows reading books improves cognitive function.", 'real'),
        ("Weather forecast predicts sunny skies for the weekend.", 'real'),
        ("Breaking: Dinosaurs found alive in remote jungle.", 'fake'),
    ]
    sample_data = {
        'text': [headline for headline, _ in sample_rows],
        'label': [tag for _, tag in sample_rows],
    }
    df = pd.DataFrame(sample_data)

    # Persist the sample and point the loader at it instead of the missing file
    data_file = 'sample_news_data.csv'
    df.to_csv(data_file, index=False)
    print(f"Sample data created: {data_file}")

# Read the chosen CSV into a DataFrame; on any failure, report it and re-raise
try:
    data = pd.read_csv(data_file)
    record_count = len(data)
    print(f"Successfully loaded {record_count} records from {data_file}")
except Exception as load_error:
    print(f"Error loading data: {load_error!s}")
    raise

# Step 3: drop rows missing text or label, then drop exact duplicate rows
data = data.dropna(subset=['text', 'label']).drop_duplicates()

# A stratified split needs at least two rows of every class (one per side of
# the split); train_test_split raises ValueError otherwise. Remove singleton
# labels up front and report them instead of crashing further down.
label_counts = data['label'].value_counts()
rare_labels = label_counts[label_counts < 2].index
if len(rare_labels) > 0:
    print(f"Dropping {len(rare_labels)} label(s) with fewer than 2 samples: {list(rare_labels)}")
    # .copy() so the column assignment below writes to an independent frame
    data = data[~data['label'].isin(rare_labels)].copy()

# Training a classifier needs at least two classes; fail early with a clear
# message rather than with a less obvious error from the model fit.
if data['label'].nunique() < 2:
    raise ValueError(
        "Need at least two label classes with 2+ samples each to train; "
        f"found {data['label'].nunique()} after cleaning."
    )

# Step 4: map the original labels to integer ids (le keeps the mapping for decoding)
le = LabelEncoder()
data['label_encoded'] = le.fit_transform(data['label'])

# Step 5: 80/20 train/test split, stratified so both sides keep the class balance
X = data['text'].values
y = data['label_encoded'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: TF-IDF features; vocabulary and IDF weights are fitted on the training
# texts only, then reused unchanged for the test texts
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 7: logistic regression with balanced class weights
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train_vec, y_train)

# Step 8: evaluate on the held-out split.
# Decode the integer ids back to the original labels before reporting so the
# report rows show the dataset's own label values instead of opaque encoder ids.
y_pred = model.predict(X_test_vec)
print(classification_report(
    le.inverse_transform(y_test),
    le.inverse_transform(y_pred),
    zero_division=0,
))
print("Accuracy:", accuracy_score(y_test, y_pred))

# 9Ô∏è‚É£ Test Function for New News Text
def predict_news(news_text):
    """Return the decoded label the trained model predicts for ``news_text``.

    The text is vectorized with the module-level ``vectorizer``, classified by
    ``model``, and the predicted id is mapped back through ``le``. If any step
    raises, the error is printed and ``None`` is returned instead.
    """
    try:
        features = vectorizer.transform([news_text])
        encoded = model.predict(features)
        return le.inverse_transform(encoded)[0]
    except Exception as err:
        print(f"Error making prediction: {err!s}")
        return None

# Step 10: run the predictor once on an example headline and show the result
sample_news = "Government announces new policies for education sector."
sample_prediction = predict_news(sample_news)
print("Prediction for sample news:", sample_prediction)


Saving combined_news_data_cleaned.csv to combined_news_data_cleaned.csv
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.98      0.97      0.98      4685
          11       0.97      0.98      0.98      4249

    accuracy                           0.98      8936
   macro avg       0.49      0.49      0.49      8936
weighted avg       0.98      0.98      0.98      8936

Accuracy: 0.9763876454789615
Prediction for sample news: 1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
