In [None]:
# ✅ Import Libraries
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle
import pickle

# ✅ Download NLTK Data
# Fetch, in order, the resources for the tokenizer, the stopword list and the
# lemmatizer that were imported above.
for nltk_resource in ("punkt_tab", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

# ✅ Load Dataset
# The CSV is read from an absolute path, so news.csv must already exist at
# /content/news.csv before this cell runs.
df = pd.read_csv("/content/news.csv")

# ✅ Define Stopwords and Lemmatizer
# Both objects are created once at module level and reused by later cells.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Deletion table covering every character in string.punctuation. A regex
# character class built by interpolating string.punctuation reads the
# backslash as an escape for the following "]", so backslashes would survive;
# str.translate has no such escaping rules.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Normalise one document for TF-IDF vectorisation.

    Steps: lowercase, delete all ``string.punctuation`` characters, tokenize
    with ``word_tokenize``, lemmatize each token with the module-level
    ``lemmatizer``, and re-join the tokens with single spaces. Stopwords are
    intentionally kept.

    Args:
        text: The raw document string, or a missing value (None / NaN).

    Returns:
        The cleaned, space-separated token string; ``""`` when ``text`` is
        missing.
    """
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation (incl. backslash)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words]  # Keep stopwords!
    return " ".join(words)

# ✅ Apply Text Preprocessing
df["clean_text"] = df["text"].apply(preprocess_text)

# ✅ Convert Labels to Numeric
# REAL -> 0, FAKE -> 1. Any missing or unrecognised label maps to NaN and the
# row is dropped. Missing labels are deliberately NOT filled with the most
# frequent class: that would invent ground truth for the classifier.
label_codes = df["label"].map({"REAL": 0, "FAKE": 1})
df = (
    df.assign(label=label_codes)
    .dropna(subset=["label"])      # ✅ Check & Remove Any NaN Labels
    .astype({"label": int})        # map() yields floats whenever a NaN was present
)

# ✅ Remove Unnecessary Column
# errors="ignore" makes this a no-op when the column is absent.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# ✅ Shuffle Dataset
df = shuffle(df, random_state=42)

# ✅ Train-Test Split
# Split the cleaned text BEFORE vectorising so the TF-IDF vocabulary and IDF
# weights are learned from the training documents only (no test-set leakage).
y = df["label"].values
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y, test_size=0.3, random_state=42
)

# ✅ Convert Text to Numerical Features (Reduced Features)
# The matrices stay sparse; MultinomialNB accepts sparse input, so there is no
# need to materialise dense arrays.
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# ✅ Train Naïve Bayes Model (Better for Text Data)
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# ✅ Predictions & Accuracy
y_pred_nb = nb_model.predict(X_test)
print("\n✅ Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_nb))

# ✅ Save the trained model and vectorizer
# Save the Naïve Bayes model
with open('nb_model.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)

# Save the TF-IDF Vectorizer
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully!")

# ✅ Prediction Function
def predict_news(news_text):
    """Classify one news article and return a human-readable report.

    The text is cleaned with ``preprocess_text``, vectorised with the fitted
    module-level ``vectorizer`` and scored by ``nb_model``.

    Args:
        news_text: Raw article text.

    Returns:
        A three-line string: the cleaned input, the verdict (class 1 is
        reported as fake, anything else as real) and the raw
        ``predict_proba`` output for the single input row.
    """
    cleaned = preprocess_text(news_text)
    features = vectorizer.transform([cleaned]).toarray()
    predicted_class = nb_model.predict(features)[0]
    class_probabilities = nb_model.predict_proba(features)

    if predicted_class == 1:
        verdict = "Fake News ❌"
    else:
        verdict = "Real News ✅"

    return (
        f"Processed Input: {cleaned}\n"
        f"Prediction: {verdict}\n"
        f"Confidence: {class_probabilities}"
    )

# ✅ Test with a Sample News Article
# Two-sentence sample; the line break inside the literal is part of the input.
news_text = """The President of the United States met with world leaders today to discuss climate change policies.
The summit focused on reducing carbon emissions and increasing green energy investments."""
sample_report = predict_news(news_text)
print(sample_report)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...



✅ Naïve Bayes Accuracy: 0.857969489742241

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85       930
           1       0.85      0.87      0.86       971

    accuracy                           0.86      1901
   macro avg       0.86      0.86      0.86      1901
weighted avg       0.86      0.86      0.86      1901

Model and vectorizer saved successfully!
Processed Input: the president of the united state met with world leader today to discus climate change policy the summit focused on reducing carbon emission and increasing green energy investment
Prediction: Real News ✅
Confidence: [[0.66602737 0.33397263]]


In [None]:
# ✅ Save the trained model and vectorizer
# Each (filename, object) pair is pickled in turn: the Naïve Bayes model
# first, then the TF-IDF vectorizer.
for pickle_path, artifact in (("nb_model.pkl", nb_model), ("vectorizer.pkl", vectorizer)):
    with open(pickle_path, "wb") as out_file:
        pickle.dump(artifact, out_file)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [None]:
from google.colab import files

# Download the model and vectorizer
# One files.download call per pickle, model first and vectorizer second.
for artifact_name in ("nb_model.pkl", "vectorizer.pkl"):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>