<a href="https://colab.research.google.com/github/Indhupamula/ai-powered-fake-news-detector/blob/main/ai_powered_fake_news_detection_in_real_time.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the two source CSV files: one holds fake articles, the other real ones
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

# Tag every row with its class before the frames are merged
fake['label'] = 'Fake'
true['label'] = 'Real'

# Combine once, shuffle reproducibly (fixed seed) and rebuild a clean 0..n-1 index.
# The earlier un-shuffled concat was immediately overwritten, so it is dropped.
data = (
    pd.concat([fake, true])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Keep only the columns we need
data = data[['text', 'label']]

# Preview first few rows
data.head()



In [None]:
import re

# Define a text cleaning function
def clean_text(text):
    """Normalise raw article text before vectorisation.

    The input is coerced to ``str`` and lower-cased, every non-word
    character (anything outside ``[a-zA-Z0-9_]``) is replaced by a space,
    and each run of whitespace is collapsed to a single space. Leading or
    trailing spaces are not stripped.

    Args:
        text: Value to clean; non-string inputs are converted with ``str()``.

    Returns:
        The cleaned string.
    """
    result = str(text).lower()
    # Applied in order: first blank out punctuation, then squeeze whitespace.
    for pattern in (r'\W', r'\s+'):
        result = re.sub(pattern, ' ', result)
    return result

# Apply the cleaning to all text entries
data['text'] = data['text'].apply(clean_text)

# Preview cleaned data.
# A notebook cell only renders its last bare expression, so the preview is
# passed to display() explicitly; otherwise it would be silently discarded.
display(data.head())

# Class balance of the combined dataset (rendered as the cell's output)
data['label'].value_counts()



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the articles BEFORE fitting TF-IDF, so the vocabulary and
# IDF weights are learned from training text only (no test-set leakage).
text_train, text_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
    min_df=3,               # Remove very rare words
    ngram_range=(1, 3),     # Include unigrams, bigrams, trigrams
    sublinear_tf=True       # Use 1 + log(tf) instead of the raw term count
)

# Fit on the training articles only; the test articles are merely transformed
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# X and y stay available for cells that split them again. Re-splitting with the
# same test_size and random_state selects the same rows as the split above,
# because the shuffle depends only on the number of samples and the seed.
X = vectorizer.transform(data['text'])
y = data['label']

# Train the Logistic Regression classifier on the training split
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 80/20 train/test partition with a fixed seed so the split is reproducible
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the Logistic Regression classifier; fit() returns the estimator itself
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model


In [None]:
from sklearn.metrics import accuracy_score

# Label every held-out sample with the fitted classifier
y_pred = model.predict(X_test)

# Fraction of held-out samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the score as a percentage with two decimals
print(f" Model Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Show the class labels stored on the fitted classifier. predict_news_debug
# looks up 'Fake' and 'Real' in this array to index the predict_proba output.
model.classes_


In [None]:
def predict_news_debug(text):
    """Return the classifier's per-class probabilities for one article.

    The text is cleaned with ``clean_text``, vectorised with the notebook-level
    ``vectorizer`` and scored with the notebook-level ``model``, which must
    expose ``predict_proba`` and ``classes_``.

    Args:
        text: Raw article text.

    Returns:
        dict with the keys ``'Fake'`` and ``'Real'``, each mapped to the
        corresponding probability formatted as a percentage string with two
        decimals (e.g. ``"97.31%"``).
    """
    features = vectorizer.transform([clean_text(text)])
    class_probs = model.predict_proba(features)[0]
    # Look positions up by label instead of assuming a fixed column order.
    class_order = model.classes_.tolist()
    return {
        label: f"{class_probs[class_order.index(label)]*100:.2f}%"
        for label in ('Fake', 'Real')
    }


In [None]:
# Sanity check: score the article at position 10 among the rows labelled 'Real'
real_articles = data[data['label'] == 'Real']
sample_article = real_articles.iloc[10]['text']
predict_news_debug(sample_article)


In [None]:
import gradio as gr
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load tokenizer and model.
# The network is bound to `bert_model` rather than `model`, so the trained
# LogisticRegression that predict_news_debug reads from `model` is not
# overwritten when this cell runs.
# NOTE(review): "bert-base-uncased" is the base checkpoint and no visible cell
# fine-tunes the 2-label classification head requested via num_labels=2, so its
# weights are presumably freshly initialised. Treat the predictions below as
# placeholders until a fine-tuned checkpoint is loaded here. TODO confirm.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Prediction function
def predict_news_bert(text):
    """Classify an article with the BERT sequence classifier.

    Args:
        text: Article text to classify.

    Returns:
        A display string with the predicted label ("Fake" or "Real") and the
        softmax probability of that label as a percentage, or a short prompt
        when the input is missing or blank.
    """
    # Nothing to classify: answer with a prompt instead of scoring an empty input.
    if text is None or (isinstance(text, str) and not text.strip()):
        return "Please enter some text to classify."

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    # assumes logit index 0 means "Fake" and index 1 means "Real"; verify
    # against the label mapping of whichever fine-tuned checkpoint is used
    labels = ["Fake", "Real"]
    pred_index = torch.argmax(probs).item()
    confidence = probs[pred_index].item() * 100
    return f"📰 Prediction: {labels[pred_index]} ({confidence:.2f}% confidence)"

# Gradio UI: one multi-line input box for the article, one output box for the verdict
article_box = gr.Textbox(lines=8, placeholder="Paste news article here...", label="News Article")
prediction_box = gr.Textbox(label="Prediction")

demo = gr.Interface(
    fn=predict_news_bert,
    inputs=article_box,
    outputs=prediction_box,
    title="🧠 BERT Powered Fake News Detector",
    description="Paste any news article to check whether it's Fake or Real using a BERT model.",
)

# Start the web app
demo.launch()


In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required before the upload_file calls to the Space
# further down; confirm.
from huggingface_hub import notebook_login
notebook_login()


In [None]:
from pathlib import Path

# Step 4: Create the project folder.
# mkdir(exist_ok=True) makes the cell safe to re-run, and the files are written
# through an explicit path instead of changing the working directory, so a
# second run cannot end up in a nested FakeNewsApp/FakeNewsApp folder.
APP_DIR = Path("FakeNewsApp")
APP_DIR.mkdir(exist_ok=True)

# Create app.py with your BERT + Gradio code
app_code = """
import gradio as gr
from transformers import BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

def predict_news_bert(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    labels = ["Fake", "Real"]
    pred_index = torch.argmax(probs).item()
    confidence = probs[pred_index].item() * 100
    return f"📰 Prediction: {labels[pred_index]} ({confidence:.2f}% confidence)"

demo = gr.Interface(
    fn=predict_news_bert,
    inputs=gr.Textbox(lines=8, label="News Article"),
    outputs=gr.Textbox(label="Prediction"),
    title=" BERT Fake News Detector",
    description="Paste a news article and find out if it's Fake or Real using BERT!"
)

demo.launch()
"""

# UTF-8 is requested explicitly because app_code contains a non-ASCII emoji and
# the platform default encoding is not guaranteed to be able to encode it.
with open(APP_DIR / "app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

# Create requirements.txt
with open(APP_DIR / "requirements.txt", "w", encoding="utf-8") as f:
    f.write("gradio\ntransformers\ntorch\n")


In [None]:
from huggingface_hub import HfApi

api = HfApi()

# ✅ This is your actual space ID
repo_id = "indhupamula/fake-news-bert-indhu"

# (local file, destination path inside the Space) for every file to publish
uploads = [
    ("/content/FakeNewsApp/app.py", "app.py"),
    ("/content/FakeNewsApp/requirements.txt", "requirements.txt"),
]

# Upload the files one by one, in the order listed above
last_result = None
for local_path, repo_path in uploads:
    last_result = api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=repo_path,
        repo_id=repo_id,
        repo_type="space"
    )

# Show the result of the final upload as the cell output
last_result
