In [None]:
# Cell 1: Install required packages
!pip install pandas numpy scikit-learn nltk flask-ngrok torch transformers flask-cors
!pip install pyngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Collecting flask-cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Dow

In [None]:
# Cell 2: Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score
import pickle
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok
from flask_cors import CORS
from pyngrok import ngrok
import warnings
warnings.filterwarnings('ignore')

# Fetch the NLTK corpora used by the text-preprocessing cell:
#   'stopwords' backs stopwords.words('english')
#   'wordnet'   backs WordNetLemmatizer
#   'omw-1.4'   presumably required by the WordNet reader in some NLTK versions — TODO confirm
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:

# Cell 4: Load and prepare dataset
# Update these paths to your dataset location

# Decode as UTF-8: reading these files as latin-1 turns curly quotes into mojibake
# (titles such as "We’re ..." are displayed as "Weâre ..."). Bytes that are
# not valid UTF-8 are replaced instead of raising, so one bad byte cannot abort the load.
fake_df = pd.read_csv('Fake.csv', encoding='utf-8', encoding_errors='replace')
true_df = pd.read_csv('True.csv', encoding='utf-8', encoding_errors='replace')

fake_df['label'] = 0  # Fake news
true_df['label'] = 1  # True news

df = pd.concat([fake_df, true_df], axis=0)
# Shuffle with a fixed seed so that Restart & Run All reproduces the same row order
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (22848, 5)


Unnamed: 0,title,text,subject,date,label
0,Trump LOVES To Bash Second Place Finishers An...,Donald Trump came in second place in the Iowa ...,News,"February 2, 2016",0
1,CAMPING NIGHTMARE: Machete Wielding Refugee Dr...,A refugee from Ghana has been arrested for dra...,left-news,"Apr 9, 2017",0
2,BUSINESS OWNERS GET RICH Providing Luxury Hous...,"Meanwhile, refugees staying at these luxury ...",left-news,"Feb 8, 2016",0
3,Weâre Not F*cking With You: Trump Stole Par...,If you thought it was hilarious that Trump s 2...,News,"January 20, 2017",0
4,These Words From Trumpâs Own Mouth OBLITERA...,Donald Trump must think women are stupid. Rece...,News,"September 15, 2016",0


In [None]:
# Cell 5: Text preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise raw article text before vectorisation.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, English stop words are dropped and the remaining
    tokens are lemmatized.

    Args:
        text: Raw text. ``None`` and float NaN (what pandas yields for an empty
            CSV field) are treated as an empty string; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing remains).
    """
    if not isinstance(text, str):
        # str.lower() would raise AttributeError on NaN/None, aborting the whole .apply()
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = '' if is_missing else str(text)

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

df['clean_text'] = df['text'].apply(clean_text)

In [None]:
# Cell 6: Split dataset
# Features are the cleaned article bodies, targets are the 0/1 labels (0 = fake, 1 = true).
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Cell 7: TF-IDF Vectorization
# The vocabulary is capped at 5000 terms and is learned from the training split only;
# the test split is projected onto that same vocabulary.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Save vectorizer
# Persist the fitted vectorizer so the Flask backend can reload it later.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [None]:
# Cell 8: Train PassiveAggressiveClassifier
# The classifier shuffles the training samples between epochs, so the seed is
# fixed to make the fitted weights (and the reported accuracy) reproducible.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(X_train_tfidf, y_train)

# Save model
with open('pac_model.pkl', 'wb') as f:
    pickle.dump(pac, f)

# Evaluate on the held-out split
y_pred = pac.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

Accuracy: 99.98%


In [None]:
# Cell 9: Prepare DistilBERT model
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence
            (``encodings[key][i]`` is the value of ``key`` for sample ``i``).
        labels: Label container exposing ``.iloc`` for positional access and
            ``len()``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is driven by the labels container.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Positional lookup: idx is a position, not an index label.
        sample['labels'] = torch.tensor(self.labels.iloc[idx])
        return sample

# Tokenizer belonging to the pretrained DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize data
# Both splits use identical settings: truncation and padding enabled, max_length=128.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(X_train.tolist(), **tokenize_kwargs)
test_encodings = tokenizer(X_test.tolist(), **tokenize_kwargs)

# Pair each split's encodings with its labels
train_dataset = NewsDataset(train_encodings, y_train)
test_dataset = NewsDataset(test_encodings, y_test)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Cell 10: Train DistilBERT
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    # Disable external experiment trackers: the default triggers an interactive
    # Weights & Biases login, which stalls an unattended Run All.
    report_to='none',
)


def compute_metrics(eval_pred):
    """Return accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: Evaluation result carrying ``predictions`` (logits) and
            ``label_ids`` (true labels).

    Returns:
        Dict with a single ``'accuracy'`` entry.
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(eval_pred.label_ids, predicted)}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Score the fine-tuned model on the held-out split that was passed as eval_dataset
eval_metrics = trainer.evaluate()
print("DistilBERT evaluation:", eval_metrics)

# Persist model and tokenizer side by side so the backend can reload both
model.save_pretrained('./distilbert-news')
tokenizer.save_pretrained('./distilbert-news')

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpurvakapasi1[0m ([33mpurvakapasi1-karnavati-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.4896
100,0.0528
150,0.0066
200,0.0023
250,0.001
300,0.0005
350,0.0005
400,0.0226
450,0.0003
500,0.0001


Step,Training Loss
50,0.4896
100,0.0528
150,0.0066
200,0.0023
250,0.001
300,0.0005
350,0.0005
400,0.0226
450,0.0003
500,0.0001


In [None]:
# Cell 11: Flask Backend
app = Flask(__name__)
CORS(app)
# The public tunnel is opened with pyngrok in the "Run Application" cell, so the
# app is not wrapped with flask_ngrok.run_with_ngrok here (that would start a
# second, competing tunnel when app.run() is called).

# Load models
# NOTE(security): pickle.load can execute arbitrary code. Only load the files
# written by the training cells of this notebook, never untrusted pickles.
with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf = pickle.load(f)

with open('pac_model.pkl', 'rb') as f:
    pac = pickle.load(f)

bert_model = DistilBertForSequenceClassification.from_pretrained('./distilbert-news')
bert_tokenizer = DistilBertTokenizer.from_pretrained('./distilbert-news')
bert_model.to('cpu')
bert_model.eval()  # inference only: make sure dropout is disabled


def _pac_confidence(features):
    """Turn the PassiveAggressiveClassifier margin into a pseudo-confidence.

    PassiveAggressiveClassifier has no ``predict_proba``; the absolute distance
    to the decision boundary is squashed with a sigmoid instead. The result is
    in [0.5, 1) and is NOT a calibrated probability.
    """
    margin = float(pac.decision_function(features)[0])
    return 1.0 / (1.0 + np.exp(-abs(margin)))


@app.route('/predict', methods=['POST'])
def predict():
    """Classify the posted article text with both models.

    Expects a JSON body ``{"text": "<article>"}``.

    Returns:
        JSON with ``pac_pred``/``bert_pred`` (0 = fake, 1 = real), their
        confidences and the combined ``final_pred``; HTTP 400 with an
        ``error`` message when the body is not JSON or has no usable 'text'.
    """
    payload = request.get_json(silent=True)
    text = payload.get('text') if isinstance(payload, dict) else None
    if not isinstance(text, str) or not text.strip():
        return jsonify({'error': "Request body must be JSON with a non-empty string field 'text'."}), 400

    cleaned = clean_text(text)

    # PAC prediction
    tfidf_text = tfidf.transform([cleaned])
    pac_pred = pac.predict(tfidf_text)[0]
    pac_conf = _pac_confidence(tfidf_text)

    # BERT prediction
    # The model was fine-tuned on the cleaned text, so inference must use the
    # same preprocessing rather than the raw input.
    inputs = bert_tokenizer(cleaned, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    bert_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].numpy()
    bert_pred = int(np.argmax(bert_probs))
    bert_conf = float(bert_probs.max())

    # Ensemble prediction: "real" as soon as either model votes real
    final_pred = 1 if (pac_pred + bert_pred) > 0 else 0

    return jsonify({
        'pac_pred': int(pac_pred),
        'pac_conf': float(pac_conf),
        'bert_pred': int(bert_pred),
        'bert_conf': float(bert_conf),
        'final_pred': int(final_pred)
    })

@app.route('/')
def home():
    """Health-check endpoint confirming the API is up."""
    return "Fake News Detection API Running!"

# The server is started in the "Run Application" cell. Calling app.run() here
# would block this cell (in a notebook __name__ is '__main__'), so the frontend
# and tunnel cells below would never execute.

In [None]:
# Cell 12: Frontend
# Single-page client for the /predict endpoint. 'YOUR_NGROK_URL' is a placeholder
# that gets replaced with the public tunnel URL before the page is used.
frontend_html = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Fake News Detector</title>
    <style>
        body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
        .container { background: #f5f5f5; padding: 20px; border-radius: 10px; }
        textarea { width: 100%; height: 200px; margin: 10px 0; padding: 10px; }
        button { background: #4CAF50; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer; }
        .result { margin-top: 20px; padding: 15px; border-radius: 5px; }
        .real { background: #dff0d8; border: 1px solid #d6e9c6; color: #3c763d; }
        .fake { background: #f2dede; border: 1px solid #ebccd1; color: #a94442; }
    </style>
</head>
<body>
    <div class="container">
        <h1>Fake News Detector</h1>
        <textarea id="inputText" placeholder="Paste news article here..."></textarea>
        <button onclick="analyze()">Check Authenticity</button>
        <div id="result" class="result"></div>
    </div>

    <script>
        async function analyze() {
            const text = document.getElementById('inputText').value;
            const resultDiv = document.getElementById('result');

            if (!text) {
                resultDiv.innerHTML = "Please enter some text to analyze!";
                return;
            }

            resultDiv.innerHTML = "Analyzing...";

            try {
                const response = await fetch('YOUR_NGROK_URL/predict', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({ text: text })
                });

                if (!response.ok) {
                    throw new Error(`Server responded with ${response.status}`);
                }

                const data = await response.json();
                displayResult(data);
            } catch (error) {
                resultDiv.innerHTML = "Error analyzing text";
            }
        }

        function displayResult(data) {
            const resultDiv = document.getElementById('result');
            const isReal = data.final_pred === 1;

            resultDiv.className = `result ${isReal ? 'real' : 'fake'}`;
            resultDiv.innerHTML = `
                <h2>${isReal ? '✅ Likely Real News' : '❌ Likely Fake News'}</h2>
                <p>Final Confidence: ${(Math.max(data.pac_conf, data.bert_conf) * 100).toFixed(2)}%</p>
                <h3>Model Details:</h3>
                <p>Traditional Model: ${data.pac_pred ? 'Real' : 'Fake'} (${(data.pac_conf*100).toFixed(2)}%)</p>
                <p>BERT Model: ${data.bert_pred ? 'Real' : 'Fake'} (${(data.bert_conf*100).toFixed(2)}%)</p>
            `;
        }
    </script>
</body>
</html>
"""

# The page contains emoji, so write it as UTF-8 explicitly instead of relying
# on the platform default encoding.
with open('frontend.html', 'w', encoding='utf-8') as f:
    f.write(frontend_html)

In [None]:
# Cell 13: Run Application
# Get ngrok URL (5000 is the port app.run() listens on by default)
public_url = ngrok.connect(5000).public_url
print(" * Flask app running on:", public_url)

# Update frontend with ngrok URL
# Render from the in-memory template rather than patching the file on disk:
# once the placeholder in the file has been replaced, a re-run of this cell
# would otherwise leave the previous (stale) tunnel URL in place.
content = frontend_html.replace('YOUR_NGROK_URL', public_url)
with open('frontend.html', 'w', encoding='utf-8') as f:
    f.write(content)

# Start Flask app (blocks until the kernel is interrupted)
app.run()