In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
# Read the fake-or-real news dataset from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/textdb3/fake_or_real_news.csv")

# Model input: headline and article body joined by a single space.
# Missing values become empty strings so the concatenation never yields NaN.
x = (
    df["title"].fillna("")
    + " "
    + df["text"].fillna("")
)

# Target column holding the class label of each article.
y = df["label"]

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: show the number of rows in each of the four splits.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(5068, 1267, 5068, 1267)

In [6]:
# TF-IDF features capped at 5000 terms.
vectorize = TfidfVectorizer(max_features=5000)

# Learn the vocabulary/IDF weights on the training texts only, then reuse the
# fitted vectorizer for the test texts so no test information leaks into it.
X_train_tfidf = vectorize.fit_transform(X_train)
X_test_tfidf = vectorize.transform(X_test)

In [7]:
# Peek at the first training sample and its label.
# `X_train_tfidf[0]` is the first row *by position*, so the label must also be
# taken by position with `.iloc`. `y_train` keeps the original DataFrame index
# after the split, so `y_train[0]` would look up index *label* 0, which is
# generally a different row and raises KeyError if that row is in the test set.
X_train_tfidf[0], y_train.iloc[0]

(<Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 19 stored elements and shape (1, 5000)>,
 'FAKE')

In [8]:
# Logistic-regression classifier on the TF-IDF features; the iteration cap is
# raised to 1000 to give the solver room to converge.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

In [9]:
# Score the TF-IDF model on the held-out texts and print per-class metrics.
y_pred = clf.predict(X_test_tfidf)
tfidf_report = classification_report(y_test, y_pred)
print(tfidf_report)

              precision    recall  f1-score   support

        FAKE       0.91      0.93      0.92       628
        REAL       0.93      0.91      0.92       639

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [15]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used in place of the TF-IDF features.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode both splits with identical settings (plain Python lists, batches of 32).
X_train_emb = embedder.encode(
    X_train.tolist(),
    show_progress_bar=True,
    batch_size=32,
)
X_test_emb = embedder.encode(
    X_test.tolist(),
    show_progress_bar=True,
    batch_size=32,
)

# NOTE: `clf` and `y_pred` are rebound here, replacing the TF-IDF model and its
# predictions from the earlier cells.
clf = LogisticRegression(max_iter=2000)
clf.fit(X_train_emb, y_train)

# Evaluate the embedding-based classifier on the held-out split.
y_pred = clf.predict(X_test_emb)
print(classification_report(y_test, y_pred))

Batches:   0%|          | 0/159 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

        FAKE       0.86      0.87      0.86       628
        REAL       0.87      0.86      0.86       639

    accuracy                           0.86      1267
   macro avg       0.86      0.86      0.86      1267
weighted avg       0.86      0.86      0.86      1267



In [18]:
# Two hand-written headlines for a quick qualitative check of the model.
new_texts = [
    "Breaking: The government announced new tax reforms today.",
    "Aliens landed in New York last night and took selfies with humans.",
]

# Embed them with the same encoder used for training, without a progress bar.
new_data_emb = embedder.encode(new_texts, show_progress_bar=False)

# NOTE: this rebinds `y_pred`, replacing the test-set predictions.
y_pred = clf.predict(new_data_emb)

In [19]:
# Show each ad-hoc headline next to the label predicted for it.
for text, label in zip(new_texts, y_pred):
    print(
        f"Text: {text}\nPrediction: {label}\n"
    )

Text: Breaking: The government announced new tax reforms today.
Prediction: FAKE

Text: Aliens landed in New York last night and took selfies with humans.
Prediction: FAKE



In [20]:
# Class probabilities for the ad-hoc headlines: one row per text, one column
# per class (scikit-learn orders the columns as in `clf.classes_`).
probs = clf.predict_proba(new_data_emb)
print(probs)


[[0.56301885 0.43698115]
 [0.95660156 0.04339844]]


In [24]:
# `joblib` was used here without being imported anywhere in the notebook, which
# raises NameError on a fresh kernel.
import joblib

# Reload the persisted classifier and sentence encoder from the working directory.
# NOTE(review): no visible cell writes "fake_news_classifier.pkl" or the
# "sentence_embedder" directory. Presumably they were saved by cells that were
# removed (e.g. joblib.dump(clf, ...) / embedder.save(...)). TODO confirm,
# otherwise this cell fails on Restart & Run All.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts you created yourself or fully trust.
clf = joblib.load("fake_news_classifier.pkl")
embedder = SentenceTransformer("sentence_embedder")

def _is_real_label(label):
    """Return True when a class label denotes the "real news" class."""
    if isinstance(label, str):
        # String labels such as 'REAL' / 'FAKE' (case and padding tolerant).
        return label.strip().upper() == "REAL"
    # Numeric encoding: 1 means real, anything else means fake.
    return label == 1


def predict_news(texts, model=None, encoder=None):
    """Classify one or more news texts as real or fake.

    Parameters
    ----------
    texts : str or iterable of str
        A single text or a collection of texts. A bare string is treated as a
        one-element list.
    model : object, optional
        Classifier exposing ``predict``, ``predict_proba`` and ``classes_``.
        Defaults to the notebook-level ``clf``.
    encoder : object, optional
        Embedder exposing ``encode(texts, show_progress_bar=False)``.
        Defaults to the notebook-level ``embedder``.

    Returns
    -------
    list of dict
        One dict per input text with the keys ``text``, ``prediction``
        (``"Real"`` or ``"Fake"``), ``confidence_fake`` and ``confidence_real``
        (probabilities rounded to three decimals). Empty input yields ``[]``.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to classify; avoid handing an empty batch to the classifier.
        return []

    # Fall back to the notebook globals only when no override is supplied.
    model = clf if model is None else model
    encoder = embedder if encoder is None else encoder

    emb = encoder.encode(texts, show_progress_bar=False)
    preds = model.predict(emb)
    probs = model.predict_proba(emb)

    # Probability columns follow the order of `classes_`, so locate the "real"
    # column from the labels instead of assuming a fixed position. If no class
    # is recognised as "real", keep the historical layout (fake=0, real=1).
    # The fake column is taken as the other one, i.e. a binary classifier is assumed.
    real_idx = next(
        (i for i, c in enumerate(model.classes_) if _is_real_label(c)), 1
    )
    fake_idx = 1 - real_idx

    results = []
    for t, p, prob in zip(texts, preds, probs):
        results.append({
            "text": t,
            # Comparing the predicted label to 1 never matched the string
            # labels 'FAKE'/'REAL', so everything was reported as "Fake".
            "prediction": "Real" if _is_real_label(p) else "Fake",
            "confidence_fake": round(float(prob[fake_idx]), 3),
            "confidence_real": round(float(prob[real_idx]), 3),
        })
    return results

In [25]:
# Sample headlines used to exercise predict_news end to end.
examples = [
    "Government announces new health policies for 2025.",
    "Aliens landed in New York last night and started a concert.",
]

results = predict_news(examples)

# Print the text, the predicted class and both class confidences per example.
for r in results:
    print(f"Text: {r['text']}")
    print(f"Prediction: {r['prediction']}")
    print(f"Confidence -> Fake: {r['confidence_fake']} | Real: {r['confidence_real']}\n")


Text: Government announces new health policies for 2025.
Prediction: Fake
Confidence -> Fake: 0.606 | Real: 0.394

Text: Aliens landed in New York last night and started a concert.
Prediction: Fake
Confidence -> Fake: 0.974 | Real: 0.026

