# Fake News Detection – Naive Bayes Model

Ten projekt pokazuje krok po kroku:
1. Wczytanie danych (Fake + Real news)
2. Czyszczenie tekstu
3. Wektoryzację TF-IDF
4. Trenowanie modelu Naive Bayes
5. Analizę wyników i wizualizację macierzy pomyłek
6. Wizualizacje tekstu
7. Zastosowanie modelu do nowego tekstu

In [None]:
# Install the project dependencies listed in requirements.txt.
!pip install -r requirements.txt


In [None]:
python.exe -m pip install --upgrade pip

In [None]:
# Upgrade pip itself through the shell escape.
!pip install --upgrade pip


In [None]:

import pandas as pd

# Read the two source files: one holds fake articles, the other real ones.
fake_df = pd.read_csv("Fake.csv")
real_df = pd.read_csv("True.csv")

# Tag every row with its class so the frames can be merged: 0 = fake, 1 = real.
fake_df["label"] = 0
real_df["label"] = 1

# Stack both frames, shuffle all rows with a fixed seed so the order is
# repeatable, and rebuild a clean 0..n-1 index.
df = (
    pd.concat([fake_df, real_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Quick sanity check: first rows and the number of rows per class.
print(df.head())
print("\nRozkład klas:")
class_counts = df["label"].value_counts()
print(class_counts)


In [None]:

import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available before reading it.
nltk.download('stopwords')

# Keep the English stop words in a set for fast membership checks.
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalise raw article text for vectorisation.

    The text is lower-cased, every non-word character is replaced by a
    space, and tokens found in the module-level ``stop_words`` set are
    dropped.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents an
            empty CSV field) are treated as empty text; any other
            non-string value is converted with ``str()``.

    Returns:
        The remaining tokens joined by single spaces, or ``""`` when
        nothing is left.
    """
    # Missing values would otherwise fail on ``.lower()``. NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # split() without arguments splits on any run of whitespace, so no
    # separate whitespace-collapsing pass is needed.
    tokens = re.sub(r'\W', ' ', text.lower()).split()
    return ' '.join(token for token in tokens if token not in stop_words)

# Store a cleaned version of every article body next to the raw text.
df["clean_text"] = df["text"].map(clean_text)


In [None]:

from sklearn.model_selection import train_test_split

# Features are the cleaned article texts; targets are the 0/1 class labels.
X, y = df["clean_text"], df["label"]

# Hold out 20% of the rows for evaluation. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Rozmiar zbioru treningowego: {len(X_train)}")
print(f"Rozmiar zbioru testowego: {len(X_test)}")


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

# The vectoriser is fitted on the training texts only, with the vocabulary
# capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)

# The test texts are only transformed, so they reuse the fitted vocabulary.
X_test_tfidf = tfidf.transform(X_test)

# One print call, one message per line.
print(
    "Wektoryzacja zakończona.",
    f"Kształt X_train: {X_train_tfidf.shape}",
    f"Kształt X_test: {X_test_tfidf.shape}",
    sep="\n",
)


In [None]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# fit() returns the estimator itself, so training is chained onto construction.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict the class of every held-out article.
y_pred_nb = nb_model.predict(X_test_tfidf)

print("Wyniki dla Naive Bayes:")
# Label 0 is reported as "Fake" and label 1 as "Real".
nb_report = classification_report(
    y_test,
    y_pred_nb,
    target_names=["Fake", "Real"],
)
print(nb_report)


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_test, y_pred_nb)

# The same class names label both axes of the heatmap.
class_names = ['Fake', 'Real']

# Draw on an explicit Figure/Axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)

ax.set_title("Macierz pomyłek – Naive Bayes")
ax.set_xlabel("Przewidywana etykieta")
ax.set_ylabel("Rzeczywista etykieta")
fig.tight_layout()
plt.show()


In [None]:

from wordcloud import WordCloud

def _show_wordcloud(corpus, title):
    """Build a word cloud from *corpus*, display it under *title*, and return it."""
    cloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()
    return cloud


# Word cloud for fake news (label 0): all cleaned texts joined into one string.
fake_text = " ".join(df[df['label'] == 0]['clean_text'])
wordcloud_fake = _show_wordcloud(fake_text, "WordCloud – Fake News")

# Word cloud for real news (label 1).
real_text = " ".join(df[df['label'] == 1]['clean_text'])
wordcloud_real = _show_wordcloud(real_text, "WordCloud – Real News")


In [None]:

# Prediction helper for a new, unseen text.
def predict_article(text):
    """Classify a single article with the trained Naive Bayes model.

    The text goes through ``clean_text`` and the fitted ``tfidf``
    vectoriser before being passed to ``nb_model``.

    Args:
        text: Raw article text.

    Returns:
        ``"REAL"`` when the predicted label is 1, otherwise ``"FAKE"``.
    """
    # The vectoriser expects a collection of documents, hence the one-item list.
    features = tfidf.transform([clean_text(text)])
    predicted_label = nb_model.predict(features)[0]
    if predicted_label == 1:
        return "REAL"
    return "FAKE"

# Usage example: classify one hand-written sentence and print the verdict.
sample_text = "Scientists have discovered a new planet that could support life."
sample_verdict = predict_article(sample_text)
print("Wynik predykcji:", sample_verdict)
