In [4]:
import pandas as pd


In [17]:
print(df.isnull().sum())


title    2961
text     3609
label    4321
dtype: int64


In [18]:
# Combine 'title' and 'text' into a single column
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# Drop the original columns to keep only useful ones
df = df[['content', 'label']]


In [19]:
import re

def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove digits
    return text

df['content'] = df['content'].apply(clean_text)


In [20]:
from sklearn.model_selection import train_test_split

X = df['content']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
import pickle

# Save the model
with open('fake_news_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)


In [23]:
print("X_train_tfidf shape:", X_train_tfidf.shape)
print("y_train shape:", y_train.shape)


X_train_tfidf shape: (62477, 372028)
y_train shape: (62477,)


In [24]:
print(df.isnull().sum())


content       0
label      4321
dtype: int64


In [25]:
# Drop rows with missing labels
df = df.dropna(subset=['label'])

# Optional: Reset index to avoid alignment issues
df = df.reset_index(drop=True)


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform the training data, transform the test data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [30]:
print(df.columns)


Index(['content', 'label'], dtype='object')


In [31]:
df = df.dropna(subset=['content', 'label'])
df = df.reset_index(drop=True)


In [33]:
df = df.dropna(subset=['label'])


In [35]:
print(df['label'].unique())


['1' '0'
 ' Duncan explained â€œBaiame came from a place that we call the Morning Star within the Mirrabooka. Mira means stars and booka means river. That is the Milky Way that flows across the North Star. â€\x9d43 Baiame'
 ...
 ' regardless of age differences. Itâ€™s a more civil way of life. There are loaded pistols'
 ' thanks entirely to the Marxist liberals. Fembot220 ' ' google']


In [36]:
# Keep only rows where 'label' is either '0' or '1'
df = df[df['label'].isin(['0', '1'])]

# Now convert the 'label' column from string to integer
df['label'] = df['label'].astype(int)


In [37]:
print(df['label'].unique())  # should print only: [0 1]
print(df.isnull().sum())     # make sure no NaNs remain


[1 0]
content    0
label      0
dtype: int64


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [47]:
print(df['label'].unique())


[]


In [None]:
# Step 1: Reload data
df = pd.read_csv("dataset/WELFake_Dataset.csv")

# Step 2: Drop columns that are completely NaN
df = df.dropna(axis=1, how='all')

# Step 3: Keep only useful columns
df = df[['title', 'text', 'label']]

# Step 4: Merge title and text into 'content'
df['content'] = df['title'].astype(str) + " " + df['text'].astype(str)

# Step 5: Drop rows where label is null
df = df.dropna(subset=['label'])

# Step 6: Check unique values before filtering
print(df['label'].unique())  # << LOOK AT THIS OUTPUT

# Step 7: Filter only valid binary values
df = df[df['label'].astype(str).isin(['0', '1'])]

# Step 8: Convert to int
df['label'] = df['label'].astype(int)

# Step 9: Final check
print(df['label'].unique())


In [None]:
X = df['content']
y = df['label']


In [None]:
print(df['label'].value_counts())


In [None]:
# Keep only 'content' and 'label' columns
df = df[['title', 'text', 'label']]
df['content'] = df['title'].astype(str) + " " + df['text'].astype(str)

# Drop rows with missing content or label
df = df.dropna(subset=['content', 'label'])

# Now split the data
from sklearn.model_selection import train_test_split
X = df['content']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Model training
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
print(y_pred[:10])  # see first 10 predictions


In [None]:
import joblib

# Save the model
joblib.dump(model, 'fake_news_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("✅ Model and vectorizer saved successfully.")


In [None]:
# Load the model and vectorizer
model = joblib.load('fake_news_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Example prediction
sample_text = ["Breaking: PM announces new scheme..."]
sample_tfidf = vectorizer.transform(sample_text)
prediction = model.predict(sample_tfidf)

print("Prediction:", prediction[0])  # 0 = Real, 1 = Fake


In [None]:
user_input = input("tomorrow is going to rain: ")
sample_tfidf = vectorizer.transform([user_input])
result = model.predict(sample_tfidf)[0]

if result == 1:
    print("🔴 This news is FAKE.")
else:
    print("🟢 This news is REAL.")


In [None]:
import tkinter as tk
from tkinter import messagebox
import joblib

# Load the saved model and vectorizer
model = joblib.load("fake_news_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Predict function
def detect_fake_news():
    news_text = entry.get("1.0", tk.END).strip()  # Get text from input box
    if not news_text:
        messagebox.showwarning("Input Error", "Please enter some news content.")
        return
    
    transformed_text = vectorizer.transform([news_text])
    prediction = model.predict(transformed_text)[0]
    
    if prediction == 1:
        result_label.config(text="❌ This news is FAKE!", fg="red")
    else:
        result_label.config(text="✅ This news is REAL!", fg="green")

# Create GUI window
root = tk.Tk()
root.title("Fake News Detector")
root.geometry("500x300")
root.configure(bg="white")

# Heading
tk.Label(root, text="Fake News Detector", font=("Helvetica", 18, "bold"), bg="white").pack(pady=10)

# Text entry box
entry = tk.Text(root, height=7, width=55, font=("Arial", 11))
entry.pack(pady=10)

# Predict button
tk.Button(root, text="Check News", font=("Arial", 12), command=detect_fake_news).pack(pady=5)

# Result label
result_label = tk.Label(root, text="", font=("Arial", 14, "bold"), bg="white")
result_label.pack(pady=10)

# Run the GUI loop
root.mainloop()


In [None]:
user_input = input("tomorrow is holidayy enjoyyy!!: ")
sample_tfidf = vectorizer.transform([user_input])
result = model.predict(sample_tfidf)[0]

if result == 1:
    print("🔴 This news is FAKE.")
else:
    print("🟢 This news is REAL.")


In [None]:
python fake_news_gui.py
