In [20]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [21]:
# Read the labelled articles; the path is relative to the notebook's working directory.
data = pd.read_csv("fake_news_dataset.csv")

# Plain-text preview: a heading line followed by the first rows of the frame.
print("Sample data:", data.head(), sep="\n")

Sample data:
                                  title  \
0               Foreign Democrat final.   
1   To offer down resource great point.   
2          Himself church myself carry.   
3                  You unit its should.   
4  Billion believe employee summer how.   

                                                text        date    source  \
0  more tax development both store agreement lawy...  2023-03-10  NY Times   
1  probably guess western behind likely next inve...  2022-05-25  Fox News   
2  them identify forward present success risk sev...  2022-09-01       CNN   
3  phone which item yard Republican safe where po...  2023-02-07   Reuters   
4  wonder myself fact difficult course forget exa...  2023-04-03       CNN   

                 author    category label  
0          Paula George    Politics  real  
1           Joseph Hill    Politics  fake  
2        Julia Robinson    Business  fake  
3  Mr. David Foster DDS     Science  fake  
4         Austin Walker  Technology  fa

In [22]:
def clean_text(text):
    """Normalize raw article text before it is vectorized.

    Steps, in order:
      1. Replace every http:// or https:// link with the placeholder " URL ".
      2. Replace every character that is not an ASCII letter or digit with a space.
      3. Lower-case the result (so the placeholder ends up as "url").

    Runs of spaces are not collapsed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # The word boundary plus the explicit "://" keep ordinary words that merely
    # contain "http" (e.g. "httpd") from being swallowed as links, and
    # IGNORECASE also catches links written as "HTTP://...".
    text = re.sub(r"\bhttps?://\S+", " URL ", text, flags=re.IGNORECASE)
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)  # keep only ASCII letters and digits
    return text.lower()

# -------------------
# Preprocess data
# -------------------
# Missing article bodies are replaced with empty strings before the cast:
# casting a missing value straight to str would yield the literal string
# "nan", which would then be passed on to the vectorizer as if it were a word.
data["text"] = data["text"].fillna("").astype(str).apply(clean_text)

# normalize labels (handle 'FAKE'/'fake', 'REAL'/'real')
y = data["label"].str.lower()
X = data["text"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [23]:
# TF-IDF features over the cleaned text, with scikit-learn's built-in English
# stop-word list and a document-frequency cap of 0.5.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.5,
)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [24]:
# Logistic-regression classifier with balanced class weights and an iteration
# limit of 1000.
model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
)

# Left as the cell's final expression so the notebook displays the fitted estimator.
model.fit(X_train_vec, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [25]:
# Predict labels for the held-out split and report the standard metrics.
# NOTE(review): the output recorded for this cell shows ~0.51 accuracy on a
# near-balanced test set (2029 fake / 1971 real), i.e. roughly what random
# guessing would achieve -- confirm the text actually carries signal before
# relying on the saved model.
y_pred = model.predict(X_test_vec)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 0.509

Classification Report:
              precision    recall  f1-score   support

        fake       0.52      0.50      0.51      2029
        real       0.50      0.52      0.51      1971

    accuracy                           0.51      4000
   macro avg       0.51      0.51      0.51      4000
weighted avg       0.51      0.51      0.51      4000


Confusion Matrix:
[[1014 1015]
 [ 949 1022]]


In [26]:
# Persist the fitted classifier and vectorizer to the working directory so that
# predictions can later be made without retraining.
for _artifact, _filename in (
    (model, "fake_news_model.pkl"),
    (vectorizer, "vectorizer.pkl"),
):
    joblib.dump(_artifact, _filename)
print("\nModel and vectorizer saved!")



Model and vectorizer saved!


In [27]:
def predict_news(news_text, model=None, vectorizer=None):
    """Classify one piece of news text as real or fake.

    Args:
        news_text: Raw article text. It is passed through ``clean_text`` before
            being vectorized.
        model: Optional fitted classifier exposing ``predict``. When omitted,
            it is loaded from "fake_news_model.pkl".
        vectorizer: Optional fitted vectorizer exposing ``transform``. When
            omitted, it is loaded from "vectorizer.pkl".

    Returns:
        "Real News ✅" if the predicted label equals "real", otherwise
        "Fake News ❌".
    """
    # Passing in already-loaded objects avoids re-reading both files from disk
    # on every call. NOTE: joblib files are pickles, and unpickling can execute
    # arbitrary code -- only load artifacts that come from a trusted source.
    if model is None:
        model = joblib.load("fake_news_model.pkl")
    if vectorizer is None:
        vectorizer = joblib.load("vectorizer.pkl")

    cleaned_text = clean_text(news_text)
    # transform() is given a one-element list, so the result holds a single row.
    news_vec = vectorizer.transform([cleaned_text])
    prediction = model.predict(news_vec)[0]

    return "Real News ✅" if prediction == "real" else "Fake News ❌"


In [30]:
# Sample article used to smoke-test predict_news.
# NOTE(review): this text appears to be row 0 of fake_news_dataset.csv (labelled
# 'real' in the preview output), so it may have been part of the training split;
# a correct answer here is not evidence that the model generalizes.
new_news = "more tax development both store agreement lawyer hear outside continue reach difference yeah figure your power fear identify there protect security great national nothing fast story why late nearly bit cost tough since question to power almost future young conference behind ahead building teach million box receive Mrs risk benefit month compare environment class imagine you vote community reason set once idea him answer many how purpose deep training game own true language garden of partner result face military discover discover data glass bed maintain test way development across top culture glass yes decision hope necessary as trade organization talk debate peace stay community development six wide write itself several fight teach billion for common fear we personal church establish store kind hundred debate hotel cut sister audience sound case that stay within information trouble be debate great themselves responsibility force people hundred bar miss others sometimes build room interesting however charge what especially north no especially us travel industry about including face ten behind black series place age soldier early trouble middle would along case what money significant sound song reason poor free want thank cultural range shoulder rest movie political fear hear past leader up edge professor determine law act change middle prove say notice travel open director argue economic seven game matter season"

print("\nTest Prediction:", predict_news(news_text=new_news))



Test Prediction: Real News ✅


In [29]:
# Re-read the CSV into its own frame; `data` had its "text" column overwritten
# with cleaned text earlier, whereas `df` holds the file's contents as read.
df = pd.read_csv("fake_news_dataset.csv")

# Column names, first rows and per-label counts, one after the other.
print(df.columns, df.head(), df["label"].value_counts(), sep="\n")

# Text of the first article (use a different column name here if the text
# lives in another column).
first_row_text = df["text"].iloc[0]
print(first_row_text)

Index(['title', 'text', 'date', 'source', 'author', 'category', 'label'], dtype='object')
                                  title  \
0               Foreign Democrat final.   
1   To offer down resource great point.   
2          Himself church myself carry.   
3                  You unit its should.   
4  Billion believe employee summer how.   

                                                text        date    source  \
0  more tax development both store agreement lawy...  2023-03-10  NY Times   
1  probably guess western behind likely next inve...  2022-05-25  Fox News   
2  them identify forward present success risk sev...  2022-09-01       CNN   
3  phone which item yard Republican safe where po...  2023-02-07   Reuters   
4  wonder myself fact difficult course forget exa...  2023-04-03       CNN   

                 author    category label  
0          Paula George    Politics  real  
1           Joseph Hill    Politics  fake  
2        Julia Robinson    Business  fake  
3  Mr.

In [31]:
# Filter rows where the 'label' column is 'real'. Labels are lower-cased before
# the comparison, the same normalization applied to the training labels, so
# variants such as 'REAL' are not silently dropped.
df_real = df[df["label"].str.lower() == "real"]
print(df_real)


                                                  title  \
0                               Foreign Democrat final.   
5      Method purpose mission approach professor short.   
9               Reason physical contain total decision.   
10              Whose particular contain current those.   
11                   Region itself from machine forget.   
...                                                 ...   
19992                       Rule participant catch her.   
19993               Money himself challenge once along.   
19994                  Blue believe enjoy north return.   
19996             Though nation people maybe price box.   
19997                   Yet exist with experience unit.   

                                                    text        date  \
0      more tax development both store agreement lawy...  2023-03-10   
5      affect too bill whether kind project turn offi...  2023-12-02   
9      choose anything treat beyond political minute ...  2022-11-08   
10 

In [39]:

# Lift pandas' column-width limit so each article is printed in full rather
# than being truncated. This is a global display option: it stays in effect
# for every later cell.
pd.set_option('display.max_colwidth', None)

# Text of the first ten articles labelled 'real'. Labels are lower-cased before
# the comparison, the same normalization applied to the training labels.
real_texts_10 = df.loc[df["label"].str.lower() == "real", "text"].head(10)

print(real_texts_10)

0                                                                                                                                                                                                                                                                                                                                               more tax development both store agreement lawyer hear outside continue reach difference yeah figure your power fear identify there protect security great national nothing fast story why late nearly bit cost tough since question to power almost future young conference behind ahead building teach million box receive Mrs risk benefit month compare environment class imagine you vote community reason set once idea him answer many how purpose deep training game own true language garden of partner result face military discover discover data glass bed maintain test way development across top culture glass yes decision hope necessary as trade organization talk deba