In [3]:
!pip install pandas scikit-learn nltk



In [4]:
import pandas as pd

# Load datasets (one CSV per class)
fake = pd.read_csv('Fake.csv')
real = pd.read_csv('True.csv')

# Add a label column: 0 = fake, 1 = real
fake['label'] = 0
real['label'] = 1

# Combine the datasets
data = pd.concat([fake, real], axis=0)

# Shuffle the dataset.
# A fixed random_state keeps the row order identical across kernel restarts;
# without it the later seeded train/test split would still select different
# articles on every run, so the reported metrics could not be reproduced.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Quick look: first rows and class balance
print(data.head())
print(data['label'].value_counts())

                                               title  \
0   BREAKING: Israel’s “Worst Fears Confirmed,” S...   
1  U.N. offers to help resolve Baghdad, Kurdistan...   
2  Trump Transition: As Secretary of State, Tulsi...   
3  UNIVERSITY PRESIDENT APOLOGIZES TO TRAUMATIZED...   
4  ERIC HOLDER Encourages DOJ To Keep Attacking T...   

                                                text    subject  \
0  In a bombshell revelation, The New York Times ...       News   
1  BAGHDAD (Reuters) - The United Nations has off...  worldnews   
2  Patrick Henningsen 21st Century WireSo far as ...    US_News   
4  The most corrupt Attorney General in the histo...  left-news   

                  date  label  
0         May 16, 2017      0  
1  September 28, 2017       1  
2    November 23, 2016      0  
3         Jul 18, 2016      0  
4          Jul 1, 2017      0  
label
0    23481
1    21417
Name: count, dtype: int64


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Fetch the NLTK resources that the preprocessing step relies on.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Shared helpers used by preprocess_text: a lemmatizer instance and the
# English stopword list held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text) -> str:
    """Normalise a raw article string for vectorisation.

    Steps: lowercase, delete ASCII punctuation (``string.punctuation``),
    split on whitespace, drop tokens found in ``stop_words``, lemmatize the
    remaining tokens with ``lemmatizer`` and re-join them with single spaces.

    Non-ASCII punctuation (e.g. curly quotes) is not in
    ``string.punctuation`` and is therefore left in place.

    Args:
        text: The raw text. Non-string values (such as ``None`` or the NaN
            that pandas uses for empty CSV cells) are treated as empty text.

    Returns:
        The cleaned, space-separated token string; ``''`` when there is
        nothing left after cleaning or the input was not a string.
    """
    # Missing cells arrive as float NaN / None, which have no .lower().
    if not isinstance(text, str):
        return ''

    # Lowercase, then strip punctuation with a single C-level translate call.
    text = text.lower().translate(_PUNCT_TABLE)

    # Tokenize and remove stopwords + lemmatize
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Clean every article body and store the result next to the raw text.
cleaned_text = data['text'].map(preprocess_text)
data['processed_text'] = cleaned_text

# Preview the cleaned column
print(data['processed_text'].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


0    bombshell revelation new york time named israe...
1    baghdad reuters united nation offered help sol...
2    patrick henningsen 21st century wireso far wor...
4    corrupt attorney general history united state ...
Name: processed_text, dtype: object


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Target vector: the 0/1 label column
y = data['label']

# TF-IDF vectorizer that ignores terms present in more than 70% of documents
vectorizer = TfidfVectorizer(max_df=0.7)

# Learn the vocabulary/IDF weights and build the document-term matrix.
# NOTE(review): this fits on every row, and the train/test split happens
# afterwards, so held-out documents contribute to the vocabulary and IDF
# statistics. Consider splitting first and fitting on the training part only.
corpus = data['processed_text']
X = vectorizer.fit_transform(corpus)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed fixes which rows are chosen
# for a given input order.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each feature matrix
for caption, features in (("Train size:", X_train), ("Test size:", X_test)):
    print(caption, features.shape)

Train size: (35918, 219464)
Test size: (8980, 219464)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression classifier with scikit-learn's default settings
# (fit() returns the estimator itself, so it can be chained).
model = LogisticRegression().fit(X_train, y_train)

# Label the held-out rows
y_pred = model.predict(X_test)

# Compute the metrics first, then print them in the same order as before.
# NOTE(review): the sample rows printed earlier show a "(Reuters)" dateline in
# the real-news text; if that pattern holds across the file, the classifier may
# be keying on source artifacts rather than content — verify before trusting
# the score on out-of-dataset text.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", conf_mat)

Accuracy: 0.9864142538975501

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4675
           1       0.99      0.99      0.99      4305

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4611   64]
 [  58 4247]]


In [9]:
# Text to classify — replace with any headline or article body
input_news = "Breaking: Government announces new economic policy to support farmers."

# Apply the same cleaning used for the training data, then map the result
# into the feature space of the already-fitted TF-IDF vectorizer
processed_input = preprocess_text(input_news)
vectorized_input = vectorizer.transform([processed_input])

# The batch holds a single row, so take the first (only) predicted label
prediction = model.predict(vectorized_input)[0]

# Translate the numeric label (1 = real, anything else = fake) into a message
if prediction == 1:
    result = "🟢 Real News"
else:
    result = "🔴 Fake News"
print("Prediction:", result)

Prediction: 🔴 Fake News


In [10]:
# Read the text to classify from the user
input_news = input("Enter a news article or headline: ")

# Same cleaning and TF-IDF mapping as used for the training data
processed_input = preprocess_text(input_news)
vectorized_input = vectorizer.transform([processed_input])

if vectorized_input.nnz == 0:
    # Empty input, or none of its words are in the fitted vocabulary: the
    # feature row is all zeros, so any label would reflect only the model's
    # intercept and not the text. Say so instead of printing a verdict.
    prediction = None
    result = "⚪ Unable to classify (no recognizable words in input)"
else:
    # Single-row batch, so take the first (only) predicted label
    prediction = model.predict(vectorized_input)[0]
    result = "🟢 Real News" if prediction == 1 else "🔴 Fake News"
print("Prediction:", result)

Enter a news article or headline: NASA announces water on mars
Prediction: 🔴 Fake News


In [11]:
import pickle

# Save model and vectorizer.
# Context managers make sure each file is flushed and closed even if
# pickling raises part-way through.
# NOTE: only unpickle these files from a trusted location — pickle.load can
# execute arbitrary code.
with open('fake_news_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)