In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Read the labelled news articles from the CSV file into a DataFrame
news_csv_path = '/content/news.csv'
data = pd.read_csv(news_csv_path)



In [None]:
# Preview the first five rows to sanity-check the loaded columns
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
# Collect the DataFrame's column labels into a plain Python list
actual_columns = list(data.columns)

In [None]:
# Show the column names so the cells below can be checked against them
print("Actual columns in the DataFrame: " + str(actual_columns))


Actual columns in the DataFrame: ['Unnamed: 0', 'title', 'text', 'label', 'content']


In [None]:
# Discard any article that is missing its headline or its body text;
# both are needed to build the combined 'content' feature later on.
required_columns = ['title', 'text']
data = data.dropna(subset=required_columns, how='any')

In [None]:
# Count the remaining missing entries in each column and report them
missing_per_column = data.isna().sum()
print("Missing values:\n", missing_per_column)

Missing values:
 Unnamed: 0    0
title         0
text          0
label         0
content       0
dtype: int64


In [None]:
# Join each headline and its article body (separated by one space) so the
# model sees the full context of the story in a single 'content' column.
headline = data['title']
body = data['text']
data['content'] = headline + " " + body

In [None]:
# Text cleaning helpers: the stop-word list is loaded once and then reused,
# instead of being re-read from the NLTK corpus for every single document.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise one document before TF-IDF vectorisation.

    The text is lowercased, every character that is not ``a-z`` or
    whitespace is deleted, stop words are dropped, and the remaining words
    are joined with single spaces.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or a float NaN coming from pandas) is treated as an
            empty document.
        stop_words: Optional collection of lowercase words to remove.
            When omitted, NLTK's English stop-word list is used.

    Returns:
        The cleaned text as one space-separated string (possibly empty).
    """
    # Missing values cannot be lowercased; treat them as empty documents
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _english_stop_words()
    # Convert to lowercase
    text = text.lower()
    # Remove special characters; they are deleted rather than replaced by a
    # space, so "well-known" becomes "wellknown" and "don't" becomes "dont"
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stop words; split()/join also collapses repeated whitespace
    return ' '.join(word for word in text.split() if word not in stop_words)

# Rebuild the raw headline + body text first, so re-running this cell always
# starts from uncleaned text, then store the cleaned version in 'content'.
raw_content = data['title'] + " " + data['text']
data['content'] = raw_content.map(clean_text)


In [None]:
# Encode labels (Fake = 0, Real = 1)
# Find the label column whatever its capitalisation; str() guards against
# non-string column names.
label_col = next((col for col in data.columns if str(col).lower() == 'label'), None)

# Fail fast: without a label column X_train / y_train would stay undefined
# (or stale from an earlier run) and the cells below would break confusingly.
if label_col is None:
    raise KeyError("'Label' column not found in the DataFrame. Please check column names.")

# Use an explicit mapping so that a missing or misspelled label is reported
# instead of being silently counted as Fake.
label_map = {'fake': 0, 'real': 1}
normalized_labels = data[label_col].astype(str).str.strip().str.lower()
encoded_labels = normalized_labels.map(label_map)

unrecognized = encoded_labels.isna()
if unrecognized.any():
    bad_values = data.loc[unrecognized, label_col].unique().tolist()
    raise ValueError(f"Unrecognized label values in '{label_col}': {bad_values}")

data['Label_Encoded'] = encoded_labels.astype('int64')

# Define features (X) and labels (y)
X = data['content']
y = data['Label_Encoded']  # Use the encoded label column

# Split into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Build the TF-IDF vectorizer; the vocabulary is capped to keep the feature
# matrix at a manageable width.
max_tfidf_features = 5000
tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_features)

# Learn the vocabulary and IDF weights from the training documents only,
# then reuse that fitted vocabulary to encode the test documents.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
# Create a Logistic Regression classifier with default settings and fit it
# on the TF-IDF features of the training documents
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)


In [None]:
# Score the held-out test documents with the trained classifier
y_pred = model.predict(X_test_tfidf)

# Overall share of correctly classified test documents
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Per-class precision / recall / F1 (class 0 = Fake, class 1 = Real)
class_names = ['Fake', 'Real']
report = classification_report(y_test, y_pred, target_names=class_names)
print("Classification Report:\n", report)


Model Accuracy: 0.9187056037884768
Classification Report:
               precision    recall  f1-score   support

        Fake       0.91      0.93      0.92       628
        Real       0.93      0.91      0.92       639

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [None]:
import joblib

# Persist the fitted classifier together with the vectorizer it was trained
# with; both files are needed to make predictions later.
model_path = 'fake_news_model.pkl'
vectorizer_path = 'tfidf_vectorizer.pkl'
joblib.dump(model, model_path)
joblib.dump(tfidf_vectorizer, vectorizer_path)


['tfidf_vectorizer.pkl']

In [None]:
# Load saved model and vectorizer
# NOTE: joblib files are pickles; only load artifacts you created or trust.
loaded_model = joblib.load('fake_news_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Example: Predicting on a new piece of text
new_text = ["Breaking news: A major event just unfolded!"]
# The vectorizer was fitted on clean_text() output, so new documents must go
# through the same cleaning; otherwise tokens such as "don't" or "U.S." are
# split differently at prediction time than they were during training.
new_text_clean = [clean_text(doc) for doc in new_text]
new_text_tfidf = loaded_vectorizer.transform(new_text_clean)
prediction = loaded_model.predict(new_text_tfidf)

print("Prediction:", "Real" if prediction[0] == 1 else "Fake")


Prediction: Fake
