In [19]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

In [21]:
# Download stopwords (only need to do this once)
# nltk.download('stopwords')

In [23]:
# Initialize stemmer and stopwords
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

In [25]:
# Function to clean and preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    
    # Remove newline characters
    text = re.sub(r'\n', '', text)
    
    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Apply stemming
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    return text


In [27]:
# Function to preprocess text in parallel
def preprocess_parallel(texts):
    return Parallel(n_jobs=-1)(delayed(preprocess_text)(text) for text in texts)

In [29]:
data_fake = pd.read_csv(r"C:\Users\monil\Desktop\Graduate Project\resources\datasets\Fake.csv")
data_true = pd.read_csv(r"C:\Users\monil\Desktop\Graduate Project\resources\datasets\True.csv")              

In [31]:
data_fake['label'] = 1  # 1 for fake news
data_true['label'] = 0  # 0 for real news

# Remove 'subject' and 'date' columns
data_fake = data_fake.drop(columns=['title','subject', 'date'])
data_true = data_true.drop(columns=['title','subject', 'date'])

In [33]:
data_fake.shape, data_true.shape

((23481, 2), (21417, 2))

In [35]:
# data_fake.head()

In [37]:
# data_true.head()

In [39]:
# Merge the datasets
data = pd.concat([data_fake, data_true], ignore_index=True)

# Shuffle the data
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [41]:
# Check for missing values
print("Missing values in 'text' column:", data['text'].isnull().sum())

# Drop rows with missing text
data = data.dropna(subset=['text'])

Missing values in 'text' column: 0


In [43]:
# Preprocess text in parallel
data['text'] = preprocess_parallel(data['text'])

In [44]:
# data.head()

In [53]:
X = data['text']
Y = data['label']

In [55]:
# sns.countplot(x='label', data=data)
# plt.title('Distribution of Fake (1) and Real (0) News')
# plt.show()

In [57]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [59]:
# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [60]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

In [61]:
# Make predictions
predictions = model.predict(X_test_vectorized)

In [62]:
# model.score(X_test_vectorized, y_test) 

In [71]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print("\n\n")
print(classification_report(y_test, predictions))

Accuracy: 0.9848552338530067



              precision    recall  f1-score   support

           0       0.98      0.99      0.98      5362
           1       0.99      0.98      0.99      5863

    accuracy                           0.98     11225
   macro avg       0.98      0.99      0.98     11225
weighted avg       0.98      0.98      0.98     11225



In [64]:
import joblib

# Save the model and vectorizer
joblib.dump(model, 'fake_news_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!
