# Data preprocessing

In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Fetch the NLTK resources this notebook relies on:
#   - 'stopwords' supplies the list read via stopwords.words("english")
#   - 'wordnet' is the corpus WordNetLemmatizer looks words up in
# The captured log shows packages that are already present are reported as
# up-to-date rather than downloaded again.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Read both source CSVs from the working directory (True.csv first, then Fake.csv)
true_data, fake_data = (pd.read_csv(csv_name) for csv_name in ("True.csv", "Fake.csv"))

In [5]:
# Tag every row with its class id: 1 = real news, 0 = fake news
for frame, class_id in ((true_data, 1), (fake_data, 0)):
    frame["label"] = class_id

In [6]:
# Stack the two labelled frames row-wise and give the result a fresh 0..n-1 index
data = pd.concat((true_data, fake_data), ignore_index=True)

In [7]:
# Shared helpers for the text-cleaning step
stop_words = {*stopwords.words("english")}  # a set, so membership checks are fast
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()  # NOTE(review): not referenced by the cells visible here

# Preprocessing function

In [8]:
def preprocess_text(text):
    """Normalize one document before vectorization.

    Steps, in order: delete every character that is neither a word character
    nor whitespace, lowercase, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, and lemmatize the remaining tokens with
    the module-level ``lemmatizer``.

    Args:
        text: Raw document text. Non-string values (for example the ``NaN``
            pandas uses for missing cells, or ``None``) are treated as empty.

    Returns:
        The surviving lemmatized tokens joined by single spaces, or ``""``
        when nothing survives or the input is not a string.
    """
    # re.sub raises TypeError on NaN/None, which would abort an entire
    # Series.apply over a column that has even one missing value.
    if not isinstance(text, str):
        return ""
    # \w matches letters, digits and underscores, so underscores are kept.
    stripped = re.sub(r'[^\w\s]', '', text)
    tokens = stripped.lower().split()
    # Stop words are filtered on the raw token, before lemmatization.
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Applying preprocessing function

In [9]:
# Build the model input: headline and body joined into one string per row.
# A missing title or body is NaN in pandas, and NaN + str stays NaN, so one
# missing field would otherwise wipe out the row's entire text. Blank them first.
# NOTE: this overwrites data["text"]; re-running the cell would prepend the
# title to already-cleaned text, so re-run from the load cell instead.
data["text"] = data["title"].fillna("") + " " + data["text"].fillna("")  # Combine title and body
data["text"] = data["text"].apply(preprocess_text)


# Splitting data into training and testing

In [11]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# real/fake ratio the same in both partitions; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Convert text to numerical form using TF-IDF

In [12]:
# Learn the TF-IDF vocabulary (capped at 5000 features) from the training split
# only, then project the held-out split onto that same fitted vocabulary.
tfidf_settings = {"max_features": 5000}
vectorizer = TfidfVectorizer(**tfidf_settings)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)