# Preprocessing the data 

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Read the combined news CSV from its location on disk into a DataFrame
data_path = r"C:\Users\hp\Desktop\Projects\Fake_news_detection\data\combined_news.csv"
df = pd.read_csv(data_path)

In [3]:

# Fetch the NLTK resources used below (only needs to succeed once per machine)
for _pkg in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_pkg)

# Build the lemmatizer and the English stopword set used by clean_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define text preprocessing function
def clean_text(text):
    """Normalize one document into a space-separated string of lemmas.

    Steps: lowercase, delete every character that is neither a word
    character nor whitespace, tokenize with ``nltk.word_tokenize``, drop
    tokens found in the module-level ``stop_words`` set, and lemmatize the
    remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: Raw document. Non-string values are converted with ``str``;
            ``None`` and float NaN (a missing cell) are treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the
        input is missing.
    """
    # A missing value would otherwise be stringified into the literal token
    # "none" / "nan" and survive as a real word. NaN is the only float that
    # compares unequal to itself, so `text != text` detects it without an
    # extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()                              # Lowercase
    text = re.sub(r'[^\w\s]', '', text)                   # Remove punctuation
    words = nltk.word_tokenize(text)                      # Tokenize
    words = [
        lemmatizer.lemmatize(w)                           # Lemmatize
        for w in words
        if w not in stop_words                            # Remove stopwords
    ]
    return ' '.join(words).strip()                        # Join and remove extra spaces

# Run clean_text over every value of the 'text' column and store the result
df['clean_text'] = df['text'].map(clean_text)

# Print the first rows with the raw and cleaned columns side by side
print(df.head()[['text', 'clean_text']])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                text  \
0  WASHINGTON (Reuters) - Supreme Court justices ...   
1  WASHINGTON (Reuters) - Facing slumping poll nu...   
2  It s time the American people wake up to the d...   
3  If we didn t know better, we d almost believe ...   
4  Miss Universe 1996 Alicia Machado is now an Am...   

                                          clean_text  
0  washington reuters supreme court justice monda...  
1  washington reuters facing slumping poll number...  
2  time american people wake danger treasonous na...  
3  know better almost believe federal government ...  
4  miss universe 1996 alicia machado american cit...  
