## **1. Import Required Libraries**

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## **2. Download NLTK Resources**

In [3]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/magnus/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/magnus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## **3. Load the Dataset**

In [4]:
df=pd.read_csv('../data/raw/WELFake_Dataset.csv')

## **4. Remove Unnecessary Index Column**

In [5]:
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

## **5. Handle Missing Values in Text Columns**

In [6]:
df['title'] = df['title'].fillna('')
df['text'] = df['text'].fillna('')

## **6. Combine Title and Text into Single Content Column**

In [7]:
df['content'] = df['title'] + " " + df['text']

## **7. Convert Text to Lowercase**

In [8]:
df['content'] = df['content'].str.lower()

## **8. Remove Punctuation and Special Characters**

In [9]:
def remove_punctuation(text):
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    return text

df['content'] = df['content'].apply(remove_punctuation)

## **9. Remove Extra White Spaces**

In [10]:
df['content'] = df['content'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

## **10. Load Stopwords**

In [12]:
stop_words = set(stopwords.words('english'))

## **11. Remove Stopwords**

In [13]:
def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

df['cleaned_content'] = df['content'].apply(remove_stopwords)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/magnus/nltk_data'
    - '/Users/magnus/Desktop/news-credibility-ai/venv/nltk_data'
    - '/Users/magnus/Desktop/news-credibility-ai/venv/share/nltk_data'
    - '/Users/magnus/Desktop/news-credibility-ai/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## **12. Tokenization**

In [14]:
df['tokens'] = df['cleaned_content'].apply(word_tokenize)
df[['cleaned_content', 'tokens']].head()

KeyError: 'cleaned_content'

## **13. Comparison of Content**

In [15]:
df[['content', 'cleaned_content']].head(3)

KeyError: "['cleaned_content'] not in index"

## **14. Check Final Null Values**

In [None]:
df.isnull().sum()

## **15. Save Clean Dataset**

In [None]:
df.to_csv("../data/processed/cleaned_news.csv", index=False)

## **16. Final Dataset Export**

In [None]:
import os
os.makedirs("../data/processed", exist_ok=True)
df.to_csv("../data/processed/cleaned_news.csv", index=False)

## **17. Conclusion**

> ### **Conclusion Summary**
> The text preprocessing pipeline has successfully normalized the dataset by performing the following core transformations:
>
> *   **Normalization**: Converted all content to lowercase to ensure consistency across the corpus.
> *   **Sanitization**: Removed punctuation and special characters to reduce noise.
> *   **Tokenization**: Segmented the cleaned text into individual word tokens.
> *   **Filtering**: Removed common English stopwords to focus on information-dense terms.
>
> **Next Steps**: The preprocessed data is now ready for **Feature Extraction** where we will convert these tokens into numerical vectors (e.g., using TF-IDF or Word Embeddings).