## **1. Import Required Libraries**

In [7]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## **2. Download NLTK Resources**

In [8]:
# Fetch the NLTK data this notebook relies on: tokenizer models for
# word_tokenize and the stopword lists read via stopwords.words().
nltk.download('punkt')
# NOTE(review): presumably fetched because newer NLTK releases load
# 'punkt_tab' rather than 'punkt' — confirm against the installed version.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/magnus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/magnus/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/magnus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## **3. Load the Dataset**

In [9]:
# Read the raw WELFake CSV; the path is relative to the kernel's working directory.
raw_csv_path = '../data/raw/WELFake_Dataset.csv'
df = pd.read_csv(raw_csv_path)

## **4. Remove Unnecessary Index Column**

In [10]:
# 'Unnamed: 0' is dropped when present; errors='ignore' turns this into a
# no-op when the column is absent, so the cell can be re-run safely.
index_artifact_columns = ['Unnamed: 0']
df = df.drop(columns=index_artifact_columns, errors='ignore')

## **5. Handle Missing Values in Text Columns**

In [11]:
# Replace missing titles and article bodies with empty strings so the
# string operations further down never receive NaN.
for text_column in ('title', 'text'):
    df[text_column] = df[text_column].fillna('')

## **6. Build the Content Column from Article Text Only**

In [12]:
# Use text-only (no title) to match what the backend receives at inference
# `text` was already null-filled in the previous step; the fillna here keeps
# this cell correct even when it is run on its own.
df['content'] = df['text'].fillna('')
# Last expression: show the new column as a quick sanity check.
df['content']

0        No comment is expected from Barack Obama Membe...
1           Did they post their votes for Hillary already?
2         Now, most of the demonstrators gathered last ...
3        A dozen politically active pastors came here f...
4        The RS-28 Sarmat missile, dubbed Satan 2, will...
                               ...                        
72129    WASHINGTON (Reuters) - Hackers believed to be ...
72130    You know, because in fantasyland Republicans n...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    MEXICO CITY (Reuters) - Donald Trumpâ€™s combati...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: content, Length: 72134, dtype: object

## **7. Convert Text to Lowercase**

In [13]:
# Lowercase everything up front: the stopword filter later compares tokens
# with an exact `word not in stop_words` membership test.
df['content'] = df['content'].str.lower()

## **8. Remove Punctuation and Special Characters**

In [14]:
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def remove_punctuation(text):
    """Replace every character that is not an ASCII letter or whitespace with a space.

    Digits, punctuation, and non-ASCII characters are each substituted
    one-for-one by a single space, so the string length is unchanged.
    Existing whitespace (including tabs and newlines) is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _NON_LETTER_PATTERN.sub(' ', text)

# Run the cleaner over every row and overwrite `content` with the result.
content_without_punctuation = df['content'].apply(remove_punctuation)
df['content'] = content_without_punctuation

## **9. Remove Extra White Spaces**

In [15]:
def _collapse_whitespace(text):
    """Squeeze each run of whitespace to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()


df['content'] = df['content'].apply(_collapse_whitespace)

## **10. Load Stopwords**

In [16]:
# Materialise NLTK's English stopword list as a set so the
# `not in stop_words` test in the stopword filter is a hash lookup.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

## **11. Remove Stopwords**

In [17]:
def remove_stopwords(text):
    """Tokenize ``text`` and return it with stopwords removed.

    Tokens come from ``word_tokenize``; any token found in the notebook-level
    ``stop_words`` set is dropped. Matching is an exact membership test, and
    the surviving tokens are re-joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        A space-separated string of the tokens that are not stopwords.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


# Write the filtered text to a new column so `content` stays available
# for the before/after comparison.
df['cleaned_content'] = df['content'].apply(remove_stopwords)

## **12. Tokenization**

In [23]:
# Split the stopword-free text into a list of tokens per article.
# NOTE(review): `cleaned_content` was itself built by joining word_tokenize
# output, so this runs the tokenizer over the same text a second time.
df['tokens'] = df['cleaned_content'].apply(word_tokenize)
# Preview the first rows only.
df['tokens'].head()

0    [comment, expected, barack, obama, members, fy...
1                      [post, votes, hillary, already]
2    [demonstrators, gathered, last, night, exercis...
3    [dozen, politically, active, pastors, came, pr...
4    [rs, sarmat, missile, dubbed, satan, replace, ...
Name: tokens, dtype: object

## **13. Compare Content Before and After Stopword Removal**

In [19]:
# Side-by-side preview of the first three rows, before and after stopword removal.
df[['content', 'cleaned_content']].head(3)

Unnamed: 0,content,cleaned_content
0,no comment is expected from barack obama membe...,comment expected barack obama members fyf fuky...
1,did they post their votes for hillary already,post votes hillary already
2,now most of the demonstrators gathered last ni...,demonstrators gathered last night exercising c...


## **14. Check Final Null Values**

In [20]:
# Count NaN cells per column. Empty strings are not counted here: missing
# titles/texts were replaced with '' earlier, so rows with no usable text
# still report as non-null.
df.isnull().sum()

title              0
text               0
label              0
content            0
cleaned_content    0
tokens             0
dtype: int64

## **15. Save Clean Dataset**

In [21]:
# Persist the full frame (title, text, label, content, cleaned_content,
# tokens) without the row index.
# NOTE(review): `tokens` holds Python lists, which CSV can only store as
# text — confirm downstream readers re-parse or re-tokenize. Empty strings
# are also likely to be read back as NaN by a default pd.read_csv — verify.
cleaned_csv_path = "../data/processed/cleaned_news.csv"
df.to_csv(cleaned_csv_path, index=False)

## **16. Conclusion**

The text preprocessing pipeline successfully cleaned and normalized the news articles by:

*   **Normalization**: Converting text to lowercase.
*   **Sanitization**: Removing punctuation and special characters.
*   **Filtering**: Eliminating stopwords.
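*   **Noise Reduction**: Collapsing repeated whitespace and trimming leading and trailing spaces.
*   **Tokenization**: Splitting the cleaned text into a list of word tokens per article.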