#   2. Data Cleaning & Preprocessing

## Clean the combined dataset

## 1. Load Dataset

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import string


In [2]:
combined_fake_news_dataset_df = pd.read_csv("../data/interim/combined_fake_news_dataset.csv")

In [3]:
combined_fake_news_dataset_df.head()

Unnamed: 0,text,label,original_label,dataset
0,(Reuters) - President Donald Trump is set to s...,1,1,ISOT
1,"LANY, Czech Republic (Reuters) - Czech electio...",1,1,ISOT
2,Says comprehensive immigration reform will add...,0,half-true,LIAR
3,Inside Nikki Bella's Strong Support System Fol...,1,1,FakeNewsNet_Minimal
4,Ellen DeGeneres makes joke about Jennifer Anis...,0,0,FakeNewsNet_Minimal


In [4]:
combined_fake_news_dataset_df.tail()

Unnamed: 0,text,label,original_label,dataset
80880,"Says Rep. Maxine Waters, D-Calif., only needs ...",0,half-true,LIAR
80881,Remember when colleges and university were one...,0,0,ISOT
80882,Yolanda Hadid Addresses Gigi Hadid Pregnancy R...,0,0,FakeNewsNet_Minimal
80883,The Governor did not consult members of his ow...,0,false,LIAR
80884,WASHINGTON (Reuters) - The U.S. Senate will pu...,1,1,ISOT


In [5]:
combined_fake_news_dataset_df.describe()

Unnamed: 0,label
count,80885.0
mean,0.536132
std,0.498696
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [6]:
combined_fake_news_dataset_df.duplicated().sum()

np.int64(7620)

In [7]:
# missing values

combined_fake_news_dataset_df.isnull().sum()

text              0
label             0
original_label    0
dataset           0
dtype: int64

## 2. Drop Duplicates

In [8]:
combined_fake_news_dataset_df.drop_duplicates(inplace=True)

In [9]:
combined_fake_news_dataset_df

Unnamed: 0,text,label,original_label,dataset
0,(Reuters) - President Donald Trump is set to s...,1,1,ISOT
1,"LANY, Czech Republic (Reuters) - Czech electio...",1,1,ISOT
2,Says comprehensive immigration reform will add...,0,half-true,LIAR
3,Inside Nikki Bella's Strong Support System Fol...,1,1,FakeNewsNet_Minimal
4,Ellen DeGeneres makes joke about Jennifer Anis...,0,0,FakeNewsNet_Minimal
...,...,...,...,...
80879,"Conservatives are, once again, on the losing s...",0,0,ISOT
80880,"Says Rep. Maxine Waters, D-Calif., only needs ...",0,half-true,LIAR
80882,Yolanda Hadid Addresses Gigi Hadid Pregnancy R...,0,0,FakeNewsNet_Minimal
80883,The Governor did not consult members of his ow...,0,false,LIAR


## 3. Clean Text

In [10]:
# Clean text
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML
    text = re.sub(r"http\S+|www\S+", "", text)            # Remove URLs
    text = text.lower()                                   # Lowercase
    text = re.sub(r'\[.*?\]', '', text)                   # Remove text in brackets
    text = re.sub(r'\w*\d\w*', '', text)                  # Remove words with numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()              # Remove extra whitespace
    return text

In [11]:
combined_fake_news_dataset_df['clean_text'] =  combined_fake_news_dataset_df['text'].apply(clean_text)


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML


In [12]:
combined_fake_news_dataset_df.head()

Unnamed: 0,text,label,original_label,dataset,clean_text
0,(Reuters) - President Donald Trump is set to s...,1,1,ISOT,reuters president donald trump is set to sign ...
1,"LANY, Czech Republic (Reuters) - Czech electio...",1,1,ISOT,lany czech republic reuters czech election win...
2,Says comprehensive immigration reform will add...,0,half-true,LIAR,says comprehensive immigration reform will add...
3,Inside Nikki Bella's Strong Support System Fol...,1,1,FakeNewsNet_Minimal,inside nikki bellas strong support system foll...
4,Ellen DeGeneres makes joke about Jennifer Anis...,0,0,FakeNewsNet_Minimal,ellen degeneres makes joke about jennifer anis...


## 4. Tokenization

In [13]:
combined_fake_news_dataset_df['tokens'] = combined_fake_news_dataset_df['clean_text'].apply(lambda x: x.split())


In [14]:
combined_fake_news_dataset_df

Unnamed: 0,text,label,original_label,dataset,clean_text,tokens
0,(Reuters) - President Donald Trump is set to s...,1,1,ISOT,reuters president donald trump is set to sign ...,"[reuters, president, donald, trump, is, set, t..."
1,"LANY, Czech Republic (Reuters) - Czech electio...",1,1,ISOT,lany czech republic reuters czech election win...,"[lany, czech, republic, reuters, czech, electi..."
2,Says comprehensive immigration reform will add...,0,half-true,LIAR,says comprehensive immigration reform will add...,"[says, comprehensive, immigration, reform, wil..."
3,Inside Nikki Bella's Strong Support System Fol...,1,1,FakeNewsNet_Minimal,inside nikki bellas strong support system foll...,"[inside, nikki, bellas, strong, support, syste..."
4,Ellen DeGeneres makes joke about Jennifer Anis...,0,0,FakeNewsNet_Minimal,ellen degeneres makes joke about jennifer anis...,"[ellen, degeneres, makes, joke, about, jennife..."
...,...,...,...,...,...,...
80879,"Conservatives are, once again, on the losing s...",0,0,ISOT,conservatives are once again on the losing sid...,"[conservatives, are, once, again, on, the, los..."
80880,"Says Rep. Maxine Waters, D-Calif., only needs ...",0,half-true,LIAR,says rep maxine waters dcalif only needs about...,"[says, rep, maxine, waters, dcalif, only, need..."
80882,Yolanda Hadid Addresses Gigi Hadid Pregnancy R...,0,0,FakeNewsNet_Minimal,yolanda hadid addresses gigi hadid pregnancy r...,"[yolanda, hadid, addresses, gigi, hadid, pregn..."
80883,The Governor did not consult members of his ow...,0,false,LIAR,the governor did not consult members of his ow...,"[the, governor, did, not, consult, members, of..."


## 5. Remove stopwords

In [15]:
stop_words = set(stopwords.words('english'))
combined_fake_news_dataset_df['tokens'] = combined_fake_news_dataset_df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

## 6. Lemmatization

In [16]:
lemmatizer = WordNetLemmatizer()
combined_fake_news_dataset_df['tokens'] = combined_fake_news_dataset_df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

## 7. Rejoin tokens into final processed text

In [17]:
combined_fake_news_dataset_df['processed_text'] = combined_fake_news_dataset_df['tokens'].apply(lambda x: ' '.join(x))

In [18]:
combined_fake_news_dataset_df

Unnamed: 0,text,label,original_label,dataset,clean_text,tokens,processed_text
0,(Reuters) - President Donald Trump is set to s...,1,1,ISOT,reuters president donald trump is set to sign ...,"[reuters, president, donald, trump, set, sign,...",reuters president donald trump set sign order ...
1,"LANY, Czech Republic (Reuters) - Czech electio...",1,1,ISOT,lany czech republic reuters czech election win...,"[lany, czech, republic, reuters, czech, electi...",lany czech republic reuters czech election win...
2,Says comprehensive immigration reform will add...,0,half-true,LIAR,says comprehensive immigration reform will add...,"[say, comprehensive, immigration, reform, add,...",say comprehensive immigration reform add billi...
3,Inside Nikki Bella's Strong Support System Fol...,1,1,FakeNewsNet_Minimal,inside nikki bellas strong support system foll...,"[inside, nikki, bellas, strong, support, syste...",inside nikki bellas strong support system foll...
4,Ellen DeGeneres makes joke about Jennifer Anis...,0,0,FakeNewsNet_Minimal,ellen degeneres makes joke about jennifer anis...,"[ellen, degeneres, make, joke, jennifer, anist...",ellen degeneres make joke jennifer aniston mar...
...,...,...,...,...,...,...,...
80879,"Conservatives are, once again, on the losing s...",0,0,ISOT,conservatives are once again on the losing sid...,"[conservative, losing, side, conversation, day...",conservative losing side conversation day pres...
80880,"Says Rep. Maxine Waters, D-Calif., only needs ...",0,half-true,LIAR,says rep maxine waters dcalif only needs about...,"[say, rep, maxine, water, dcalif, need, vote, ...",say rep maxine water dcalif need vote win elec...
80882,Yolanda Hadid Addresses Gigi Hadid Pregnancy R...,0,0,FakeNewsNet_Minimal,yolanda hadid addresses gigi hadid pregnancy r...,"[yolanda, hadid, address, gigi, hadid, pregnan...",yolanda hadid address gigi hadid pregnancy rumor
80883,The Governor did not consult members of his ow...,0,false,LIAR,the governor did not consult members of his ow...,"[governor, consult, member, party, released, p...",governor consult member party released plan


In [19]:
import os
output_dir = "../data/processed"
os.makedirs(output_dir, exist_ok=True)

# Save the file
output_path = os.path.join(output_dir, "cleaned_fake_news_dataset.csv")
combined_fake_news_dataset_df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to: {output_path}")


Cleaned dataset saved to: ../data/processed\cleaned_fake_news_dataset.csv


In [20]:
df = pd.read_csv("../data/processed/cleaned_fake_news_dataset.csv")

In [21]:
df

Unnamed: 0,text,label,original_label,dataset,clean_text,tokens,processed_text
0,(Reuters) - President Donald Trump is set to s...,1,1,ISOT,reuters president donald trump is set to sign ...,"['reuters', 'president', 'donald', 'trump', 's...",reuters president donald trump set sign order ...
1,"LANY, Czech Republic (Reuters) - Czech electio...",1,1,ISOT,lany czech republic reuters czech election win...,"['lany', 'czech', 'republic', 'reuters', 'czec...",lany czech republic reuters czech election win...
2,Says comprehensive immigration reform will add...,0,half-true,LIAR,says comprehensive immigration reform will add...,"['say', 'comprehensive', 'immigration', 'refor...",say comprehensive immigration reform add billi...
3,Inside Nikki Bella's Strong Support System Fol...,1,1,FakeNewsNet_Minimal,inside nikki bellas strong support system foll...,"['inside', 'nikki', 'bellas', 'strong', 'suppo...",inside nikki bellas strong support system foll...
4,Ellen DeGeneres makes joke about Jennifer Anis...,0,0,FakeNewsNet_Minimal,ellen degeneres makes joke about jennifer anis...,"['ellen', 'degeneres', 'make', 'joke', 'jennif...",ellen degeneres make joke jennifer aniston mar...
...,...,...,...,...,...,...,...
73260,"Conservatives are, once again, on the losing s...",0,0,ISOT,conservatives are once again on the losing sid...,"['conservative', 'losing', 'side', 'conversati...",conservative losing side conversation day pres...
73261,"Says Rep. Maxine Waters, D-Calif., only needs ...",0,half-true,LIAR,says rep maxine waters dcalif only needs about...,"['say', 'rep', 'maxine', 'water', 'dcalif', 'n...",say rep maxine water dcalif need vote win elec...
73262,Yolanda Hadid Addresses Gigi Hadid Pregnancy R...,0,0,FakeNewsNet_Minimal,yolanda hadid addresses gigi hadid pregnancy r...,"['yolanda', 'hadid', 'address', 'gigi', 'hadid...",yolanda hadid address gigi hadid pregnancy rumor
73263,The Governor did not consult members of his ow...,0,false,LIAR,the governor did not consult members of his ow...,"['governor', 'consult', 'member', 'party', 're...",governor consult member party released plan
