In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
#Load data
df_fake = pd.read_csv("../data/raw/Fake.csv")
df_true = pd.read_csv("../data/raw/True.csv")

In [3]:
#display few rows
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
print("Fake News Shape:", df_fake.shape)
print("Real News Shape:", df_true.shape)
print("Fake News Columns:", df_fake.columns)
print("Real News Columns:", df_true.columns)

Fake News Shape: (23481, 4)
Real News Shape: (21417, 4)
Fake News Columns: Index(['title', 'text', 'subject', 'date'], dtype='object')
Real News Columns: Index(['title', 'text', 'subject', 'date'], dtype='object')


In [6]:
#Add label
df_fake["label"] = 1
df_true["label"] = 0

In [7]:
#combine the data
df = pd.concat([df_fake, df_true])

In [8]:
#shuffle the data
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
df.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


In [10]:
#Check missing values
df.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [11]:
#remove unecessary columns
df = df[["title", "text", "label"]]

In [12]:
df.head()

Unnamed: 0,title,text,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",0


In [13]:
#check duplicates
print(f"Total Duplicates: {df.duplicated().sum()}")

Total Duplicates: 5793


In [14]:
#remove duplicates
df = df.drop_duplicates()

In [15]:
print(f"Total Duplicates after cleaning: {df.duplicated().sum()}")

Total Duplicates after cleaning: 0


In [16]:
#download stopwords and wordnet
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
# Initialize stopwords & lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [21]:
#function to clean the text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

In [23]:
#apply the text cleaning to title and text
df["title"] = df["title"].astype(str).map(clean_text)
df["text"] = df["text"].astype(str).map(clean_text)

In [18]:
#function to clean the text
def clean_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', " ", text).strip()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words("english")]
    return " ".join(words)

In [19]:
#apply the text cleaning to title and text
df["title"] = df["title"].apply(clean_text)
df["text"] = df["text"].apply(clean_text)

KeyboardInterrupt: 

In [24]:
df.head()

Unnamed: 0,title,text,label
0,ben stein call th circuit court committed coup...,st century wire say ben stein reputable profes...,1
1,trump drop steve bannon national security council,washington reuters u president donald trump re...,0
2,puerto rico expects u lift jones act shipping ...,reuters puerto rico governor ricardo rossello ...,0
3,oops trump accidentally confirmed leaked israe...,monday donald trump embarrassed country accide...,1
4,donald trump head scotland reopen golf resort,glasgow scotland reuters u presidential candid...,0


In [15]:
#save processed data
df.to_csv("../data/processed/cleaned_news.csv", index=False)