# Part 1: Data Processing

In [None]:
#importing Libraries
import re
import pandas as pd
import nltk
import csv
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## Task 1

In [None]:
# Load the FakeNewsCorpus sample straight from GitHub into a DataFrame
NEWS_SAMPLE_URL = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'
df = pd.read_csv(NEWS_SAMPLE_URL)

#cleaned_content = ""

def clean_text(txt):
    """Normalise a raw article string for tokenisation.

    The text is lowercased, runs of whitespace (spaces, tabs, new lines) are
    collapsed to a single space, and URLs, e-mail addresses, ISO dates
    (YYYY-MM-DD) and numbers are replaced by the placeholder tokens
    ``<URL>``, ``<EMAIL>``, ``<DATE>`` and ``<NUM>``. All remaining
    punctuation / special characters are removed.

    Args:
        txt: The raw text to clean (a ``str``).

    Returns:
        The cleaned text.
    """
    # Lowercase first; the placeholders inserted below are upper-case, so
    # they can never be confused with text that was already in the input.
    txt = txt.lower()
    # Collapse excess whitespace, tabs and new lines
    txt = re.sub(r"\s+", " ", txt)

    # The structured patterns must run BEFORE punctuation is stripped:
    # they rely on "://", "@" and "-" still being present.
    # URLs replaced with "<URL>"
    txt = re.sub(r"https?://\S+", "<URL>", txt)
    # Emails replaced with "<EMAIL>"
    txt = re.sub(r"\S+@\S+", "<EMAIL>", txt)
    # Dates must be handled before numbers, otherwise the digits of a date
    # would already have been turned into "<NUM>".
    txt = re.sub(r"\b\d{4}-\d{2}-\d{2}\b", "<DATE>", txt)
    # Numbers (including thousands/decimal separators such as 1,000 or 3.14)
    # replaced with a single "<NUM>"
    txt = re.sub(r"\d+(?:[.,]\d+)*", "<NUM>", txt)

    # Remove special characters, but keep the placeholder tokens intact
    txt = re.sub(
        r"(<(?:URL|EMAIL|DATE|NUM)>)|[^\w\s]",
        lambda match: match.group(1) or "",
        txt,
    )

    return txt

# Run clean_text over every article body and keep the result in a new column
df["clean_news1"] = df["content"].map(clean_text)

# Split each cleaned article into a list of word tokens
df['tokens'] = df['clean_news1'].map(nltk.word_tokenize)


def _vocabulary_size(token_lists):
    """Return the number of distinct tokens across an iterable of token lists."""
    return len({token for tokens in token_lists for token in tokens})


def _reduction_rate(size_before, size_after):
    """Return the relative vocabulary shrinkage (0.0 for an empty vocabulary)."""
    if size_before == 0:
        return 0.0
    return (size_before - size_after) / size_before


# Remove stopwords
# The set is called `stop_words` so it does not shadow the `stopwords`
# corpus module imported at the top of the notebook.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Compute the size of the vocabulary before and after removing stopwords.
# NOTE: df['tokens'] is deliberately left as the raw token lists; filtering it
# in place before measuring made "before" and "after" identical (rate 0%).
vocab_raw = _vocabulary_size(df['tokens'])
df['tokens_no_stopwords'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
vocab_no_stopwords = _vocabulary_size(df['tokens_no_stopwords'])
reduction_rate_stopwords = _reduction_rate(vocab_raw, vocab_no_stopwords)

# Apply stemming
stemmer = nltk.stem.SnowballStemmer('english')
df['tokens_stemmed'] = df['tokens_no_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])

# Compute the size of the vocabulary after stemming; separate variable names
# keep the stopword figures from being overwritten before they are printed.
vocab_stemmed = _vocabulary_size(df['tokens_stemmed'])
reduction_rate_stemming = _reduction_rate(vocab_no_stopwords, vocab_stemmed)

print(f"Vocabulary size before removing stopwords: {vocab_raw}")
print(f"Vocabulary size after removing stopwords: {vocab_no_stopwords}")
print(f"Reduction rate after removing stopwords: {reduction_rate_stopwords:.2%}")
print(f"Vocabulary size after stemming: {vocab_stemmed}")
print(f"Reduction rate after stemming: {reduction_rate_stemming:.2%}")


# Flatten the per-article stem lists into one list and count each stem
all_words = []
for stemmed_tokens in df['tokens_stemmed']:
    all_words.extend(stemmed_tokens)
freq_dist = nltk.FreqDist(all_words)

# Report the 100 stems that occur most often, one "stem: count" per line
print("100 most common words in the data after removing stopwords and stemming:")
top_words = freq_dist.most_common(100)
for stem, count in top_words:
    print(f"{stem}: {count}")




For task 1, we've used the libraries:

re: Python's regular-expression module, which we use for filtering text. We use it to remove excess whitespace, tabs and new lines, and to replace URLs, emails, numbers and dates with placeholder tokens. This is done so that the data is easier to explore.

pandas: A library used for reading and manipulating data. This is probably the most fundamental library we use for this task, as we use pandas to read the csv file.

nltk: A library that is useful for our purpose of tokenizing and stemming the data.

csv: