In [1]:
import re
from cleantext import clean
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

In [3]:
def clean_tokens(df):
  for i in range(df.shape[0]):
    df.iat[i,3] = word_tokenize(clean(re.sub(r'[a-zA-Z]{3,9}\.? ?\d{1,2}(,|.) ?\d{2,4}', 'DATE', str(df.iat[i,3])), no_urls=True, no_emails=True, no_numbers=True, no_punct=True, replace_with_number='NUM', ))

def remove_stopwords(df):
  for i in range(df.shape[0]):
    df.iat[i,3] = [w for w in df.iat[i,3] if w not in stop_words]

def stem_words(df):
  for i in range(df.shape[0]):
    df.iat[i,3] = [stemmer.stem(word) for word in df.iat[i,3]]

def lemmatize_words(df):
  for i in range(df.shape[0]):
    df.iat[i,3] = [lemmatizer.lemmatize(word) for word in df.iat[i,3]]

def get_unique(df):
  unique = []
  for i in range(df.shape[0]):
    for word in df.iat[i,3]:
      if word not in unique:
        unique.append(word)
  return unique

def process_data(df, lemma=False):
  clean_tokens(df)
  remove_stopwords(df)
  if lemma:
    lemmatize_words(df)
  else:
    stem_words(df)

In [4]:
fnc_stem_sample = pd.read_csv('FakeNewsCorpus_Sample.csv', usecols=[*range(2,16)])

clean_tokens(fnc_stem_sample)
unique_clean = get_unique(fnc_stem_sample)

remove_stopwords(fnc_stem_sample)
unique_no_stops = get_unique(fnc_stem_sample)

fnc_lemma_sample = fnc_stem_sample.copy()

stem_words(fnc_stem_sample)
lemmatize_words(fnc_lemma_sample)
unique_stemmed = get_unique(fnc_stem_sample)
unique_lemmatized = get_unique(fnc_lemma_sample)

In [34]:
red_rate_stops = (len(unique_clean) - len(unique_no_stops)) / len(unique_clean) * 100
red_rate_stems = (len(unique_no_stops) - len(unique_stemmed)) / len(unique_no_stops) * 100
red_rate_lemmas = (len(unique_no_stops) - len(unique_lemmatized)) / len(unique_no_stops) * 100

print('Number of unique words in cleaned contents           : {}'.format(len(unique_clean)))
print('Number of unique words after also removing stopwords : {}'.format(len(unique_no_stops)))
print('Number of unique words after also stemming words     : {}'.format(len(unique_stemmed)))
print('Number of unique words after lemmatizing instead     : {}'.format(len(unique_lemmatized)))
print('Reduction rate from removing stopwords    :  {}'.format(red_rate_stops))
print('Further reduction rate from stemming      : {}'.format(red_rate_stems))
print('Alternate reduction rate from lemmatizing : {}'.format(red_rate_lemmas))

Number of unique words in cleaned contents           : 16659
Number of unique words after also removing stopwords : 16527
Number of unique words after also stemming words     : 11004
Number of unique words after lemmatizing instead     : 14618
Reduction rate from removing stopwords    :  0.7923644876643255
Further reduction rate from stemming      : 33.41804320203304
Alternate reduction rate from lemmatizing : 11.550795667695287


In [None]:
# fnc_lemma = pd.read_csv('995,000_rows.csv', usecols=[*range(2,16)])
# rnc_lemma = pd.read_csv('ReliableNewsCorpus.csv', usecols=[*range(1,14)])
# process_data(rnc_lemma, lemma=True)
# process_data(fnc_lemma, lemma=True)
# data = pd.concat(fnc_lemma, rnc_lemma)
# data.to_csv('dataset_lemmatized.csv', index=False)