In [83]:
import re
from cleantext import clean
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

In [84]:
stop_words = set(stopwords.words('english'))

def clean_tokens(df):
  for i in range(df.shape[0]):
    df.iat[i,4] = word_tokenize(clean(re.sub(r'[a-zA-Z]{3,9}\.? ?\d{1,2}(,|.) ?\d{2,4}', 'DATE', df.iat[i,4]), no_urls=True, no_emails=True, no_numbers=True, no_punct=True, replace_with_number='NUM'))

def remove_stopwords(df):
  for i in range(df.shape[0]):
    df.iat[i,4] = [w for w in df.iat[i,4] if w not in stop_words]

def stem_words(df):
  for i in range(df.shape[0]):
    df.iat[i,4] = [stemmer.stem(word) for word in df.iat[i,4]]

def get_unique(df):
  unique = []
  for i in range(df.shape[0]):
    for word in df.iat[i,4]:
      if word not in unique:
        unique.append(word)
  return unique

In [85]:
fnc_sample = pd.read_csv('FakeNewsCorpus_Sample.csv', usecols=[*range(1,15)])

clean_tokens(fnc_sample)
unique_clean = get_unique(fnc_sample)

remove_stopwords(fnc_sample)
unique_no_stops = get_unique(fnc_sample)

stem_words(fnc_sample)
unique_stemmed = get_unique(fnc_sample)

In [86]:
red_rate_stops = (len(unique_clean) - len(unique_no_stops)) / len(unique_clean) * 100
red_rate_stems = (len(unique_no_stops) - len(unique_stemmed)) / len(unique_no_stops) * 100

print('Number of unique words in cleaned contents           : {}'.format(len(unique_clean)))
print('Number of unique words after also removing stopwords : {}'.format(len(unique_no_stops)))
print('Number of unique words after also stemming words     : {}'.format(len(unique_stemmed)))
print('Reduction rate from removing stopwords :  {}'.format(red_rate_stops))
print('Further reduction rate from stemming   : {}'.format(red_rate_stems))

Number of unique words in cleaned contents           : 16659
Number of unique words after also removing stopwords : 16527
Number of unique words after also stemming words     : 11004
Reduction rate from removing stopwords :  0.7923644876643255
Further reduction rate from stemming   : 33.41804320203304


In [87]:
print(fnc_sample.iat[3,4])
print(len(fnc_sample.iat[3,4]))

['rare', 'shark', 'caught', 'scientist', 'left', 'blunder', 'answer', 'shark', 'uniqu', 'featur', 'surviv', 'extrem', 'depth', 'live', 'extend', 'period', 'time', 'shark', 'uniqu', 'trait', 'extend', 'jaw', 'telescop', 'fashion', 'beyond', 'mouth', 'youv', 'ever', 'seen', 'hit', 'scienc', 'fiction', 'movi', 'alien', 'shark', 'monster', 'made', 'manifest', 'earth', 'shark', 'stalk', 'prey', 'lash', 'telescop', 'teeth', 'snatch', 'larg', 'fish', 'swallow', 'one', 'bite', 'dead', 'predat', 'deep', 'sea', 'besid', 'telescop', 'teeth', 'shark', 'glow', 'dark', 'recent', 'discov', 'back', 'num', 'three', 'decad', 'sinc', 'first', 'found', 'select', 'exot', 'shark', 'speci', 'found', 'routin', 'survey', 'dongh', 'township', 'taiwan', 'countri', 'fisheri', 'research', 'institut', 'found', 'five', 'horrifi', 'deepsea', 'shark', 'team', 'notabl', 'horrifi', 'describ', 'obvious', 'featur', 'needleshap', 'teeth', 'like', 'snakelik', 'fang', 'also', 'origin', 'viper', 'shark', 'name', 'that', 'righ

In [88]:

fnc_sample.to_csv('FNC_clean.csv', index=False)

