In [59]:
import pandas as pd

In [77]:
# first, let's define clean_text, which will later be run on the 'content' column
from cleantext import clean
def clean_text(s):
    """Normalise a single text with ``cleantext.clean``.

    The text is lower-cased, and URLs, e-mail addresses and numbers are
    replaced by the placeholder tokens ``<URL>``, ``<EMAIL>`` and ``<NUM>``.
    The language passed to ``clean`` is English.

    Parameters
    ----------
    s : the text to clean.

    Returns
    -------
    Whatever ``clean`` returns for ``s`` with the options below.
    """
    # Keep all switches in one place so they are easy to review and tune.
    options = {
        "lower": True,                    # lowercase text
        "no_urls": True,                  # replace all URLs with a special token
        "no_emails": True,                # replace all email addresses with a special token
        "no_numbers": True,               # replace all numbers with a special token
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_number": "<NUM>",
        "lang": "en",
    }
    return clean(s, **options)



In [78]:
# run a function over the data in parallel (joblib distributes the elements across workers)
from joblib import Parallel, delayed
from os import cpu_count

# run in parallel
def run_parallel(df, n_jobs, func):
    """Apply ``func`` to every element of ``df`` using a joblib ``Parallel`` pool.

    Parameters
    ----------
    df : iterable
        The elements to process; each one is passed to ``func`` on its own.
    n_jobs : int
        Forwarded to ``Parallel(n_jobs=...)``.
    func : callable
        Function taking a single element.

    Returns
    -------
    The object returned by the ``Parallel`` call (one result per element).
    """
    pool = Parallel(n_jobs=n_jobs)
    # One delayed call per element; the generator is consumed by the pool.
    tasks = (delayed(func)(item) for item in df)
    return pool(tasks)

In [75]:
# clean the text
def clean_column(df):
    """Run ``clean_text`` on every element of ``df`` in parallel.

    Despite the parameter name, ``df`` is iterated element by element, so the
    caller is expected to pass the column of texts rather than a whole frame.
    The input is not modified; the cleaned values are returned and it is up
    to the caller to assign them back.

    Returns
    -------
    The result of ``run_parallel`` (one cleaned text per input element).
    """
    # Use as many workers as os.cpu_count() reports.
    return run_parallel(df, cpu_count(), clean_text)

In [80]:
from nltk.tokenize import word_tokenize
# tokenize the text. run in parallel
def tokenize_column(df, column='content'):
    """Tokenize every entry of one column of ``df`` in parallel.

    Parameters
    ----------
    df : object supporting ``df[column]`` (e.g. a pandas DataFrame)
        Container holding the texts to tokenize.
    column : str, default 'content'
        Key of the column to tokenize. The default keeps the previous
        behaviour, where ``'content'`` was hard-coded.

    Returns
    -------
    The result of ``run_parallel``: one ``word_tokenize`` result per element
    of ``df[column]``. ``df`` itself is not modified.
    """
    texts = df[column]
    # Use as many workers as os.cpu_count() reports.
    return run_parallel(texts, cpu_count(), word_tokenize)

In [81]:
from nltk.corpus import stopwords
# removing generic stopwords
def remove_stopwords(df):
    """Drop English stopwords from every token sequence in ``df``, in parallel.

    Parameters
    ----------
    df : iterable of token sequences
        Each element is iterated token by token (e.g. a list of words).

    Returns
    -------
    The result of ``run_parallel``: for each input element, a list of its
    tokens that are not in NLTK's English stopword list.
    """
    # A set gives constant-time membership checks for every token.
    english_stopwords = set(stopwords.words('english'))

    def _keep_non_stopwords(tokens):
        """Return the tokens of one element that are not stopwords."""
        return [token for token in tokens if token not in english_stopwords]

    # Use as many workers as os.cpu_count() reports.
    return run_parallel(df, cpu_count(), _keep_non_stopwords)

In [91]:
# Load a small part of the data for testing: only the first 10000 rows are
# read, and the first CSV column is used as the DataFrame index.
# NOTE(review): the path is an absolute, machine-specific location; consider
# deriving it from a configurable data-directory constant instead.
df = pd.read_csv('D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv', nrows=10000, index_col=0)

In [82]:
# Process the 'content' column in three stages; each stage overwrites the
# column with the output of the stage before it:
#   1. clean_column     - applies clean_text to every raw text
#   2. tokenize_column  - applies word_tokenize to every cleaned text
#                         (takes the whole frame and reads 'content' itself)
#   3. remove_stopwords - drops English stopwords from every token list
# NOTE(review): because df['content'] is overwritten in place, this cell is
# presumably not safe to re-run without reloading df first - confirm.
df['content'] = clean_column(df['content'])
df['content'] = tokenize_column(df)
df['content'] = remove_stopwords(df['content'])


Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,0,2,express.co.uk,rumor,https://www.express.co.uk/news/science/738402/...,"[life, illusion, ,, least, quantum, level, ,, ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,,[''],THE UNIVERSE ceases to exist when we are not l...,,,
1,1,6,barenakedislam.com,hate,http://barenakedislam.com/category/donald-trum...,"[unfortunately, ,, n't, yet, attacked, islamic...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",,[''],,,,
2,2,7,barenakedislam.com,hate,http://barenakedislam.com/category/donald-trum...,"[los, angeles, police, department, denied, $, ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",,[''],,,,
3,3,8,barenakedislam.com,hate,http://barenakedislam.com/2017/12/24/more-winn...,"[white, house, decided, quietly, withdraw, tie...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"MORE WINNING! Israeli intelligence source, DEB...","Cleavis Nowell, Cleavisnowell, Clarence J. Fei...",,[''],,,,
4,4,9,barenakedislam.com,hate,http://barenakedislam.com/2017/12/25/oh-trump-...,"[``, time, come, cut, tongues, support, peace,...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"“Oh, Trump, you coward, you just wait, we will...","F.N. Lehner, Don Spilman, Clarence J. Feinour,...",,[''],,,,


In [83]:
from os import path
from os import remove
# create a file to be used for storing the data
def intialize_file(name):
    """Make sure ``name`` refers to a fresh, empty file.

    If something already exists at ``name`` it is deleted first; afterwards
    the file is created by opening it for writing.

    Parameters
    ----------
    name : path of the file to (re)create.
    """
    already_present = path.exists(name)
    if already_present:
        # Start from a clean slate: drop the old file before recreating it.
        remove(name)
    with open(name, 'w') as handle:
        # Writing an empty string leaves the new file empty.
        handle.write('')

In [84]:
# append data to csv file
def append_to_file(name, data):
    """Append ``data`` to the end of the file ``name``.

    The file is opened in text append mode, so existing content is kept and
    the file is created if it does not exist yet. ``data`` is written as-is;
    no separator or newline is added.

    Parameters
    ----------
    name : path of the file to append to.
    data : str
        Text to write.
    """
    with open(name, 'a') as handle:
        handle.write(data)

In [93]:
# create a file to store the data
# (intialize_file deletes any existing file of that name and creates an empty one)
tokenized_file = 'tokenized_temp.csv'
intialize_file(tokenized_file)

# append header to the file: the DataFrame's column names joined by commas
# NOTE(review): no newline is written after the header, so whatever is appended
# next continues on the same line unless that write adds one itself - confirm.
header = df.columns.values
append_to_file(tokenized_file, ','.join(header))

