In [13]:
from os import cpu_count
from joblib import Parallel, delayed
import swifter
# first, let's run clean_text on the 'content' column
from cleantext import clean
def clean_text(s):
    """Normalise a single text with ``cleantext.clean``.

    The text is lowercased, and URLs, e-mail addresses and numbers are
    replaced by the placeholder tokens ``<URL>``, ``<EMAIL>`` and ``<NUM>``.

    Args:
        s: The text to clean.

    Returns:
        Whatever ``clean`` returns for ``s`` with the options below.
    """
    clean_options = {
        "lower": True,                   # lowercase text
        "no_urls": True,                 # swap every URL for the URL token
        "no_emails": True,               # swap every e-mail address for the e-mail token
        "no_numbers": True,              # swap every number for the number token
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_number": "<NUM>",
        "lang": "en",
    }
    return clean(s, **clean_options)

# clean the text
def clean_column(series):
    """Run ``clean_text`` over every element of ``series`` in parallel.

    Args:
        series: Iterable of raw texts.

    Returns:
        The cleaned texts collected by joblib, one per input element.
    """
    # One worker per CPU reported by os.cpu_count()
    pool = Parallel(n_jobs=cpu_count())
    tasks = (delayed(clean_text)(text) for text in series)
    return pool(tasks)

from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources the functions in this cell rely on. Packages that
# are already installed are reported as up-to-date and not re-downloaded.
nltk.download('punkt')       # tokenizer models used by word_tokenize
nltk.download('stopwords')   # stopword lists used by remove_stopwords
nltk.download('wordnet')     # WordNet corpus required by WordNetLemmatizer

def tokenize_column(series):
    """Tokenise every text in ``series`` with NLTK's ``word_tokenize``, in parallel.

    Args:
        series: Iterable of texts.

    Returns:
        The ``word_tokenize`` result for each input element, in input order.
    """
    # One worker per CPU reported by os.cpu_count()
    pool = Parallel(n_jobs=cpu_count())
    tasks = (delayed(word_tokenize)(text) for text in series)
    return pool(tasks)
    
# NOTE(review): `state` is not referenced by any of the visible cells —
# confirm whether another cell still needs it before keeping it.
state = False

from nltk.corpus import stopwords
# removing generic stopwords
def remove_stopwords(series):
    """Drop NLTK's English stopwords from every token list in ``series``.

    Args:
        series: Iterable of token lists.

    Returns:
        One filtered token list per input element, computed in parallel.
    """
    # A set gives constant-time membership checks per token
    english_stops = set(stopwords.words('english'))

    def _without_stopwords(tokens):
        return [tok for tok in tokens if tok not in english_stops]

    pool = Parallel(n_jobs=cpu_count())
    return pool(delayed(_without_stopwords)(tokens) for tokens in series)

# lemmatizing the text
from nltk.stem import WordNetLemmatizer
def lemmatize_column(series):
    """Lemmatise every token of every token list in ``series``.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments.

    Args:
        series: Iterable of token lists.

    Returns:
        One list of lemmatised tokens per input element, computed in parallel.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    def _lemmatize_tokens(tokens):
        return [wordnet_lemmatizer.lemmatize(tok) for tok in tokens]

    pool = Parallel(n_jobs=cpu_count())
    return pool(delayed(_lemmatize_tokens)(tokens) for tokens in series)
    

# remove punctuation
import string
def remove_punctuation(series):
    """Drop tokens made up solely of ASCII punctuation from each token list.

    A token is removed when every one of its characters is in
    ``string.punctuation``; this also removes empty tokens. Tokens that
    contain at least one other character (e.g. ``"n't"``) are kept.

    Args:
        series: Iterable of token lists (lists of strings).

    Returns:
        One filtered token list per input element, computed in parallel.
    """
    # Compare per character against a set. Testing `token in string.punctuation`
    # is a *substring* test on one long string, so multi-character punctuation
    # tokens such as "...", "--" or "''" would slip through.
    punctuation_chars = frozenset(string.punctuation)

    def _without_punctuation(tokens):
        return [tok for tok in tokens if not punctuation_chars.issuperset(tok)]

    pool = Parallel(n_jobs=cpu_count())
    return pool(delayed(_without_punctuation)(tokens) for tokens in series)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Stream the CSV in 1000-row pieces rather than loading it in one go.
# Because `chunksize` is set, read_csv returns a single-pass iterator of
# DataFrames (not a DataFrame), so `df` can only be looped over once.
import pandas as pd

df = pd.read_csv(
    'fake_news_cleaned.csv',
    chunksize=1000,
    parse_dates=['scraped_at', 'inserted_at', 'updated_at'],
)

In [20]:
# Clean and tokenise the 'content' column chunk by chunk, writing each
# processed chunk to 'fake_news_cleaned_tokenized.csv'.
#
# NOTE: `df` is a one-shot chunk reader. Re-running this cell after an
# interruption would resume mid-file while `i` restarts at 0 (overwriting the
# output file), so re-create `df` before re-running.
rows_done = 0  # running total; the last chunk may hold fewer rows than the chunk size
for i, chunk in enumerate(df):
    # Keep only the article text; the other columns are not needed here
    content = chunk['content']

    # clean -> tokenize -> drop stopwords -> lemmatize -> drop punctuation
    tokens = clean_column(content)
    tokens = tokenize_column(tokens)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_column(tokens)
    tokens = remove_punctuation(tokens)

    # Convert the list back to a Series on the chunk's original index, so row
    # labels in the output stay unique and line up with the source file
    # instead of restarting at 0 for every chunk.
    tokenized = pd.Series(tokens, index=content.index)

    # First chunk creates the file (with header); later chunks are appended
    if i == 0:
        tokenized.to_csv('fake_news_cleaned_tokenized.csv', mode='w')
    else:
        tokenized.to_csv('fake_news_cleaned_tokenized.csv', mode='a', header=False)

    # Report the number of rows actually processed so far
    rows_done += len(tokenized)
    print('processed {} rows'.format(rows_done))

processed 1000 rows
processed 2000 rows
processed 3000 rows
processed 4000 rows
processed 5000 rows
processed 6000 rows
processed 7000 rows
processed 8000 rows
processed 9000 rows
processed 10000 rows


KeyboardInterrupt: 