In [1]:
import os
import pickle
import re
import time

import numpy as np
import pandas as pd
import spacy
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from spacy.lang.en import English
from spacy.tokens import Doc, Token, DocBin
from spacy.vocab import Vocab
from spellchecker import SpellChecker

# Load the small English model and append a sentencizer component to it.
# The processing cell further down disables the parser and still iterates
# doc.sents, so sentence boundaries presumably come from this component.
nlp = spacy.load("en_core_web_sm")
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)

In [2]:
def reduce_elongated(text):
    """Shorten every run of three or more identical characters to two.

    Matching is case-insensitive and the first character of the run decides
    what is written back (e.g. "Aaa" becomes "AA"). The "." in the pattern
    does not match a newline, so runs of newlines are left untouched.

    Parameters
    ----------
    text : str
        The string to normalise.

    Returns
    -------
    str
        `text` with each elongated run replaced by two copies of its first
        character.
    """
    elongated_run = re.compile(r'(?i)(.)\1{2,}')
    shortened = elongated_run.sub(r'\1\1', text)
    return shortened

# Spell checker instance created with the constructor's default arguments.
# NOTE(review): `spell` is not referenced by any cell visible here - confirm
# it is still needed before keeping the (possibly costly) construction.
spell = SpellChecker()

In [3]:
fields = ['comment_text', 'toxicity','target']

# Set to a small integer (e.g. 10) for a quick smoke run on the first rows of
# the train/valid splits; None loads every row.
MAX_ROWS = None


def load_split(csv_path, max_rows=None):
    """Read one raw split from CSV and drop incomplete rows.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read. Only the columns named in `fields`
        are loaded.
    max_rows : int or None, optional
        When given, keep only the first `max_rows` rows that remain after
        rows with missing values have been dropped. None keeps all rows.

    Returns
    -------
    pandas.DataFrame
        A frame holding the `fields` columns with no missing values.
    """
    frame = pd.read_csv(csv_path, usecols = fields).dropna()
    if max_rows is not None:
        # Copy so that later column assignment works on an independent frame
        # rather than on a slice of the full one.
        frame = frame[:max_rows].copy()
    return frame


df_train = load_split("data/raw_split/train_custom.csv", max_rows = MAX_ROWS)
df_valid = load_split("data/raw_split/valid_custom.csv", max_rows = MAX_ROWS)
# The test split is always loaded in full.
df_test = load_split("data/raw_split/test_custom.csv")

In [4]:
dfs = [df_train, df_valid, df_test]
df_names = ["train", "valid", "test"]
suffix = "lemma_nopunctExceptExcl_cleaned_sentencized"
path = "data/pre_processed/"

# Both patterns are loop-invariant, so they are compiled once up front.
strip_symbols = re.compile(r'[-()\"#/@;:<>{}=~|,]')
combine_whitespaces = re.compile(r"\s+")


def _clean_comment(text):
    """Normalise one raw comment string before it is handed to spaCy.

    The characters listed in `strip_symbols` are replaced by spaces, every
    run of whitespace is collapsed to a single space, both ends are trimmed
    and `reduce_elongated` is applied to the result.

    Line breaks are handled by the whitespace collapse, which turns them into
    a single space. A previous extra step deleted the sequence "\\n\\r"
    outright, which glued the words on either side of the break together; it
    has been dropped so that a line break always separates words.
    """
    text = strip_symbols.sub(" ", text)
    return reduce_elongated(combine_whitespaces.sub(" ", text).strip())


def _lemmatize_doc(doc):
    """Return the lemmas of `doc` as one space-joined string.

    Sentences are visited in order. Tokens flagged as digit, number-like,
    URL-like or e-mail-like are skipped, and a literal "." is appended after
    the lemmas of every sentence.

    NOTE(review): punctuation tokens are not filtered here, so a sentence
    that already ends in "." presumably yields ". ." in the output - confirm
    this matches what the "nopunctExceptExcl" file suffix is meant to convey.
    """
    lemmas = []
    for sent in doc.sents:
        lemmas.extend(
            token.lemma_ for token in sent
            if not (token.is_digit or token.like_url
                    or token.like_email or token.like_num)
        )
        lemmas.append('.')
    return " ".join(lemmas)


# Create the output directory before any work is done, so a missing folder
# cannot make to_csv fail only after a whole split has been processed.
os.makedirs(path, exist_ok = True)

for i, df in enumerate(dfs):
    name = df_names[i]
    print("\nprocessing", name)
    t1 = time.time()

    # Deliberately in place: `df` is the very object stored in `dfs`, and the
    # row count must match `results` when the column is overwritten below.
    df.dropna(subset = ['comment_text'], inplace = True)
    processed_comments = [_clean_comment(text) for text in df['comment_text']]

    pipe = nlp.pipe(processed_comments, batch_size = 512, disable = ["tagger", "parser", "ner"])
    total = len(df)
    results = []
    for j, doc in enumerate(pipe):
        # Lightweight progress indicator: one tick every 10000 documents.
        if j % 10000 == 0:
            print("{0:.0%}...".format(j / total), end='')
        results.append(_lemmatize_doc(doc))

    t2 = time.time()
    print(t2 - t1)  # elapsed wall-clock seconds for this split

    # Overwrites the raw text in the shared frame; re-running this cell
    # without reloading the data would process already-processed text.
    df['comment_text'] = results
    print(name)
    df.to_csv(path + name + "_" + suffix + ".csv", index = False)


processing train
0%...1%...1%...2%...3%...3%...4%...5%...6%...6%...7%...8%...8%...9%...10%...10%...11%...12%...12%...13%...14%...15%...15%...16%...17%...17%...18%...19%...19%...20%...21%...21%...22%...23%...24%...24%...25%...26%...26%...27%...28%...28%...29%...30%...30%...31%...32%...33%...33%...34%...35%...35%...36%...37%...37%...38%...39%...39%...40%...41%...42%...42%...43%...44%...44%...45%...46%...46%...47%...48%...48%...49%...50%...51%...51%...52%...53%...53%...54%...55%...55%...56%...57%...57%...58%...59%...60%...60%...61%...62%...62%...63%...64%...64%...65%...66%...66%...67%...68%...69%...69%...70%...71%...71%...72%...73%...73%...74%...75%...75%...76%...77%...78%...78%...79%...80%...80%...81%...82%...82%...83%...84%...85%...85%...86%...87%...87%...88%...89%...89%...90%...91%...91%...92%...93%...94%...94%...95%...96%...96%...97%...98%...98%...99%...100%...995.6503140926361
train

processing valid
0%...3%...6%...8%...11%...14%...17%...19%...22%...25%...28%...30%...33%...36%...39%